Background
If you have been following my posts, I am attempting to replicate the results found in Kazushige Goto's seminal paper on square matrix multiplication C = AB. My last post regarding this topic can be found here. In that version of my code, I followed the memory layering and packing strategy of Goto with an inner kernel computing 2x8 blocks of C using 128-bit SSE3 intrinsics. My CPU is an i5-540M with hyperthreading off. Additional info about my hardware can be found in another post and is repeated below.
My Hardware
My CPU is an Intel i5-540M. You can find the relevant CPUID information on cpu-world.com. The microarchitecture is Nehalem (Westmere), so it can theoretically compute 4 double precision flops per core per cycle. I will be using just one core (no OpenMP), so with hyperthreading off and 4-step Intel Turbo Boost, I should be seeing a peak of ( 2.533 GHz + 4*0.133 GHz ) * ( 4 DP flops/core/cycle ) * ( 1 core ) = 12.27 DP Gflops. For reference, with both cores running at peak, Intel Turbo Boost gives a 2-step speed-up and I should get a theoretical peak of 22.4 DP Gflops.
My Software
Windows 7 64-bit, but MinGW/GCC 32-bit due to restrictions on my computer.
What's new this time?
I compute 2x4 blocks of C. This gives better performance and is in line with what Goto says (half the registers should be used to compute C). I have tried many sizes: 1x8, 2x8, 2x4, 4x2, 2x2, 4x4.
My inner kernel is hand-coded x86 assembly, optimized to the best of my ability (it matches some of the kernels that Goto wrote), which gives a rather large performance boost over the SSE3 intrinsics version. The code is unrolled 8 times inside the inner kernel (defined as a macro for convenience), which gave the best performance out of the unrolling strategies I tried.
I use the Windows performance counters to time the codes, rather than clock().
I time the inner kernel independently of the total code, to see how well my hand-coded assembly is doing.
I report the best result from some number of trials, rather than an average over the trials.
No more OpenMP (single core performance only).
NOTE I will recompile OpenBLAS tonight to use only 1 core so I can compare.
Some Preliminary Results
N is the dimension of the square matrices, Total Gflops/s is the Gflops/s of the entire code, and Kernel Gflops/s is the Gflops/s of the inner kernel. You can see that with a 12.26 Gflops/s peak on one core, the inner kernel is getting around 75% efficiency, while the overall code is about 60% efficient.
I would like to get closer to 95% efficiency for the kernel and 80% for the overall code. What else can I do to improve the performance, at least of the inner kernel?
N Total Gflops/s Kernel Gflops/s
256 7.778089 9.756284
512 7.308523 9.462700
768 7.223283 9.253639
1024 7.197375 9.132235
1280 7.142538 8.974122
1536 7.114665 8.967249
1792 7.060789 8.861958
Source Code
If you are feeling particularly magnanimous, please test my code on your machine. Compiled with gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -masm=intel somatmul2.c -o somatmul2.exe. Feel free to try other flags, but I have found these to work best on my machine.
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <windows.h>
#ifndef min
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
double tmp[mc*kc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < mc; ++i)
for (int j = 0; j < kc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
//const int inc_dst = mr*kc;
for (int k = 0; k < mc; k+=mr)
for (int j = 0; j < kc; ++j)
for (int i = 0; i < mr*kc; i+=kc)
*dst++ = *(ptr + k*kc + j + i);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{ //nc cols, kc rows, nr size of register strips
double tmp[kc*nc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < kc; ++i)
for (int j = 0; j < nc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
// const int inc_k = nc/nr;
for (int k = 0; k < nc; k+=nr)
for (int j = 0; j < kc*nc; j+=nc)
for (int i = 0; i < nr; ++i)
*dst++ = *(ptr + k + i + j);
}
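// KERNEL0: one rank-1 update (one k step) of the 2x4 block of C accumulated in xmm0-xmm3.
// add0/add1 are byte offsets into the packed A panel (eax, loaded with movddup);
// add2/add3 are byte offsets into the packed B panel (edx, loaded with movapd).
// Loads are scheduled ahead of the multiplies that use them (software pipelining).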
#define KERNEL0(add0,add1,add2,add3) \
"mulpd xmm4, xmm6 \n\t" \
"addpd xmm0, xmm4 \n\t" \
"movapd xmm4, XMMWORD PTR [edx+"#add2"] \n\t" \
"mulpd xmm7, xmm4 \n\t" \
"addpd xmm1, xmm7 \n\t" \
"movddup xmm5, QWORD PTR [eax+"#add0"] \n\t" \
"mulpd xmm6, xmm5 \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movddup xmm7, QWORD PTR [eax+"#add1"] \n\t" \
"mulpd xmm4, xmm5 \n\t" \
"movapd xmm6, XMMWORD PTR [edx+"#add3"] \n\t" \
"addpd xmm3, xmm4 \n\t" \
"movapd xmm4, xmm7 \n\t" \
" \n\t"
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) dgemm_2x4_asm_j
(
const int mc, const int nc, const int kc,
const double* restrict locA, const int cs_a, // mr
const double* restrict locB, const int rs_b, // nr
double* restrict C, const int rs_c
)
{
const double* restrict a1 = locA;
for (int i = 0; i < mc ; i+=cs_a) {
const double* restrict b1 = locB;
double* restrict c11 = C + i*rs_c;
for (int j = 0; j < nc ; j+=rs_b) {
__asm__ __volatile__
(
"mov eax, %[a1] \n\t"
"mov edx, %[b1] \n\t"
"mov edi, %[c11] \n\t"
"mov ecx, %[kc] \n\t"
"pxor xmm0, xmm0 \n\t"
"movddup xmm7, QWORD PTR [eax] \n\t" // a1
"pxor xmm1, xmm1 \n\t"
"movapd xmm6, XMMWORD PTR [edx] \n\t" // b1
"pxor xmm2, xmm2 \n\t"
"movapd xmm4, xmm7 \n\t" // a1
"pxor xmm3, xmm3 \n\t"
"sar ecx, 3 \n\t" // divide by 2^num
"L%=: \n\t" // start loop
KERNEL0( 8, 16, 16, 32)
KERNEL0( 24, 32, 48, 64)
KERNEL0( 40, 48, 80, 96)
KERNEL0( 56, 64, 112, 128)
KERNEL0( 72, 80, 144, 160)
KERNEL0( 88, 96, 176, 192)
KERNEL0( 104, 112, 208, 224)
KERNEL0( 120, 128, 240, 256)
"add eax, 128 \n\t"
"add edx, 256 \n\t"
" \n\t"
"dec ecx \n\t"
"jne L%= \n\t" // end loop
" \n\t"
"mov esi, %[rs_c] \n\t" // don't need cs_a anymore
"sal esi, 3 \n\t" // times 8
"lea ebx, [edi+esi] \n\t" // don't need b1 anymore
"addpd xmm0, XMMWORD PTR [edi] \n\t" // c11
"addpd xmm1, XMMWORD PTR [edi+16] \n\t" // c11 + 2
"addpd xmm2, XMMWORD PTR [ebx] \n\t" // c11
"addpd xmm3, XMMWORD PTR [ebx+16] \n\t" // c11 + 2
"movapd XMMWORD PTR [edi], xmm0 \n\t"
"movapd XMMWORD PTR [edi+16], xmm1 \n\t"
"movapd XMMWORD PTR [ebx], xmm2 \n\t"
"movapd XMMWORD PTR [ebx+16], xmm3 \n\t"
: // no outputs
: // inputs
[kc] "m" (kc),
[a1] "m" (a1),
[cs_a] "m" (cs_a),
[b1] "m" (b1),
[rs_b] "m" (rs_b),
[c11] "m" (c11),
[rs_c] "m" (rs_c)
: //register clobber
"memory",
"eax","ebx","ecx","edx","esi","edi",
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
);
b1 += rs_b*kc;
c11 += rs_b;
}
a1 += cs_a*kc;
}
}
double blis_dgemm_ref(
const int n,
const double* restrict A,
const double* restrict B,
double* restrict C,
const int mc,
const int nc,
const int kc
)
{
int mr = 2;
int nr = 4;
double locA[mc*kc] __attribute__ ((aligned(64)));
double locB[kc*nc] __attribute__ ((aligned(64)));
LARGE_INTEGER frequency, time1, time2;
double time3 = 0.0;
QueryPerformanceFrequency(&frequency);
// zero C
memset(C, 0, n*n*sizeof(double));
int ii,jj,kk;
//#pragma omp parallel num_threads(2) shared(A,B,C) private(ii,jj,kk,locA,locB)
{//use all threads in parallel
//#pragma omp for
for ( jj = 0; jj < n; jj+=nc) {
for ( kk = 0; kk < n; kk+=kc) {
cpack(locB, B + kk*n + jj, nc, kc, nr, n);
for ( ii = 0; ii < n; ii+=mc) {
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
QueryPerformanceCounter(&time1);
dgemm_2x4_asm_j( mc, nc, kc,
locA , mr,
locB , nr,
C + ii*n + jj, n );
QueryPerformanceCounter(&time2);
time3 += (double) (time2.QuadPart - time1.QuadPart);
}
}
}
}
return time3 / frequency.QuadPart;
}
double compute_gflops(const double time, const int n)
{
// computes the gigaflops for a square matrix-matrix multiplication
double gflops;
gflops = (double) (2.0*n*n*n)/time/1.0e9;
return(gflops);
}
void main() {
LARGE_INTEGER frequency, time1, time2;
double time3, best_time;
double kernel_time, best_kernel_time;
QueryPerformanceFrequency(&frequency);
int best_flag;
double gflops, kernel_gflops;
const int trials = 100;
int nmax = 4096;
printf("%16s %16s %16s\n","N","Total Gflops/s","Kernel Gflops/s");
int mc = 256;
int kc = 256;
int nc = 128;
for (int n = kc; n <= nmax; n+=kc) {
double *A = NULL;
double *B = NULL;
double *C = NULL;
A = _mm_malloc (n*n * sizeof(*A),64); if (!A) {printf("A failed\n"); return;}
B = _mm_malloc (n*n * sizeof(*B),64); if (!B) {printf("B failed\n"); return;}
C = _mm_malloc (n*n * sizeof(*C),64); if (!C) {printf("C failed\n"); return;}
srand(time(NULL));
// Create the matrices
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
*(A + i*n + j) = (double) rand()/RAND_MAX;
*(B + i*n + j) = (double) rand()/RAND_MAX;
}
}
// warmup
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
for (int count = 0; count < trials; count++){
QueryPerformanceCounter(&time1);
kernel_time = blis_dgemm_ref(n,A,B,C,mc,nc,kc);
QueryPerformanceCounter(&time2);
time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
if (count == 0) {
best_time = time3;
best_kernel_time = kernel_time;
}
else {
best_flag = ( time3 < best_time ? 1 : 0 );
if (best_flag) {
best_time = time3;
best_kernel_time = kernel_time;
}
}
}
gflops = compute_gflops(best_time, n);
kernel_gflops = compute_gflops(best_kernel_time, n);
printf("%16d %16f %16f\n",n,gflops,kernel_gflops);
_mm_free(A);
_mm_free(B);
_mm_free(C);
}
printf("tests are done\n");
}
EDIT
Replace the packing functions with the following new and faster versions:
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
for (int i = 0; i < mc/mr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < mr; ++k)
*dst++ = *(src + i*n*mr + k*n + j);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{
for (int i = 0; i < nc/nr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < nr; ++k)
*dst++ = *(src + i*nr + j*n + k);
}
Results With New Packing Functions
Nice boost to the overall performance:
N Total Gflops/s Kernel Gflops/s
256 7.915617 8.794849
512 8.466467 9.350920
768 8.354890 9.135575
1024 8.168944 8.884611
1280 8.174249 8.825920
1536 8.285458 8.938712
1792 7.988038 8.581001
LINPACK 32-bit with 1 thread
CPU frequency: 2.792 GHz
Number of CPUs: 1
Number of cores: 2
Number of threads: 1
Performance Summary (GFlops)
Size LDA Align. Average Maximal
128 128 4 4.7488 5.0094
256 256 4 6.0747 6.9652
384 384 4 6.5208 7.2767
512 512 4 6.8329 7.5706
640 640 4 7.4278 7.8835
768 768 4 7.7622 8.0677
896 896 4 7.8860 8.4737
1024 1024 4 7.7292 8.1076
1152 1152 4 8.0411 8.4738
1280 1280 4 8.1429 8.4863
1408 1408 4 8.2284 8.7073
1536 1536 4 8.3753 8.6437
1664 1664 4 8.6993 8.9108
1792 1792 4 8.7576 8.9176
1920 1920 4 8.7945 9.0678
2048 2048 4 8.5490 8.8827
2176 2176 4 9.0138 9.1161
2304 2304 4 8.1402 9.1446
2432 2432 4 9.0003 9.2082
2560 2560 4 8.8560 9.1197
2688 2688 4 9.1008 9.3144
2816 2816 4 9.0876 9.3089
2944 2944 4 9.0771 9.4191
3072 3072 4 8.9402 9.2920
3200 3200 4 9.2259 9.3699
3328 3328 4 9.1224 9.3821
3456 3456 4 9.1354 9.4082
3584 3584 4 9.0489 9.3351
3712 3712 4 9.3093 9.5108
3840 3840 4 9.3307 9.5324
3968 3968 4 9.3895 9.5352
4096 4096 4 9.3269 9.3872
EDIT 2
Here are some results from single-threaded OpenBLAS, which took 4 hours to compile last night. As you can see, it is getting close to 95% of peak. Max single-threaded performance with both cores on is 11.2 Gflops (2-step Intel Turbo Boost). I need to turn off the other core to get 12.26 Gflops (4-step Intel Turbo Boost). Assume that the packing functions in OpenBLAS generate no additional overhead. Then the OpenBLAS kernel must be running at least as fast as the total OpenBLAS code. So I need to get my kernel running at that speed. I have yet to figure out how to make my assembly faster. I will be focusing on this over the next few days.
Ran the tests below from Windows command line with: start /realtime /affinity 1
My Code:
N Total Gflops/s Kernel Gflops/s
256 7.927740 8.832366
512 8.427591 9.347094
768 8.547722 9.352993
1024 8.597336 9.351794
1280 8.588663 9.296724
1536 8.589808 9.271710
1792 8.634201 9.306406
2048 8.527889 9.235653
OpenBLAS:
N Total Gflops/s
256 10.599065
512 10.622686
768 10.745133
1024 10.762757
1280 10.832540
1536 10.793132
1792 10.848356
2048 10.819986
It's theoretically possible to look at that code and reason through whether it could be arranged to make better use of microarchitectural resources - but even the performance architects at Intel might not recommend doing it that way. It might help to use a tool like VTune or Intel Performance Counter Monitor to find out how much of your workload is memory versus front-end versus back-end bound. Intel Architecture Code Analyzer might also be a quick source of help narrowing down which of the potential issues listed below to follow up on first.
Nominal Animal is probably on the right track in the comments talking about interleaving instructions that access memory and those that do computation. A few other possibilities:
Using other instructions for some of the computation might reduce pressure on one of the execution ports (see section 3.3.4 of this presentation). In particular, mulpd is always going to dispatch to port 1 on Westmere. Maybe if there are any cycles where port 0 isn't getting used, you could sneak in a scalar FP multiply there.
One or another of the hardware prefetchers could be saturating the bus early or polluting the cache with lines you don't end up using.
On the other hand, there's a slim possibility that the ordering of memory references or the memory layout implied in dgemm_2x4_asm_j is faking out the prefetchers.
Changing the relative ordering of pairs of instructions that don't have any data dependencies might lead to better use of front-end or back-end resources.
Related
I have found that manually calculating the % operator on __int128 is significantly faster than the built-in compiler operator. I will show you how to calculate modulo 9, but the method can be used to calculate modulo any other number.
First, consider the built-in compiler operator:
uint64_t mod9_v1(unsigned __int128 n)
{
return n % 9;
}
Now consider my manual implementation:
uint64_t mod9_v2(unsigned __int128 n)
{
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)4;
r += (uint32_t)(n >> 64) * (uint64_t)7;
r += (uint32_t)(n >> 96);
return r % 9;
}
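The weights come from reducing each 32-bit limb's base mod 9: 2^32 % 9 == 4, 2^64 % 9 == 7, and 2^96 % 9 == 1, so summing the limbs with those multipliers preserves the value mod 9. A minimal check of this (assuming a compiler with unsigned __int128, as used above):
#include <stdio.h>
int main(void)
{
    unsigned __int128 one = 1;
    /* prints 4 7 1: the multipliers applied to the 2nd, 3rd and 4th 32-bit limbs */
    printf("%u %u %u\n",
           (unsigned)((one << 32) % 9),
           (unsigned)((one << 64) % 9),
           (unsigned)((one << 96) % 9));
    return 0;
}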
Measuring over 100,000,000 random numbers gives the following results:
mod9_v1 | 3.986052 secs
mod9_v2 | 1.814339 secs
GCC 9.3.0 with -march=native -O3 was used on AMD Ryzen Threadripper 2990WX.
Here is a link to godbolt.
I would like to ask if it behaves the same way on your side?
(Before reporting a bug to GCC Bugzilla).
UPDATE:
On request, I supply a generated assembly:
mod9_v1:
sub rsp, 8
mov edx, 9
xor ecx, ecx
call __umodti3
add rsp, 8
ret
mod9_v2:
mov rax, rdi
shrd rax, rsi, 32
mov rdx, rsi
mov r8d, eax
shr rdx, 32
mov eax, edi
add rax, rdx
lea rax, [rax+r8*4]
mov esi, esi
lea rcx, [rax+rsi*8]
sub rcx, rsi
mov rax, rcx
movabs rdx, -2049638230412172401
mul rdx
mov rax, rdx
shr rax, 3
and rdx, -8
add rdx, rax
mov rax, rcx
sub rax, rdx
ret
The reason for this difference is clear from the assembly listings: the % operator applied to 128-bit integers is implemented via a library call to a generic function that cannot take advantage of compile time knowledge of the divisor value, which makes it possible to turn division and modulo operations into much faster multiplications.
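To make the strength reduction concrete: the movabs constant in the listing above, -2049638230412172401, is 0xE38E38E38E38E38F, a fixed-point reciprocal of 9. A rough sketch of the transformation (an illustrative helper, not the exact code GCC emits):
#include <stdint.h>

static uint64_t mod9_by_reciprocal(uint64_t x)
{
    /* q = floor(x / 9) via a multiply by the reciprocal and a shift */
    uint64_t q = (uint64_t)(((unsigned __int128)x * 0xE38E38E38E38E38Fu) >> 64) >> 3;
    return x - q * 9;   /* remainder */
}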
The timing difference is even more significant on my old MacBook Pro using clang, where mod9_v2() is about 15 times faster than mod9_v1().
Note however these remarks:
you should measure the cpu time just after the end of the for loop, not after the first printf as currently coded.
rand_u128() only produces 124 bits assuming RAND_MAX is 0x7fffffff.
most of the time is spent computing the random numbers.
Using your slicing approach, I extended your code to reduce the number of steps using slices of 42, 42 and 44 bits, which further improves the timings (because 2^42 % 9 == 1):
#pragma GCC diagnostic ignored "-Wpedantic"
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <assert.h>
#include <inttypes.h>
#include <stdio.h>
#include <time.h>
static uint64_t mod9_v1(unsigned __int128 n) {
return n % 9;
}
static uint64_t mod9_v2(unsigned __int128 n) {
uint64_t r = 0;
r += (uint32_t)(n);
r += (uint32_t)(n >> 32) * (uint64_t)(((uint64_t)1ULL << 32) % 9);
r += (uint32_t)(n >> 64) * (uint64_t)(((unsigned __int128)1 << 64) % 9);
r += (uint32_t)(n >> 96);
return r % 9;
}
static uint64_t mod9_v3(unsigned __int128 n) {
return (((uint64_t)(n >> 0) & 0x3ffffffffff) +
((uint64_t)(n >> 42) & 0x3ffffffffff) +
((uint64_t)(n >> 84))) % 9;
}
unsigned __int128 rand_u128() {
return ((unsigned __int128)rand() << 97 ^
(unsigned __int128)rand() << 66 ^
(unsigned __int128)rand() << 35 ^
(unsigned __int128)rand() << 4 ^
(unsigned __int128)rand());
}
#define N 100000000
int main() {
srand(42);
unsigned __int128 *arr = malloc(sizeof(unsigned __int128) * N);
if (arr == NULL) {
return 1;
}
for (size_t n = 0; n < N; ++n) {
arr[n] = rand_u128();
}
#if 1
/* check that modulo 9 is calculated correctly */
for (size_t n = 0; n < N; ++n) {
uint64_t m = mod9_v1(arr[n]);
assert(m == mod9_v2(arr[n]));
assert(m == mod9_v3(arr[n]));
}
#endif
clock_t clk1 = -clock();
uint64_t sum1 = 0;
for (size_t n = 0; n < N; ++n) {
sum1 += mod9_v1(arr[n]);
}
clk1 += clock();
clock_t clk2 = -clock();
uint64_t sum2 = 0;
for (size_t n = 0; n < N; ++n) {
sum2 += mod9_v2(arr[n]);
}
clk2 += clock();
clock_t clk3 = -clock();
uint64_t sum3 = 0;
for (size_t n = 0; n < N; ++n) {
sum3 += mod9_v3(arr[n]);
}
clk3 += clock();
printf("mod9_v1: sum=%"PRIu64", elapsed time: %.3f secs\n", sum1, clk1 / (double)CLOCKS_PER_SEC);
printf("mod9_v2: sum=%"PRIu64", elapsed time: %.3f secs\n", sum2, clk2 / (double)CLOCKS_PER_SEC);
printf("mod9_v3: sum=%"PRIu64", elapsed time: %.3f secs\n", sum3, clk3 / (double)CLOCKS_PER_SEC);
free(arr);
return 0;
}
Here are the timings on my linux server (gcc):
mod9_v1: sum=400041273, elapsed time: 7.992 secs
mod9_v2: sum=400041273, elapsed time: 1.295 secs
mod9_v3: sum=400041273, elapsed time: 1.131 secs
The same code on my Macbook (clang):
mod9_v1: sum=399978071, elapsed time: 32.900 secs
mod9_v2: sum=399978071, elapsed time: 0.204 secs
mod9_v3: sum=399978071, elapsed time: 0.185 secs
In the meantime (while waiting for Bugzilla), you could let the preprocessor do the optimization for you, e.g. define a macro called MOD_INT128(n,d):
#define MODCALC0(n,d) ((65536*n)%d)
#define MODCALC1(n,d) MODCALC0(MODCALC0(n,d),d)
#define MODCALC2(n,d) MODCALC1(MODCALC1(n,d),d)
#define MODCALC3(n,d) MODCALC2(MODCALC1(n,d),d)
#define MODPARAM(n,d,a,b,c) \
((uint64_t)((uint32_t)(n) ) + \
(uint64_t)((uint32_t)(n >> 32) * (uint64_t)a) + \
(uint64_t)((uint32_t)(n >> 64) * (uint64_t)b) + \
(uint64_t)((uint32_t)(n >> 96) * (uint64_t)c) ) % d
#define MOD_INT128(n,d) MODPARAM(n,d,MODCALC1(1,d),MODCALC2(1,d),MODCALC3(1,d))
Now,
uint64_t mod9_v3(unsigned __int128 n)
{
return MOD_INT128( n, 9 );
}
will generate assembly similar to the mod9_v2() function, and
uint64_t mod8_v3(unsigned __int128 n)
{
return MOD_INT128( n, 8 );
}
works fine with the already-existing optimization (GCC 10.2.0).
Why does the following code result in unaligned AVX instructions (MOVUPD instead of MOVAPD)? I compiled this on Visual Studio 2015. How can I tell the compiler that my data is indeed aligned?
const size_t ALIGN_SIZE = 64;
const size_t ARRAY_SIZE = 1024;
double __declspec(align(ALIGN_SIZE)) a[ARRAY_SIZE];
double __declspec(align(ALIGN_SIZE)) b[ARRAY_SIZE];
//Calculate the dotproduct
__m256d ymm0 = _mm256_set1_pd(0.0);
for (int i = 0; i < ARRAY_SIZE; i += 8)
{
__m256d ymm1 = _mm256_load_pd(a + i);
__m256d ymm2 = _mm256_load_pd(b + i);
__m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
ymm0 = _mm256_add_pd(ymm3, ymm0);
__m256d ymm4 = _mm256_load_pd(a + i + 4);
__m256d ymm5 = _mm256_load_pd(b + i + 4);
__m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
ymm0 = _mm256_add_pd(ymm6, ymm0);
}
Assembly of the loop:
00007FF7AC7A1400 vmovupd ymm1,ymmword ptr [rbp+rax*8+2020h]
00007FF7AC7A1409 vmulpd ymm3,ymm1,ymmword ptr [rbp+rax*8+20h]
00007FF7AC7A140F vmovupd ymm2,ymmword ptr [rbp+rax*8]
00007FF7AC7A1415 vmulpd ymm0,ymm2,ymmword ptr b[rax*8]
00007FF7AC7A141E add r8d,8
00007FF7AC7A1422 movsxd rax,r8d
00007FF7AC7A1425 vaddpd ymm1,ymm0,ymm4
00007FF7AC7A1429 vaddpd ymm4,ymm1,ymm3
00007FF7AC7A142D cmp rax,400h
00007FF7AC7A1433 jb main+70h (07FF7AC7A1400h)
There is a way to solve this problem (it allows the use of the VMOVDQA instruction (an analogue of MOVAPD) instead of MOVUPD):
inline __m256d Load(const double * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_pd(p);
#endif
}
Analogous solution for float type:
inline __m256 Load(const float * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_ps(p);
#endif
}
But in order to cheat the Visual Studio compiler you have to use dynamically allocated pointers. Otherwise the compiler doesn't use the VMOVDQA instruction.
#include <immintrin.h>
int main()
{
float * ps = (float*)_mm_malloc(40, 32);
double * pd = (double*)_mm_malloc(40, 32);
__m256 s = Load(ps);
//00007FF79FF81325 vmovdqa ymm1,ymmword ptr [rdi]
__m256d d = Load(pd);
//00007FF79FF8132F vmovdqa ymm0,ymmword ptr [rax]
_mm256_storeu_ps(ps, s);
_mm256_storeu_pd(pd, d);
_mm_free(ps);
_mm_free(pd);
}
I have a large block of data to calculate:
static float source0[COUNT];
static float source1[COUNT];
static float result[COUNT]; /* result[i] = source0[i] * source1[i]; */
size_t i, s0, s1, r;
s0 = (size_t)source0;
s1 = (size_t)source1;
r = (size_t)result;
They are all 32-byte aligned.
The related SSE code:
for(i = 0; i < COUNT; i += 16)
{
__asm volatile
(
"movntdqa xmm0, [%0]\n\t"
"movntdqa xmm1, [%1]\n\t"
"mulps xmm1, xmm0\n\t"
"movntps [%2], xmm1"
: : "r"(s0 + i), "r"(s1 + i), "r"(r + i) : "xmm0", "xmm1"
);
}
The related AVX code:
for(i = 0; i < COUNT; i += 32)
{
__asm volatile
(
"vmovapd ymm0, [%0]\n\t"
"vmovapd ymm1, [%1]\n\t"
"vmulps ymm1, ymm1, ymm0\n\t"
"vmovntps [%2], ymm1"
: : "r"(s0 + i), "r"(s1 + i), "r"(r + i) : "ymm0", "ymm1"
);
}
The result is that the run time of the AVX code is always nearly the same as that of the SSE code, but both are much faster than plain C code.
I think the major reason is that "vmovapd" has no "NT" (non-temporal) load counterpart until the AVX2 extension. This causes too much d-cache pollution.
Is there any better way to exploit the power of AVX (not AVX2)?
Problem
I have been studying HPC, specifically using matrix multiplication as my project (see my other posts in profile). I achieve good performance in those, but not good enough. I am taking a step back to see how well I can do with a dot product calculation.
Dot Product vs. Matrix Multiplication
The dot product is simpler, and will allow me to test HPC concepts without dealing with packing and other related issues. Cache blocking is still an issue, which forms my second question.
Algorithm
Multiply n corresponding elements in two double arrays A and B and sum them. A double dot product in assembly is just a series of movapd, mulpd, addpd. Unrolled and arranged in a clever way, it is possible to have groups of movapd/mulpd/addpd that operate on different xmm registers and are thus independent, optimizing pipelining. Of course, it turns out that this does not matter so much as my CPU has out-of-order execution. Also note that the re-arrangement requires peeling off the last iteration.
Other Assumptions
I am not writing the code for general dot products. The code is for specific sizes and I am not handling fringe cases. This is just to test HPC concepts and to see what type of CPU usage I can attain.
Results
Compiled with gcc -std=c99 -O2 -m32 -mincoming-stack-boundary=2 -msse3 -mfpmath=sse,387 -masm=intel. I am on a different computer than usual. This computer has an i5-540M which can obtain 2.8 GHz * 4 FLOPS/cycle/core = 11.2 GFLOPS/s per core after a two-step Intel Turbo Boost (both cores are on right now so it only gets a 2-step boost...a 4-step boost is possible if I turn off one core). 32-bit LINPACK gets around 9.5 GFLOPS/s when set to run with one thread.
N Total Gflops/s Residual
256 5.580521 1.421085e-014
384 5.734344 -2.842171e-014
512 5.791168 0.000000e+000
640 5.821629 0.000000e+000
768 5.814255 2.842171e-014
896 5.807132 0.000000e+000
1024 5.817208 -1.421085e-013
1152 5.805388 0.000000e+000
1280 5.830746 -5.684342e-014
1408 5.881937 -5.684342e-014
1536 5.872159 -1.705303e-013
1664 5.881536 5.684342e-014
1792 5.906261 -2.842171e-013
1920 5.477966 2.273737e-013
2048 5.620931 0.000000e+000
2176 3.998713 1.136868e-013
2304 3.370095 -3.410605e-013
2432 3.371386 -3.410605e-013
Question 1
How can I do better than this? I am not even coming close to the peak performance. I have optimized the assembly code to high heaven. Further unrolling might boost it just a little more, but less unrolling seems to degrade performance.
Question 2
When n > 2048, you can see a drop in performance. This is because my L1 cache is 32KB, and when n = 2048 and A and B are double, together they take up the entire cache (2 * 2048 * 8 bytes = 32KB). Any bigger and they are streamed from memory.
I tried cache blocking (not shown in source), but maybe I did it wrong. Can anyone provide some code or explain how to block a dot product for a cache?
Source Code
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <windows.h>
// computes 8 dot products
#define KERNEL(address) \
"movapd xmm4, XMMWORD PTR [eax+"#address"] \n\t" \
"mulpd xmm7, XMMWORD PTR [edx+48+"#address"] \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movapd xmm5, XMMWORD PTR [eax+16+"#address"] \n\t" \
"mulpd xmm4, XMMWORD PTR [edx+"#address"] \n\t" \
"addpd xmm3, xmm7 \n\t" \
"movapd xmm6, XMMWORD PTR [eax+96+"#address"] \n\t" \
"mulpd xmm5, XMMWORD PTR [edx+16+"#address"] \n\t" \
"addpd xmm0, xmm4 \n\t" \
"movapd xmm7, XMMWORD PTR [eax+112+"#address"] \n\t" \
"mulpd xmm6, XMMWORD PTR [edx+96+"#address"] \n\t" \
"addpd xmm1, xmm5 \n\t"
#define PEELED(address) \
"movapd xmm4, XMMWORD PTR [eax+"#address"] \n\t" \
"mulpd xmm7, [edx+48+"#address"] \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movapd xmm5, XMMWORD PTR [eax+16+"#address"] \n\t" \
"mulpd xmm4, XMMWORD PTR [edx+"#address"] \n\t" \
"addpd xmm3, xmm7 \n\t" \
"mulpd xmm5, XMMWORD PTR [edx+16+"#address"] \n\t" \
"addpd xmm0, xmm4 \n\t" \
"addpd xmm1, xmm5 \n\t"
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) ddot_ref(
int n,
const double* restrict A,
const double* restrict B)
{
double sum0 = 0.0;
double sum1 = 0.0;
double sum2 = 0.0;
double sum3 = 0.0;
double sum;
for(int i = 0; i < n; i+=4) {
sum0 += *(A + i ) * *(B + i );
sum1 += *(A + i+1) * *(B + i+1);
sum2 += *(A + i+2) * *(B + i+2);
sum3 += *(A + i+3) * *(B + i+3);
}
sum = sum0+sum1+sum2+sum3;
return(sum);
}
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) ddot_asm
( int n,
const double* restrict A,
const double* restrict B)
{
double sum;
__asm__ __volatile__
(
"mov eax, %[A] \n\t"
"mov edx, %[B] \n\t"
"mov ecx, %[n] \n\t"
"pxor xmm0, xmm0 \n\t"
"pxor xmm1, xmm1 \n\t"
"pxor xmm2, xmm2 \n\t"
"pxor xmm3, xmm3 \n\t"
"movapd xmm6, XMMWORD PTR [eax+32] \n\t"
"movapd xmm7, XMMWORD PTR [eax+48] \n\t"
"mulpd xmm6, XMMWORD PTR [edx+32] \n\t"
"sar ecx, 7 \n\t"
"sub ecx, 1 \n\t" // peel
"L%=: \n\t"
KERNEL(64 * 0)
KERNEL(64 * 1)
KERNEL(64 * 2)
KERNEL(64 * 3)
KERNEL(64 * 4)
KERNEL(64 * 5)
KERNEL(64 * 6)
KERNEL(64 * 7)
KERNEL(64 * 8)
KERNEL(64 * 9)
KERNEL(64 * 10)
KERNEL(64 * 11)
KERNEL(64 * 12)
KERNEL(64 * 13)
KERNEL(64 * 14)
KERNEL(64 * 15)
"lea eax, [eax+1024] \n\t"
"lea edx, [edx+1024] \n\t"
" \n\t"
"dec ecx \n\t"
"jnz L%= \n\t" // end loop
" \n\t"
KERNEL(64 * 0)
KERNEL(64 * 1)
KERNEL(64 * 2)
KERNEL(64 * 3)
KERNEL(64 * 4)
KERNEL(64 * 5)
KERNEL(64 * 6)
KERNEL(64 * 7)
KERNEL(64 * 8)
KERNEL(64 * 9)
KERNEL(64 * 10)
KERNEL(64 * 11)
KERNEL(64 * 12)
KERNEL(64 * 13)
KERNEL(64 * 14)
PEELED(64 * 15)
" \n\t"
"addpd xmm0, xmm1 \n\t" // summing result
"addpd xmm2, xmm3 \n\t"
"addpd xmm0, xmm2 \n\t" // cascading add
"movapd xmm1, xmm0 \n\t" // copy xmm0
"shufpd xmm1, xmm0, 0x03 \n\t" // shuffle
"addsd xmm0, xmm1 \n\t" // add low qword
"movsd %[sum], xmm0 \n\t" // mov low qw to sum
: // outputs
[sum] "=m" (sum)
: // inputs
[A] "m" (A),
[B] "m" (B),
[n] "m" (n)
: //register clobber
"memory",
"eax","ecx","edx","edi",
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
);
return(sum);
}
int main()
{
// timers
LARGE_INTEGER frequency, time1, time2;
double time3;
QueryPerformanceFrequency(&frequency);
// clock_t time1, time2;
double gflops;
int nmax = 4096;
int trials = 1e7;
double sum, residual;
FILE *f = fopen("soddot.txt","w+");
printf("%16s %16s %16s\n","N","Total Gflops/s","Residual");
fprintf(f,"%16s %16s %16s\n","N","Total Gflops/s","Residual");
for(int n = 256; n <= nmax; n += 128 ) {
double* A = NULL;
double* B = NULL;
A = _mm_malloc(n*sizeof(*A), 64); if (!A) {printf("A failed\n"); return(1);}
B = _mm_malloc(n*sizeof(*B), 64); if (!B) {printf("B failed\n"); return(1);}
srand(time(NULL));
// create arrays
for(int i = 0; i < n; ++i) {
*(A + i) = (double) rand()/RAND_MAX;
*(B + i) = (double) rand()/RAND_MAX;
}
// warmup
sum = ddot_asm(n,A,B);
QueryPerformanceCounter(&time1);
// time1 = clock();
for (int count = 0; count < trials; count++){
// sum = ddot_ref(n,A,B);
sum = ddot_asm(n,A,B);
}
QueryPerformanceCounter(&time2);
time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
// time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
gflops = (double) (2.0*n*trials)/time3/1.0e9;
residual = ddot_ref(n,A,B) - sum;
printf("%16d %16f %16e\n",n,gflops,residual);
fprintf(f,"%16d %16f %16e\n",n,gflops,residual);
_mm_free(A);
_mm_free(B);
}
fclose(f);
return(0); // successful completion
}
EDIT: explanation of assembly
A dot product is just a repeated sum of products of two numbers: sum += a[i]*b[i]. sum must be initialized to 0 before the first iteration. Vectorized, you can do 2 sums at a time which must be summed at the end: [sum0 sum1] = [a[i] a[i+1]]*[b[i] b[i+1]], sum = sum0 + sum1. In (Intel) assembly, this is 3 steps (after the initialization):
pxor xmm0, xmm0 // accumulator [sum0 sum1] = [0 0]
movapd xmm1, XMMWORD PTR [eax] // load [a[i] a[i+1]] into xmm1
mulpd xmm1, XMMWORD PTR [edx] // xmm1 = xmm1 * [b[i] b[i+1]]
addpd xmm0, xmm1 // xmm0 = xmm0 + xmm1
At this point you have nothing special, the compiler can come up with this. You can usually get better performance by unrolling the code enough times to use all xmm registers available to you (8 registers in 32 bit mode). So if you unroll it 4 times that allows you to utilize all 8 registers xmm0 through xmm7. You will have 4 accumulators and 4 registers for storing the results of movapd and addpd. Again, the compiler can come up with this. The real thinking part is trying to come up with a way to pipeline the code, i.e., make each instruction in the group of MOV/MUL/ADD operate on different registers so that all 3 instructions execute at the same time (usually the case on most CPUs). That's how you beat the compiler. So you have to pattern the 4x unrolled code to do just that, which may require loading vectors ahead of time and peeling off the first or last iteration. This is what KERNEL(address) is. I made a macro of the 4x unrolled pipelined code for convenience. That way I can easily unroll it in multiples of 4 by just changing address. Each KERNEL computes 8 dot products.
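For comparison, a minimal intrinsics sketch of the same 4-accumulator idea (an illustrative helper, not the hand-scheduled kernel above; it assumes n is a multiple of 8 and 16-byte-aligned inputs):
#include <emmintrin.h>  /* SSE2 */

static double ddot_sse2_sketch(int n, const double *A, const double *B)
{
    __m128d acc0 = _mm_setzero_pd(), acc1 = _mm_setzero_pd();
    __m128d acc2 = _mm_setzero_pd(), acc3 = _mm_setzero_pd();
    for (int i = 0; i < n; i += 8) {
        /* four independent accumulators so the multiply/add chains can overlap */
        acc0 = _mm_add_pd(acc0, _mm_mul_pd(_mm_load_pd(A + i    ), _mm_load_pd(B + i    )));
        acc1 = _mm_add_pd(acc1, _mm_mul_pd(_mm_load_pd(A + i + 2), _mm_load_pd(B + i + 2)));
        acc2 = _mm_add_pd(acc2, _mm_mul_pd(_mm_load_pd(A + i + 4), _mm_load_pd(B + i + 4)));
        acc3 = _mm_add_pd(acc3, _mm_mul_pd(_mm_load_pd(A + i + 6), _mm_load_pd(B + i + 6)));
    }
    acc0 = _mm_add_pd(_mm_add_pd(acc0, acc1), _mm_add_pd(acc2, acc3));
    double tmp[2];
    _mm_storeu_pd(tmp, acc0);
    return tmp[0] + tmp[1];
}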
To answer your overall question you can't achieve peak performance with the dot product.
The problem is that your CPU can do one 128-bit load per clock cycle, while the dot product needs two 128-bit loads per clock cycle, so even with everything in L1 the load port caps you at roughly half of the floating point peak.
But it's worse than that for large n. The answer to your second question is that the dot product is memory bound and not compute bound and so it cannot parallelize for large n with fast cores. This is explained better here why-vectorizing-the-loop-does-not-have-performance-improvement. This is a big problem with parallelization with fast cores. It took me a while to figure this out but it's very important to learn.
There are actually few basic algorithms that can fully benefit from parallelization on fast cores. In terms of BLAS algorithms it's only the Level-3 algorithms (O(n^3)) such as matrix multiplication that really benefit from parallelization. The situation is better on slow cores e.g. with GPUs and the Xeon Phi because the discrepancy between memory speed and core speed is much smaller.
If you want to find an algorithm which can get close to peak flops for small n try e.g. scalar * vector or the sum of scalar * vector. The first case should do one load, one mult, and one store every clock cycle and the second case one mult, one add, and one load every clock cycle.
I tested the following code on a Core 2 Duo P9600 @ 2.67GHz in Knoppix 7.3 32-bit. I get about 75% of the peak for the scalar product and 75% of the peak for the sum of the scalar product. The flops/cycle for the scalar product is 2 and for the sum of the scalar product it's 4.
Compiled with g++ -msse2 -O3 -fopenmp foo.cpp -ffast-math
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <x86intrin.h>
void scalar_product(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
double k = 3.14159;
for(int i=0; i<n; i++) {
a[i] = k*a[i];
}
}
void scalar_product_SSE(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
__m128d k = _mm_set1_pd(3.14159);
for(int i=0; i<n; i+=8) {
__m128d t1 = _mm_load_pd(&a[i+0]);
_mm_store_pd(&a[i],_mm_mul_pd(k,t1));
__m128d t2 = _mm_load_pd(&a[i+2]);
_mm_store_pd(&a[i+2],_mm_mul_pd(k,t2));
__m128d t3 = _mm_load_pd(&a[i+4]);
_mm_store_pd(&a[i+4],_mm_mul_pd(k,t3));
__m128d t4 = _mm_load_pd(&a[i+6]);
_mm_store_pd(&a[i+6],_mm_mul_pd(k,t4));
}
}
double scalar_sum(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
double sum = 0.0;
double k = 3.14159;
for(int i=0; i<n; i++) {
sum += k*a[i];
}
return sum;
}
double scalar_sum_SSE(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
__m128d sum1 = _mm_setzero_pd();
__m128d sum2 = _mm_setzero_pd();
__m128d sum3 = _mm_setzero_pd();
__m128d sum4 = _mm_setzero_pd();
__m128d k = _mm_set1_pd(3.14159);
for(int i=0; i<n; i+=8) {
__m128d t1 = _mm_load_pd(&a[i+0]);
sum1 = _mm_add_pd(_mm_mul_pd(k,t1),sum1);
__m128d t2 = _mm_load_pd(&a[i+2]);
sum2 = _mm_add_pd(_mm_mul_pd(k,t2),sum2);
__m128d t3 = _mm_load_pd(&a[i+4]);
sum3 = _mm_add_pd(_mm_mul_pd(k,t3),sum3);
__m128d t4 = _mm_load_pd(&a[i+6]);
sum4 = _mm_add_pd(_mm_mul_pd(k,t4),sum4);
}
double tmp[8];
_mm_storeu_pd(&tmp[0],sum1);
_mm_storeu_pd(&tmp[2],sum2);
_mm_storeu_pd(&tmp[4],sum3);
_mm_storeu_pd(&tmp[6],sum4);
double sum = 0;
for(int i=0; i<8; i++) sum+=tmp[i];
return sum;
}
int main() {
//_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//_mm_setcsr(_mm_getcsr() | 0x8040);
double dtime, peak, flops, sum;
int repeat = 1<<18;
const int n = 2048;
double *a = (double*)_mm_malloc(sizeof(double)*n,64);
double *b = (double*)_mm_malloc(sizeof(double)*n,64);
for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;
dtime = omp_get_wtime();
for(int r=0; r<repeat; r++) {
scalar_product_SSE(a,n);
}
dtime = omp_get_wtime() - dtime;
peak = 2*2.67;
flops = 1.0*n/dtime*1E-9*repeat;
printf("time %f, %f, %f\n", dtime,flops, flops/peak);
//for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;
//sum = 0.0;
dtime = omp_get_wtime();
for(int r=0; r<repeat; r++) {
scalar_sum_SSE(a,n);
}
dtime = omp_get_wtime() - dtime;
peak = 2*2*2.67;
flops = 2.0*n/dtime*1E-9*repeat;
printf("time %f, %f, %f\n", dtime,flops, flops/peak);
//printf("sum %f\n", sum);
}
I have a pointer to an array of bytes mixed that contains the interleaved bytes of two distinct arrays array1 and array2. Say mixed looks something like this:
a1b2c3d4...
What I need to do is de-interleave the bytes so I get array1 = abcd... and array2 = 1234.... I know the length of mixed ahead of time, and the lengths of array1 and array2 are equivalent, both equal to half the length of mixed.
Here is my current implementation (array1 and array2 are already allocated):
int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
array1[i] = mixed[j];
array2[i] = mixed[j+1];
}
This avoids any expensive multiplication or division operations, but still doesn't run fast enough. I'm hoping there is something like memcpy that takes an indexer that can use low-level block copy operations to speed up the process. Is there a faster implementation than what I currently have?
Edit
The target platform is Objective-C for iOS and Mac. A fast operation is more important for iOS devices, so a solution targeting iOS specifically would be better than nothing.
Update
Thanks everyone for the responses, especially Stephen Canon, Graham Lee, and Mecki. Here is my "master" function that uses Stephen's NEON intrinsics if available and otherwise Graham's union cursors with a reduced number of iterations as suggested by Mecki.
void interleave(const uint8_t *srcA, const uint8_t *srcB, uint8_t *dstAB, size_t dstABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics
// iterate 32-bytes at a time
div_t dstABLength_32 = div(dstABLength, 32);
if (dstABLength_32.rem == 0)
{
while (dstABLength_32.quot --> 0)
{
const uint8x16_t a = vld1q_u8(srcA);
const uint8x16_t b = vld1q_u8(srcB);
const uint8x16x2_t ab = { a, b };
vst2q_u8(dstAB, ab);
srcA += 16;
srcB += 16;
dstAB += 32;
}
return;
}
// iterate 16-bytes at a time
div_t dstABLength_16 = div(dstABLength, 16);
if (dstABLength_16.rem == 0)
{
while (dstABLength_16.quot --> 0)
{
const uint8x8_t a = vld1_u8(srcA);
const uint8x8_t b = vld1_u8(srcB);
const uint8x8x2_t ab = { a, b };
vst2_u8(dstAB, ab);
srcA += 8;
srcB += 8;
dstAB += 16;
}
return;
}
#endif
// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration
// iterate 8-bytes at a time
div_t dstABLength_8 = div(dstABLength, 8);
if (dstABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;
uint64_t *dstAB64 = (uint64_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
cursor.narrow.a3 = srcA[j ];
cursor.narrow.b3 = srcB[j++];
cursor.narrow.a4 = srcA[j ];
cursor.narrow.b4 = srcB[j++];
dstAB64[i] = cursor.wide;
}
return;
}
// iterate 4-bytes at a time
div_t dstABLength_4 = div(dstABLength, 4);
if (dstABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;
uint32_t *dstAB32 = (uint32_t *)dstAB;
int j = 0;
for (int i = 0; i < dstABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.narrow.a1 = srcA[j ];
cursor.narrow.b1 = srcB[j++];
cursor.narrow.a2 = srcA[j ];
cursor.narrow.b2 = srcB[j++];
dstAB32[i] = cursor.wide;
}
return;
}
// iterate 2-bytes at a time
div_t dstABLength_2 = div(dstABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;
uint16_t *dstAB16 = (uint16_t *)dstAB;
for (int i = 0; i < dstABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.narrow.a = srcA[i];
cursor.narrow.b = srcB[i];
dstAB16[i] = cursor.wide;
}
}
void deinterleave(const uint8_t *srcAB, uint8_t *dstA, uint8_t *dstB, size_t srcABLength)
{
#if defined __ARM_NEON__
// attempt to use NEON intrinsics
// iterate 32-bytes at a time
div_t srcABLength_32 = div(srcABLength, 32);
if (srcABLength_32.rem == 0)
{
while (srcABLength_32.quot --> 0)
{
const uint8x16x2_t ab = vld2q_u8(srcAB);
vst1q_u8(dstA, ab.val[0]);
vst1q_u8(dstB, ab.val[1]);
srcAB += 32;
dstA += 16;
dstB += 16;
}
return;
}
// iterate 16-bytes at a time
div_t srcABLength_16 = div(srcABLength, 16);
if (srcABLength_16.rem == 0)
{
while (srcABLength_16.quot --> 0)
{
const uint8x8x2_t ab = vld2_u8(srcAB);
vst1_u8(dstA, ab.val[0]);
vst1_u8(dstB, ab.val[1]);
srcAB += 16;
dstA += 8;
dstB += 8;
}
return;
}
#endif
// if the bytes were not aligned properly
// or NEON is unavailable, fall back to
// an optimized iteration
// iterate 8-bytes at a time
div_t srcABLength_8 = div(srcABLength, 8);
if (srcABLength_8.rem == 0)
{
typedef union
{
uint64_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; uint8_t a3; uint8_t b3; uint8_t a4; uint8_t b4; } narrow;
} ab8x8_t;
uint64_t *srcAB64 = (uint64_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_8.quot; i++)
{
ab8x8_t cursor;
cursor.wide = srcAB64[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
dstA[j ] = cursor.narrow.a3;
dstB[j++] = cursor.narrow.b3;
dstA[j ] = cursor.narrow.a4;
dstB[j++] = cursor.narrow.b4;
}
return;
}
// iterate 4-bytes at a time
div_t srcABLength_4 = div(srcABLength, 4);
if (srcABLength_4.rem == 0)
{
typedef union
{
uint32_t wide;
struct { uint8_t a1; uint8_t b1; uint8_t a2; uint8_t b2; } narrow;
} ab8x4_t;
uint32_t *srcAB32 = (uint32_t *)srcAB;
int j = 0;
for (int i = 0; i < srcABLength_4.quot; i++)
{
ab8x4_t cursor;
cursor.wide = srcAB32[i];
dstA[j ] = cursor.narrow.a1;
dstB[j++] = cursor.narrow.b1;
dstA[j ] = cursor.narrow.a2;
dstB[j++] = cursor.narrow.b2;
}
return;
}
// iterate 2-bytes at a time
div_t srcABLength_2 = div(srcABLength, 2);
typedef union
{
uint16_t wide;
struct { uint8_t a; uint8_t b; } narrow;
} ab8x2_t;
uint16_t *srcAB16 = (uint16_t *)srcAB;
for (int i = 0; i < srcABLength_2.quot; i++)
{
ab8x2_t cursor;
cursor.wide = srcAB16[i];
dstA[i] = cursor.narrow.a;
dstB[i] = cursor.narrow.b;
}
}
Off the top of my head, I don't know of a library function for de-interleaving 2 channel byte data. However it's worth filing a bug report with Apple to request such a function.
In the meantime, it's pretty easy to vectorize such a function using NEON or SSE intrinsics. Specifically, on ARM you will want to use vld1q_u8 to load a vector from each source array, vuzpq_u8 to de-interleave them, and vst1q_u8 to store the resulting vectors; here's a rough sketch that I haven't tested or even tried to build, but it should illustrate the general idea. More sophisticated implementations are definitely possible (in particular, NEON can load/store two 16B registers in a single instruction, which the compiler may not do with this, and some amount of pipelining and/or unrolling may be beneficial depending on how long your buffers are):
#if defined __ARM_NEON__
# include <arm_neon.h>
#endif
#include <stdint.h>
#include <stddef.h>
void deinterleave(uint8_t *mixed, uint8_t *array1, uint8_t *array2, size_t mixedLength) {
#if defined __ARM_NEON__
size_t vectors = mixedLength / 32;
mixedLength %= 32;
while (vectors --> 0) {
const uint8x16_t src0 = vld1q_u8(mixed);
const uint8x16_t src1 = vld1q_u8(mixed + 16);
const uint8x16x2_t dst = vuzpq_u8(src0, src1);
vst1q_u8(array1, dst.val[0]);
vst1q_u8(array2, dst.val[1]);
mixed += 32;
array1 += 16;
array2 += 16;
}
#endif
for (size_t i=0; i<mixedLength/2; ++i) {
array1[i] = mixed[2*i];
array2[i] = mixed[2*i + 1];
}
}
I've only tested this lightly but it seemed at least twice as fast as your version:
typedef union {
uint16_t wide;
struct { uint8_t top; uint8_t bottom; } narrow;
} my_union;
uint16_t *source = (uint16_t *)mixed;
for (int i = 0; i < mixedLength/2; i++)
{
my_union cursor;
cursor.wide = source[i];
array1[i] = cursor.narrow.top;
array2[i] = cursor.narrow.bottom;
}
Notice that I wasn't careful with structure packing, but in this case on this architecture that isn't a problem. Notice also that someone might complain about my choice of naming top and bottom; I assume you know which half of each integer you need.
Okay, here is your original method:
static void simpleDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i, j;
int mixedLength_2 = mixedLength / 2;
for (i = 0, j = 0; i < mixedLength_2; i++, j += 2)
{
array1[i] = mixed[j];
array2[i] = mixed[j+1];
}
}
With 10 million entries and -O3 (compiler shall optimize for maximum speed), I can run this 154 times per second on my Mac.
Here is my first suggestion:
static void structDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i;
int len;
uint8_t * array1Ptr = (uint8_t *)array1;
uint8_t * array2Ptr = (uint8_t *)array2;
struct {
uint8_t byte1;
uint8_t byte2;
} * tb = (void *)mixed;
len = mixedLength / 2;
for (i = 0; i < len; i++) {
*(array1Ptr++) = tb->byte1;
*(array2Ptr++) = tb->byte2;
tb++;
}
}
Same count and optimization as before, I get 193 runs per second.
Now the suggestion from Graham Lee:
static void unionDeint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
union my_union {
uint16_t wide;
struct { uint8_t top; uint8_t bottom; } narrow;
};
uint16_t * source = (uint16_t *)mixed;
for (int i = 0; i < mixedLength/2; i++) {
union my_union cursor;
cursor.wide = source[i];
array1[i] = cursor.narrow.top;
array2[i] = cursor.narrow.bottom;
}
}
Same setup as before, 198 runs per second (NOTE: This method is not endian-safe; the result depends on CPU endianness. In your case array1 and array2 are probably swapped since ARM is little-endian, so you would have to swap them in the code).
Here's my best one so far:
static void uint32Deint (
uint8_t * array1, uint8_t * array2, uint8_t * mixed, int mixedLength
) {
int i;
int count;
uint32_t * fourBytes = (void *)mixed;
uint8_t * array1Ptr = (uint8_t *)array1;
uint8_t * array2Ptr = (uint8_t *)array2;
count = mixedLength / 4;
for (i = 0; i < count; i++) {
uint32_t temp = *(fourBytes++);
#if __LITTLE_ENDIAN__
*(array1Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array2Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array1Ptr++) = (uint8_t)(temp & 0xFF);
temp >>= 8;
*(array2Ptr++) = (uint8_t)(temp & 0xFF);
#else
*(array1Ptr++) = (uint8_t)(temp >> 24);
*(array2Ptr++) = (uint8_t)((temp >> 16) & 0xFF);
*(array1Ptr++) = (uint8_t)((temp >> 8) & 0xFF);
*(array2Ptr++) = (uint8_t)(temp & 0xFF);
#endif
}
// Either it is a multiple of 4 or a multiple of 2.
// If it is a multiple of 2, 2 bytes are left over.
if (count * 4 != mixedLength) {
*(array1Ptr) = mixed[mixedLength - 2];
*(array2Ptr) = mixed[mixedLength - 1];
}
}
Same setup as above, 219 times a second and, unless I made a mistake, it should work with either endianness.
I recommend Graham's solution, but if this is really speed critical and you are willing to go Assembler, you can get even faster.
The idea is this:
Read an entire 32bit integer from mixed. You'll get 'a1b2'.
Rotate the lower 16bit by 8 bits to get '1ab2' (we are using little-endian, since this is the default on ARM and therefore on Apple's A-series chips, so the first two bytes are the lower ones).
Rotate the entire 32bit register right (I think it's right...) by 8 bits to get '21ab'.
Rotate the lower 16bit by 8 bits to get '12ab'
Write the lower 8 bits to array2.
Rotate the entire 32bit register by 16bit.
Write the lower 8 bits to array1
Advance array1 by 16bit, array2 by 16bit, and mixed by 32bit.
Repeat.
We have traded 2 memory reads (assuming we use Graham's version or equivalent) and 4 memory writes for one memory read, two memory writes and 4 register operations. While the number of operations has gone up from 6 to 7, register operations are faster than memory operations, so it's more efficient that way. Also, since we read from mixed 32bit at a time instead of 16, we cut iteration management in half.
PS: Theoretically this can also be done for 64bit architecture, but doing all those rotations for 'a1b2c3d4' will drive you to madness.
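For reference, a rough C sketch of the same word-at-a-time idea, using shifts instead of rotates (an illustrative helper; little-endian assumed and mixedLength a multiple of 4):
#include <stdint.h>

static void wordDeint(uint8_t *array1, uint8_t *array2, const uint8_t *mixed, int mixedLength)
{
    const uint32_t *src = (const uint32_t *)mixed;  /* one 32bit read gives 'a1b2' */
    uint16_t *dst1 = (uint16_t *)array1;
    uint16_t *dst2 = (uint16_t *)array2;
    for (int i = 0; i < mixedLength / 4; i++) {
        uint32_t w = src[i];
        dst1[i] = (uint16_t)((w & 0xFF) | ((w >> 8) & 0xFF00));         /* bytes a,b with one 16bit write */
        dst2[i] = (uint16_t)(((w >> 8) & 0xFF) | ((w >> 16) & 0xFF00)); /* bytes 1,2 with one 16bit write */
    }
}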
For x86 SSE, the pack and punpck instructions are what you need. Examples using AVX for the convenience of non-destructive 3-operand instructions. (Not using AVX2 256b-wide instructions, because the 256b pack/unpck instructions do two 128b unpacks in the low and high 128b lanes, so you'd need a shuffle to get things in the correct final order.)
An intrinsics version of the following would work the same. Asm instructions are shorter to type for just writing a quick answer.
Interleave: abcd and 1234 -> a1b2c3d4:
# loop body:
vmovdqu (%rax), %xmm0 # load the sources
vmovdqu (%rbx), %xmm1
vpunpcklbw %xmm0, %xmm1, %xmm2 # low halves -> 128b reg
vpunpckhbw %xmm0, %xmm1, %xmm3 # high halves -> 128b reg
vmovdqu %xmm2, (%rdi) # store the results
vmovdqu %xmm3, 16(%rdi)
# blah blah some loop structure.
`punpcklbw` interleaves the bytes in the low 64 bits of the two source `xmm` registers. There are `..wd` (word->dword), and dword->qword versions which would be useful for 16 or 32bit elements.
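A minimal intrinsics version of that interleave loop body might look like this (an illustrative helper; it assumes 16-byte-aligned sources and n a multiple of 16):
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stddef.h>

static void interleave_sse2(uint8_t *dstAB, const uint8_t *a, const uint8_t *b, size_t n)
{
    for (size_t i = 0; i < n; i += 16) {
        __m128i va = _mm_load_si128((const __m128i *)(a + i));
        __m128i vb = _mm_load_si128((const __m128i *)(b + i));
        /* byte interleave: a0,b0,a1,b1,... from the low then the high halves */
        _mm_store_si128((__m128i *)(dstAB + 2*i),      _mm_unpacklo_epi8(va, vb));
        _mm_store_si128((__m128i *)(dstAB + 2*i + 16), _mm_unpackhi_epi8(va, vb));
    }
}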
De-interleave: a1b2c3d4 -> abcd and 1234
#outside the loop
vpcmpeqb %xmm5, %xmm5, %xmm5 # set to all-1s
vpsrlw $8, %xmm5, %xmm5 # every 16b word has low 8b = 0xFF, high 8b = 0.
# loop body
vmovdqu (%rsi), %xmm2 # load two src chunks
vmovdqu 16(%rsi), %xmm3
vpand %xmm2, %xmm5, %xmm0 # mask to leave only the odd bytes
vpand %xmm3, %xmm5, %xmm1
vpackuswb %xmm0, %xmm1, %xmm4
vmovdqu %xmm4, (%rax) # store 16B of a[]
vpsrlw $8, %xmm2, %xmm6 # even bytes -> odd bytes
vpsrlw $8, %xmm3, %xmm7
vpackuswb %xmm6, %xmm7, %xmm4
vmovdqu %xmm4, (%rbx)
This can of course use a lot fewer registers. I avoided reusing registers for readability, not performance. Hardware register renaming makes reuse a non-issue, as long as you start with something that doesn't depend on the previous value. (e.g. movd, not movss or pinsrd.)
Deinterleave is so much more work because the pack instructions do signed or unsigned saturation, so the upper 8b of each 16b element has to be zeroed first.
An alternative would be to use pshufb to pack the odd or even words of a single source reg into the low 64 of a register. However, outside of the AMD XOP instruction set's VPPERM, there isn't a shuffle that can select bytes from 2 registers at once (like Altivec's much-loved vperm). So with just SSE/AVX, you'd need 2 shuffles for every 128b of interleaved data. And since store-port usage could be the bottleneck, you'd want a punpck to combine two 64bit chunks of a into a single register to set up a 128b store.
With AMD XOP, deinterleave would be 2x128b loads, 2 VPPERM, and 2x128b stores.
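A minimal intrinsics sketch of the mask-and-pack de-interleave above (again an illustrative helper; it assumes n, the byte length of the interleaved buffer, is a multiple of 32):
#include <emmintrin.h>  /* SSE2 */
#include <stdint.h>
#include <stddef.h>

static void deinterleave_sse2(uint8_t *a, uint8_t *b, const uint8_t *ab, size_t n)
{
    const __m128i lo_mask = _mm_set1_epi16(0x00FF);  /* keep the low byte of each 16b word */
    for (size_t i = 0; i < n; i += 32) {
        __m128i v0 = _mm_loadu_si128((const __m128i *)(ab + i));
        __m128i v1 = _mm_loadu_si128((const __m128i *)(ab + i + 16));
        /* a bytes: mask, then pack words to bytes (no saturation since the high 8 bits are 0) */
        _mm_storeu_si128((__m128i *)(a + i/2),
                         _mm_packus_epi16(_mm_and_si128(v0, lo_mask), _mm_and_si128(v1, lo_mask)));
        /* b bytes: shift them down into the low byte of each word, then pack */
        _mm_storeu_si128((__m128i *)(b + i/2),
                         _mm_packus_epi16(_mm_srli_epi16(v0, 8), _mm_srli_epi16(v1, 8)));
    }
}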
First, the usual caveats: premature optimisation is bad, and your compiler is probably better at optimising than you are.
That said, there are things you can do to help out the compiler because you have semantic knowledge of your data that a compiler cannot have:
read and write as many bytes as you can, up to the native word size - memory operations are expensive, so do manipulations in registers where possible
unroll loops - look into "Duff's Device".
FWIW, I produced two versions of your copy loop, one much the same as yours, the second using what most would consider "optimal" (albeit still simple) C code:
typedef unsigned char byte;  /* assuming byte is an unsigned 8-bit type */
void test1(byte *p, byte *p1, byte *p2, int n)
{
int i, j;
for (i = 0, j = 0; i < n / 2; i++, j += 2) {
p1[i] = p[j];
p2[i] = p[j + 1];
}
}
void test2(byte *p, byte *p1, byte *p2, int n)
{
while (n) {
*p1++ = *p++;
*p2++ = *p++;
n--; n--;
}
}
With gcc -O3 -S on Intel x86 they both produced almost identical assembly code. Here are the inner loops:
LBB1_2:
movb -1(%rdi), %al
movb %al, (%rsi)
movb (%rdi), %al
movb %al, (%rdx)
incq %rsi
addq $2, %rdi
incq %rdx
decq %rcx
jne LBB1_2
and
LBB2_2:
movb -1(%rdi), %al
movb %al, (%rsi)
movb (%rdi), %al
movb %al, (%rdx)
incq %rsi
addq $2, %rdi
incq %rdx
addl $-2, %ecx
jne LBB2_2
Both have the same number of instructions, the difference accounted for solely because the first version counts up to n / 2, and the second counts down to zero.
EDIT here's a better version:
/* non-portable - assumes little endian */
typedef unsigned short ushort;  /* assuming ushort is an unsigned 16-bit type */
void test3(byte *p, byte *p1, byte *p2, int n)
{
ushort *ps = (ushort *)p;
n /= 2;
while (n--) {
ushort w = *ps++;  /* renamed so it does not shadow the loop counter n */
*p1++ = w;
*p2++ = w >> 8;
}
}
resulting in:
LBB3_2:
movzwl (%rdi), %ecx
movb %cl, (%rsi)
movb %ch, (%rdx) # NOREX
addq $2, %rdi
incq %rsi
incq %rdx
decq %rax
jne LBB3_2
which is one fewer instruction because it takes advantage of the immediate access to %cl and %ch.