Why does _mm256_load_pd compile to MOVUPD instead of MOVAPD? - c

Why does the following code result in unaligned AVX instructions (MOVUPD instead of MOVAPD)? I compiled this with Visual Studio 2015. How can I tell the compiler that my data is indeed aligned?
const size_t ALIGN_SIZE = 64;
const size_t ARRAY_SIZE = 1024;
double __declspec(align(ALIGN_SIZE)) a[ARRAY_SIZE];
double __declspec(align(ALIGN_SIZE)) b[ARRAY_SIZE];
//Calculate the dotproduct
__m256d ymm0 = _mm256_set1_pd(0.0);
for (int i = 0; i < ARRAY_SIZE; i += 8)
{
__m256d ymm1 = _mm256_load_pd(a + i);
__m256d ymm2 = _mm256_load_pd(b + i);
__m256d ymm3 = _mm256_mul_pd(ymm1, ymm2);
ymm0 = _mm256_add_pd(ymm3, ymm0);
__m256d ymm4 = _mm256_load_pd(a + i + 4);
__m256d ymm5 = _mm256_load_pd(b + i + 4);
__m256d ymm6 = _mm256_mul_pd(ymm4, ymm5);
ymm0 = _mm256_add_pd(ymm6, ymm0);
}
Assembly of the loop:
00007FF7AC7A1400 vmovupd ymm1,ymmword ptr [rbp+rax*8+2020h]
00007FF7AC7A1409 vmulpd ymm3,ymm1,ymmword ptr [rbp+rax*8+20h]
00007FF7AC7A140F vmovupd ymm2,ymmword ptr [rbp+rax*8]
00007FF7AC7A1415 vmulpd ymm0,ymm2,ymmword ptr b[rax*8]
00007FF7AC7A141E add r8d,8
00007FF7AC7A1422 movsxd rax,r8d
00007FF7AC7A1425 vaddpd ymm1,ymm0,ymm4
00007FF7AC7A1429 vaddpd ymm4,ymm1,ymm3
00007FF7AC7A142D cmp rax,400h
00007FF7AC7A1433 jb main+70h (07FF7AC7A1400h)

There is a way to work around this problem (it gets the compiler to use the VMOVDQA instruction, the integer analogue of MOVAPD, instead of MOVUPD):
inline __m256d Load(const double * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_pd(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_pd(p);
#endif
}
Analogous solution for float type:
inline __m256 Load(const float * p)
{
#ifdef _MSC_VER
return _mm256_castsi256_ps(_mm256_load_si256((__m256i*)p));
#else
return _mm256_load_ps(p);
#endif
}
But in order to trick the Visual Studio compiler you have to use dynamically allocated pointers; otherwise the compiler still doesn't use the VMOVDQA instruction.
#include <immintrin.h>
int main()
{
float * ps = (float*)_mm_malloc(40, 32);
double * pd = (double*)_mm_malloc(40, 32);
__m256 s = Load(ps);
//00007FF79FF81325 vmovdqa ymm1,ymmword ptr [rdi]
__m256d d = Load(pd);
//00007FF79FF8132F vmovdqa ymm0,ymmword ptr [rax]
_mm256_storeu_ps(ps, s);
_mm256_storeu_pd(pd, d);
_mm_free(ps);
_mm_free(pd);
}

Related

Using AVX to improve performance of float subtract, divide, truncate to int32

Trying to use AVX to improve performance of the following
__declspec(dllexport) void __cdecl calculate_quantized_vertical_values(long length, float min, float step, float* source, unsigned long* destination)
{
for (long i = 0; i < length; i++)
{
destination[i] = (source[i] - min) / step;
}
}
by replacing it with
__declspec(dllexport) void __cdecl calculate_quantized_vertical_values_avx(long length, float min, float step, float* source, unsigned long* destination)
{
long multiple8end = ((long)(length / 8)) * 8;
__m256 min256 = _mm256_broadcast_ss((const float*)&min);
__m256 step256 = _mm256_broadcast_ss((const float*)&step);
for (long i = 0; i < multiple8end; i+=8)
{
__m256 value256 = _mm256_load_ps((const float*)(source + i));
__m256 offset256 = _mm256_sub_ps(value256, min256);
__m256 floatres256 = _mm256_div_ps(offset256, step256);
__m256i long256 = _mm256_cvttps_epi32(floatres256);
_mm256_store_si256((__m256i*)(destination + i), long256);
}
for (long i = multiple8end; i < length; i ++)
{
destination[i] = (source[i] - min) / step;
}
}
The original loop takes around 330ms with my 55M element source array and the contents of the loop compile to
loc_180001050:
movss xmm0, dword ptr [r10+rcx-4]
subss xmm0, xmm3
divss xmm0, xmm2
cvttss2si rax, xmm0
mov [rcx-4], eax
movss xmm1, dword ptr [r10+rcx]
subss xmm1, xmm3
divss xmm1, xmm2
cvttss2si rax, xmm1
mov [rcx], eax
movss xmm0, dword ptr [r10+rcx+4]
subss xmm0, xmm3
divss xmm0, xmm2
cvttss2si rax, xmm0
mov [rcx+4], eax
movss xmm1, dword ptr [r10+rcx+8]
subss xmm1, xmm3
divss xmm1, xmm2
cvttss2si rax, xmm1
mov [rcx+8], eax
add rcx, 10h
sub r8, 1
jnz short loc_180001050
The AVX loop takes around 170ms over the same 55M element source array and the contents of the (main) loop compile to:
loc_180001160:
vmovups ymm0, ymmword ptr [r8+rdx]
lea rdx, [rdx+20h]
vsubps ymm1, ymm0, ymm6
vdivps ymm2, ymm1, ymm7
vcvttps2dq ymm3, ymm2
vmovdqu ymmword ptr [rdx-20h], ymm3
sub rax, 1
jnz short loc_180001160
So there IS a performance improvement with AVX, but I wonder whether it's possible to get a more significant improvement or whether this is about the limit for this particular calculation.
Edit: I should also mention that I'm calling these DLL functions from a .NET app, in case it makes any difference.
Edit: I would ideally want an unsigned char array for the destination, but I'm sticking with int32 for now because I've not found a way to do the float -> unsigned char conversion with AVX.
Also, multiplication by 1.f/step instead of division by step would be fine for me if it improves performance.
If you scale by 1/step instead of dividing by step, you should be significantly faster, unless you are limited by memory throughput. And if you factor out the subtraction of min, you can also use FMA instructions, if they are available:
void calculate_quantized_vertical_values_avx(size_t length, float min, float step, float* source, uint32_t* destination)
{
size_t multiple8end = ((length / 8)) * 8;
const float scale = 1.f/step;
const float offset = -min * scale;
const __m256 scale256 = _mm256_set1_ps(scale);
const __m256 offset256 = _mm256_set1_ps(offset);
for (size_t i = 0; i < multiple8end; i+=8)
{
__m256 value256 = _mm256_load_ps((const float*)(source + i));
#ifdef __FMA__
__m256 floatres256 = _mm256_fmadd_ps(value256, scale256, offset256);
#else
__m256 floatres256 = _mm256_add_ps(_mm256_mul_ps(value256, scale256), offset256);
#endif
__m256i long256 = _mm256_cvttps_epi32(floatres256);
_mm256_store_si256((__m256i*)(destination + i), long256);
}
for (size_t i = multiple8end; i < length; i ++)
{
destination[i] = (source[i] * scale) + offset;
}
}
If you want to convert the result to uint8, have a look at _mm256_packus_epi32 and _mm256_packus_epi16 (or _mm_packus_epi32 and _mm_packus_epi16 if you don't have AVX2).
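For illustration, here is one possible AVX2 sketch of that packing step (my own example, not tested against your data; it assumes AVX2 is available and that the converted values already fit in the 0..255 range):
#include <immintrin.h>
// Packs 32 int32 values (four __m256i) down to 32 uint8 values (one __m256i)
// with unsigned saturation. The 256-bit pack instructions operate within each
// 128-bit lane, so a final permute restores the original element order.
static inline __m256i pack_u32_to_u8(__m256i a, __m256i b, __m256i c, __m256i d)
{
    __m256i ab   = _mm256_packus_epi32(a, b);    // 32-bit -> 16-bit
    __m256i cd   = _mm256_packus_epi32(c, d);
    __m256i abcd = _mm256_packus_epi16(ab, cd);  // 16-bit -> 8-bit
    const __m256i order = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    return _mm256_permutevar8x32_epi32(abcd, order);
}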

How to convert this assembly code to intrinsic code?

The code below looks like it could be converted to intrinsics; however, I am not familiar with intrinsic functions. Please help me convert it to real code. testFunc() in particular is the most ambiguous part for me.
I guess it is also a dot product of two float vectors, but the labels Lrep and Lexit confuse me.
Please spell it out clearly for me.
Also, are intrinsics available for mobile processors?
void testFunc(int M, int N, int K, float* A, float* B, float* C)
{
float *a;
float *b = new float[K*N];
float *pointb = B;
float *bb;
float *answer = C;
float c[8];
for (int j = 0, k; j < K; j++) {
bb = b + j;
for (k = N / 8; k > 0; k--) {
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
*bb = *pointb++; bb += K;
}
for (k = N / 8 * 8; k < N; k++) {
*bb = *pointb++; bb += K;
}
}
int K8 = K / 8 * 8;
for (int i = 0; i < M; i++) for (int k = 0; k < N; k++) {
a = A + i * K;
bb = b + k * K;
__asm {
mov esi, K8;
sub esi, 8;
shl esi, 2;
xor edi, edi;
mov edx, a;
mov ebx, bb;
vxorps ymm3, ymm3, ymm3;
Lrep:
cmp edi, esi;
jg Lexit;
vmovups ymm0, ymmword ptr[edx + edi];
vfmadd231ps ymm3, ymm0, ymmword ptr[ebx + edi];
add edi, 32;
jmp Lrep;
Lexit:
vmovups ymmword ptr[c], ymm3;
}
for (int j = K8; j < K; ) {
*c += *(a + j) * *(bb + j); j++;
}
*answer = (c[0] + c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7]);
answer++;
}
}
and
pA = A;
for (k = 0; k < K; k++) {
pC = C;
for (i = 0; i < M; i++) {
pA = A + i * K + k;
pB = B + k * N;
for (j = N / 32; j > 0; j--) {
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
_asm {
mov eax, pC;
mov ebx, pA;
mov ecx, pB;
vmovups ymm0, ymmword ptr[eax];
vmovss xmm1, dword ptr[ebx];
vbroadcastss ymm4, xmm1;
vmovups ymm2, ymmword ptr[ecx];
vfmadd231ps ymm0, ymm4, ymm2;
vmovups ymmword ptr[eax], ymm0;
}
pC += 8; pB += 8;
}
for (j = N / 32 * 32; j < N; j++) {
*pC += *pA * *pB;
pC += 1; pB += 1;
}
}
}
In intrinsics, it's this code repeated 4 times.
{
// vmovups ymm0, ymmword ptr[eax];
__m256 tempC = _mm256_loadu_ps((float*)pC);
// vmovss xmm1, dword ptr[ebx];
// vbroadcastss ymm4, xmm1;
__m256 tempA = _mm256_set1_ps(*pA);
// vmovups ymm2, ymmword ptr[ecx];
__m256 tempB = _mm256_loadu_ps((float*)pB);
// vfmadd231ps ymm0, ymm4, ymm2;
__m256 result = _mm256_fmadd_ps(tempA, tempB, tempC);
// vmovups ymmword ptr[eax], ymm0;
_mm256_storeu_ps(pC, result);
}
pC += 8; pB += 8;
Constantly broadcasting the same value from pA seems a bit redundant though.
2 vector loads (from the same position in 2 arrays) feeding an FMA into a vector accumulator smells like a dot-product to me.
I didn't check the asm reference manual to see that the destination operand was the sum rather than 1 of the multiplicands, but that's the way that makes sense.
The triple-nested loop looks like a matrix multiplication. It broadcasts 1 input while doing a vector load from the other to feed an FMA, so probably it's generating a SIMD vector of results for an output row.
Using MSVC inline asm syntax for this is pretty bad; it can only accept inputs via memory operands so it forces a reload + store between each block of asm. If you're going to unroll, use one big asm statement and use displacements in the addressing modes.
IDK why the dot-product loop is written inefficiently (with both a conditional and an unconditional branch inside the loop), and not unrolled with multiple accumulators. That pretty much defeats the purpose of hand-coding in asm. See Why does mulss take only 3 cycles on Haswell, different from Agner's instruction tables? for how to use multiple accumulators to hide FMA latency. Or let clang do it for you when unrolling+vectorizing a pure C loop.
I also don't know why it doesn't horizontal-sum the result, but instead just stores it to memory with vmovups [c], ymm3. Seems pointless. I guess the caller has to reload from memory and sum, or you could declare the function as returning a __m256 vector and ignore the store.
Anyway, you can obviously write a dot-product in scalar C code, perhaps using fma(a[i], b[i], sum) from math.h to replicate the asm's behaviour of not rounding the temporary result.
Or copy the manual vectorization with intrinsics, like sum = _mm256_fmadd_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]), sum); or something. (See Intel's intrinsics guide.)
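As a rough intrinsics sketch of what the Lrep/Lexit loop appears to compute (my own reading of the asm, assuming AVX and FMA are available; the horizontal sum and scalar tail from the surrounding C code are folded in):
#include <immintrin.h>
static float dot_product(const float *a, const float *b, int k)
{
    __m256 acc = _mm256_setzero_ps();                     // ymm3 in the asm
    int k8 = k / 8 * 8;
    for (int i = 0; i < k8; i += 8)                       // the Lrep loop
        acc = _mm256_fmadd_ps(_mm256_loadu_ps(a + i),
                              _mm256_loadu_ps(b + i), acc);
    float c[8];
    _mm256_storeu_ps(c, acc);                             // vmovups [c], ymm3
    float sum = c[0]+c[1]+c[2]+c[3]+c[4]+c[5]+c[6]+c[7];  // horizontal sum
    for (int i = k8; i < k; i++)                          // scalar remainder
        sum += a[i] * b[i];
    return sum;
}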
I'll do the first couple of lines to get you started, but really, if you can't read the assembly you'll need to refer to the Intel CPU manual to be able to decipher it.
mov esi, K8;
sub esi, 8;
shl esi, 2;
xor edi, edi;
mov edx, a;
mov ebx, bb;
mov esi, K8 : copy the contents of K8 into esi
sub esi, 8 : subtract 8 from the value in esi
shl esi, 2 : shift esi left by 2 bits (multiply it by 4), keeping the result in esi
xor edi, edi : xor edi against itself, which zeroes it (the reason will be clear if you understand binary and how registers work)
mov edx, a : copy the contents of a into edx
mov ebx, bb : copy the contents of bb into ebx
From here, depending on where your knowledge is at, you'll need to familiarise yourself with binary, basic CPU architecture, and the assembly-language operands that are relevant to your problem. Once you can read each line, you can decipher the blocks and finally the program.

AVX is not faster than SSE?

I have large block of data to calculate:
static float source0[COUNT];
static float source1[COUNT];
static float result[COUNT]; /* result[i] = source0[i] * source1[i]; */
s0 = (size_t)source0;
s1 = (size_t)source1;
r = (size_t)result;
They are all 32-byte aligned.
The related SSE code:
for(i = 0; i < COUNT; i += 16)
{
__asm volatile
(
"movntdqa xmm0, [%0]\n\t"
"movntdqa xmm1, [%1]\n\t"
"mulps xmm1, xmm0\n\t"
"movntps [%2], xmm1"
: : "r"(s0 + i), "r"(s1 + i), "r"(r + i) : "xmm0", "xmm1"
);
}
The related AVX code:
for(i = 0; i < COUNT; i += 32)
{
__asm volatile
(
"vmovapd ymm0, [%0]\n\t"
"vmovapd ymm1, [%1]\n\t"
"vmulps ymm1, ymm1, ymm0\n\t"
"vmovntps [%2], ymm1"
: : "r"(s0 + i), "r"(s1 + i), "r"(r + i) : "ymm0", "ymm1"
);
}
The result is that the AVX code's run time is always nearly the same as the SSE code's, but both are much faster than the normal C code.
I think the major reason is that "vmovapd" does not have an "NT" (non-temporal load) equivalent until the AVX2 extension. This causes too much d-cache pollution.
Is there any better way to explore the power of AVX (not AVX2)?
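For reference, here is how the same AVX loop could look with intrinsics instead of inline asm (my own sketch, assuming 32-byte aligned buffers and plain AVX; the non-temporal store vmovntps is available for ymm in AVX1, it is only the non-temporal load vmovntdqa for ymm that needs AVX2):
#include <immintrin.h>
void mul_avx(const float *source0, const float *source1, float *result, size_t count)
{
    for (size_t i = 0; i < count; i += 8)
    {
        __m256 a = _mm256_load_ps(source0 + i);             // regular aligned loads
        __m256 b = _mm256_load_ps(source1 + i);
        _mm256_stream_ps(result + i, _mm256_mul_ps(a, b));  // NT store (vmovntps)
    }
    _mm_sfence();  // make the non-temporal stores globally visible
}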

How Do I Attain Peak CPU Performance With Dot Product?

Problem
I have been studying HPC, specifically using matrix multiplication as my project (see my other posts in profile). I achieve good performance in those, but not good enough. I am taking a step back to see how well I can do with a dot product calculation.
Dot Product vs. Matrix Multiplication
The dot product is simpler, and will allow me to test HPC concepts without dealing with packing and other related issues. Cache blocking is still an issue, which forms my second question.
Algorithm
Multiply n corresponding elements in two double arrays A and B and sum them. A double dot product in assembly is just a series of movapd, mulpd, addpd. Unrolled and arranged in a clever way, it is possible to have groups of movapd/mulpd/addpd that operate on different xmm registers and are thus independent, optimizing pipelining. Of course, it turns out that this does not matter so much as my CPU has out-of-order execution. Also note that the re-arrangement requires peeling off the last iteration.
Other Assumptions
I am not writing the code for general dot products. The code is for specific sizes and I am not handling fringe cases. This is just to test HPC concepts and to see what type of CPU usage I can attain.
Results
Compiled with gcc -std=c99 -O2 -m32 -mincoming-stack-boundary=2 -msse3 -mfpmath=sse,387 -masm=intel. I am on a different computer than usual. This computer has an i5-540M, which can obtain 2.8 GHz * 4 FLOPS/cycle/core = 11.2 GFLOPS/s per core after a two-step Intel Turbo Boost (both cores are on right now, so it only gets a 2-step boost; a 4-step boost is possible if I turn off one core). 32-bit LINPACK gets around 9.5 GFLOPS/s when set to run with one thread.
N Total Gflops/s Residual
256 5.580521 1.421085e-014
384 5.734344 -2.842171e-014
512 5.791168 0.000000e+000
640 5.821629 0.000000e+000
768 5.814255 2.842171e-014
896 5.807132 0.000000e+000
1024 5.817208 -1.421085e-013
1152 5.805388 0.000000e+000
1280 5.830746 -5.684342e-014
1408 5.881937 -5.684342e-014
1536 5.872159 -1.705303e-013
1664 5.881536 5.684342e-014
1792 5.906261 -2.842171e-013
1920 5.477966 2.273737e-013
2048 5.620931 0.000000e+000
2176 3.998713 1.136868e-013
2304 3.370095 -3.410605e-013
2432 3.371386 -3.410605e-013
Question 1
How can I do better than this? I am not even coming close to the peak performance. I have optimized the assembly code to high heaven. Further unrolling might boost it just a little more, but less unrolling seems to degrade performance.
Question 2
When n > 2048, you can see a drop in performance. This is because my L1 cache is 32 KB, and when n = 2048 and A and B are double, they together take up the entire cache (2 arrays * 2048 elements * 8 bytes = 32 KB). Any bigger and they are streamed from memory.
I tried cache blocking (not shown in source), but maybe I did it wrong. Can anyone provide some code or explain how to block a dot product for a cache?
Source Code
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <x86intrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <windows.h>
// computes 8 dot products
#define KERNEL(address) \
"movapd xmm4, XMMWORD PTR [eax+"#address"] \n\t" \
"mulpd xmm7, XMMWORD PTR [edx+48+"#address"] \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movapd xmm5, XMMWORD PTR [eax+16+"#address"] \n\t" \
"mulpd xmm4, XMMWORD PTR [edx+"#address"] \n\t" \
"addpd xmm3, xmm7 \n\t" \
"movapd xmm6, XMMWORD PTR [eax+96+"#address"] \n\t" \
"mulpd xmm5, XMMWORD PTR [edx+16+"#address"] \n\t" \
"addpd xmm0, xmm4 \n\t" \
"movapd xmm7, XMMWORD PTR [eax+112+"#address"] \n\t" \
"mulpd xmm6, XMMWORD PTR [edx+96+"#address"] \n\t" \
"addpd xmm1, xmm5 \n\t"
#define PEELED(address) \
"movapd xmm4, XMMWORD PTR [eax+"#address"] \n\t" \
"mulpd xmm7, [edx+48+"#address"] \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movapd xmm5, XMMWORD PTR [eax+16+"#address"] \n\t" \
"mulpd xmm4, XMMWORD PTR [edx+"#address"] \n\t" \
"addpd xmm3, xmm7 \n\t" \
"mulpd xmm5, XMMWORD PTR [edx+16+"#address"] \n\t" \
"addpd xmm0, xmm4 \n\t" \
"addpd xmm1, xmm5 \n\t"
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) ddot_ref(
int n,
const double* restrict A,
const double* restrict B)
{
double sum0 = 0.0;
double sum1 = 0.0;
double sum2 = 0.0;
double sum3 = 0.0;
double sum;
for(int i = 0; i < n; i+=4) {
sum0 += *(A + i ) * *(B + i );
sum1 += *(A + i+1) * *(B + i+1);
sum2 += *(A + i+2) * *(B + i+2);
sum3 += *(A + i+3) * *(B + i+3);
}
sum = sum0+sum1+sum2+sum3;
return(sum);
}
inline double
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) ddot_asm
( int n,
const double* restrict A,
const double* restrict B)
{
double sum;
__asm__ __volatile__
(
"mov eax, %[A] \n\t"
"mov edx, %[B] \n\t"
"mov ecx, %[n] \n\t"
"pxor xmm0, xmm0 \n\t"
"pxor xmm1, xmm1 \n\t"
"pxor xmm2, xmm2 \n\t"
"pxor xmm3, xmm3 \n\t"
"movapd xmm6, XMMWORD PTR [eax+32] \n\t"
"movapd xmm7, XMMWORD PTR [eax+48] \n\t"
"mulpd xmm6, XMMWORD PTR [edx+32] \n\t"
"sar ecx, 7 \n\t"
"sub ecx, 1 \n\t" // peel
"L%=: \n\t"
KERNEL(64 * 0)
KERNEL(64 * 1)
KERNEL(64 * 2)
KERNEL(64 * 3)
KERNEL(64 * 4)
KERNEL(64 * 5)
KERNEL(64 * 6)
KERNEL(64 * 7)
KERNEL(64 * 8)
KERNEL(64 * 9)
KERNEL(64 * 10)
KERNEL(64 * 11)
KERNEL(64 * 12)
KERNEL(64 * 13)
KERNEL(64 * 14)
KERNEL(64 * 15)
"lea eax, [eax+1024] \n\t"
"lea edx, [edx+1024] \n\t"
" \n\t"
"dec ecx \n\t"
"jnz L%= \n\t" // end loop
" \n\t"
KERNEL(64 * 0)
KERNEL(64 * 1)
KERNEL(64 * 2)
KERNEL(64 * 3)
KERNEL(64 * 4)
KERNEL(64 * 5)
KERNEL(64 * 6)
KERNEL(64 * 7)
KERNEL(64 * 8)
KERNEL(64 * 9)
KERNEL(64 * 10)
KERNEL(64 * 11)
KERNEL(64 * 12)
KERNEL(64 * 13)
KERNEL(64 * 14)
PEELED(64 * 15)
" \n\t"
"addpd xmm0, xmm1 \n\t" // summing result
"addpd xmm2, xmm3 \n\t"
"addpd xmm0, xmm2 \n\t" // cascading add
"movapd xmm1, xmm0 \n\t" // copy xmm0
"shufpd xmm1, xmm0, 0x03 \n\t" // shuffle
"addsd xmm0, xmm1 \n\t" // add low qword
"movsd %[sum], xmm0 \n\t" // mov low qw to sum
: // outputs
[sum] "=m" (sum)
: // inputs
[A] "m" (A),
[B] "m" (B),
[n] "m" (n)
: //register clobber
"memory",
"eax","ecx","edx","edi",
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
);
return(sum);
}
int main()
{
// timers
LARGE_INTEGER frequency, time1, time2;
double time3;
QueryPerformanceFrequency(&frequency);
// clock_t time1, time2;
double gflops;
int nmax = 4096;
int trials = 1e7;
double sum, residual;
FILE *f = fopen("soddot.txt","w+");
printf("%16s %16s %16s\n","N","Total Gflops/s","Residual");
fprintf(f,"%16s %16s %16s\n","N","Total Gflops/s","Residual");
for(int n = 256; n <= nmax; n += 128 ) {
double* A = NULL;
double* B = NULL;
A = _mm_malloc(n*sizeof(*A), 64); if (!A) {printf("A failed\n"); return(1);}
B = _mm_malloc(n*sizeof(*B), 64); if (!B) {printf("B failed\n"); return(1);}
srand(time(NULL));
// create arrays
for(int i = 0; i < n; ++i) {
*(A + i) = (double) rand()/RAND_MAX;
*(B + i) = (double) rand()/RAND_MAX;
}
// warmup
sum = ddot_asm(n,A,B);
QueryPerformanceCounter(&time1);
// time1 = clock();
for (int count = 0; count < trials; count++){
// sum = ddot_ref(n,A,B);
sum = ddot_asm(n,A,B);
}
QueryPerformanceCounter(&time2);
time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
// time3 = (double) (clock() - time1)/CLOCKS_PER_SEC;
gflops = (double) (2.0*n*trials)/time3/1.0e9;
residual = ddot_ref(n,A,B) - sum;
printf("%16d %16f %16e\n",n,gflops,residual);
fprintf(f,"%16d %16f %16e\n",n,gflops,residual);
_mm_free(A);
_mm_free(B);
}
fclose(f);
return(0); // successful completion
}
EDIT: explanation of assembly
A dot product is just a repeated sum of products of two numbers: sum += a[i]*b[i]. sum must be initialized to 0 before the first iteration. Vectorized, you can do 2 sums at a time, which must be summed at the end: [sum0 sum1] = [a[i] a[i+1]]*[b[i] b[i+1]], sum = sum0 + sum1. In (Intel) assembly, this is 3 steps (after the initialization):
pxor xmm0, xmm0 // accumulator [sum0 sum1] = [0 0]
movapd xmm1, XMMWORD PTR [eax] // load [a[i] a[i+1]] into xmm1
mulpd xmm1, XMMWORD PTR [edx] // xmm1 = xmm1 * [b[i] b[i+1]]
addpd xmm0, xmm1 // xmm0 = xmm0 + xmm1
At this point you have nothing special; the compiler can come up with this. You can usually get better performance by unrolling the code enough times to use all the xmm registers available to you (8 registers in 32-bit mode). So if you unroll it 4 times, that allows you to utilize all 8 registers, xmm0 through xmm7. You will have 4 accumulators and 4 registers for storing the results of movapd and addpd. Again, the compiler can come up with this.
The real thinking part is trying to come up with a way to pipeline the code, i.e., make each instruction in the group of MOV/MUL/ADD operate on different registers so that all 3 instructions execute at the same time (usually the case on most CPUs). That's how you beat the compiler. So you have to pattern the 4x unrolled code to do just that, which may require loading vectors ahead of time and peeling off the first or last iteration. This is what KERNEL(address) is. I made a macro of the 4x unrolled pipelined code for convenience, so that I can easily unroll it in multiples of 4 by just changing address. Each KERNEL computes 8 dot products.
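For illustration only, here is the same multiple-accumulator idea written with SSE2 intrinsics (a minimal sketch, not the pipelined asm kernel above; it assumes n is a multiple of 8 and 16-byte aligned inputs):
#include <emmintrin.h>
double ddot_unrolled(const double *A, const double *B, int n)
{
    // four independent accumulators so the addpd dependency chains don't serialize
    __m128d s0 = _mm_setzero_pd(), s1 = _mm_setzero_pd();
    __m128d s2 = _mm_setzero_pd(), s3 = _mm_setzero_pd();
    for (int i = 0; i < n; i += 8) {
        s0 = _mm_add_pd(s0, _mm_mul_pd(_mm_load_pd(A+i  ), _mm_load_pd(B+i  )));
        s1 = _mm_add_pd(s1, _mm_mul_pd(_mm_load_pd(A+i+2), _mm_load_pd(B+i+2)));
        s2 = _mm_add_pd(s2, _mm_mul_pd(_mm_load_pd(A+i+4), _mm_load_pd(B+i+4)));
        s3 = _mm_add_pd(s3, _mm_mul_pd(_mm_load_pd(A+i+6), _mm_load_pd(B+i+6)));
    }
    __m128d s = _mm_add_pd(_mm_add_pd(s0, s1), _mm_add_pd(s2, s3));
    double tmp[2];
    _mm_storeu_pd(tmp, s);     // horizontal sum of the final vector
    return tmp[0] + tmp[1];
}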
To answer your overall question you can't achieve peak performance with the dot product.
The problem is that your CPU can do only one 128-bit load per clock cycle, while the dot product needs two 128-bit loads per clock cycle to sustain one mulpd plus one addpd per cycle (4 flops/cycle), so the single load port limits you to roughly half of peak, which is consistent with your measurements.
But it's worse than that for large n. The answer to your second question is that the dot product is memory bound and not compute bound and so it cannot parallelize for large n with fast cores. This is explained better here why-vectorizing-the-loop-does-not-have-performance-improvement. This is a big problem with parallelization with fast cores. It took me a while to figure this out but it's very important to learn.
There are actually few basic algorithms that can fully benefit from parallelization on fast cores. In terms of BLAS algorithms it's only the Level-3 algorithms (O(n^3)) such as matrix multiplication that really benefit from parallelization. The situation is better on slow cores e.g. with GPUs and the Xeon Phi because the discrepancy between memory speed and core speed is much smaller.
If you want to find an algorithm which can get close to peak flops for small n try e.g. scalar * vector or the sum of scalar * vector. The first case should do one load, one mult, and one store every clock cycle and the second case one mult, one add, and one load every clock cycle.
I tested the following code on a Core 2 Duo P9600 @ 2.67GHz in Knoppix 7.3 32-bit. I get about 75% of the peak for the scalar product and 75% of the peak for the sum of the scalar product. The flops/cycle for the scalar product is 2 and for the sum of the scalar product it's 4.
Compiled with g++ -msse2 -O3 -fopenmp foo.cpp -ffast-math
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
#include <x86intrin.h>
void scalar_product(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
double k = 3.14159;
for(int i=0; i<n; i++) {
a[i] = k*a[i];
}
}
void scalar_product_SSE(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
__m128d k = _mm_set1_pd(3.14159);
for(int i=0; i<n; i+=8) {
__m128d t1 = _mm_load_pd(&a[i+0]);
_mm_store_pd(&a[i],_mm_mul_pd(k,t1));
__m128d t2 = _mm_load_pd(&a[i+2]);
_mm_store_pd(&a[i+2],_mm_mul_pd(k,t2));
__m128d t3 = _mm_load_pd(&a[i+4]);
_mm_store_pd(&a[i+4],_mm_mul_pd(k,t3));
__m128d t4 = _mm_load_pd(&a[i+6]);
_mm_store_pd(&a[i+6],_mm_mul_pd(k,t4));
}
}
double scalar_sum(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
double sum = 0.0;
double k = 3.14159;
for(int i=0; i<n; i++) {
sum += k*a[i];
}
return sum;
}
double scalar_sum_SSE(double * __restrict a, int n) {
a = (double*)__builtin_assume_aligned (a, 64);
__m128d sum1 = _mm_setzero_pd();
__m128d sum2 = _mm_setzero_pd();
__m128d sum3 = _mm_setzero_pd();
__m128d sum4 = _mm_setzero_pd();
__m128d k = _mm_set1_pd(3.14159);
for(int i=0; i<n; i+=8) {
__m128d t1 = _mm_load_pd(&a[i+0]);
sum1 = _mm_add_pd(_mm_mul_pd(k,t1),sum1);
__m128d t2 = _mm_load_pd(&a[i+2]);
sum2 = _mm_add_pd(_mm_mul_pd(k,t2),sum2);
__m128d t3 = _mm_load_pd(&a[i+4]);
sum3 = _mm_add_pd(_mm_mul_pd(k,t3),sum3);
__m128d t4 = _mm_load_pd(&a[i+6]);
sum4 = _mm_add_pd(_mm_mul_pd(k,t4),sum4);
}
double tmp[8];
_mm_storeu_pd(&tmp[0],sum1);
_mm_storeu_pd(&tmp[2],sum2);
_mm_storeu_pd(&tmp[4],sum3);
_mm_storeu_pd(&tmp[6],sum4);
double sum = 0;
for(int i=0; i<8; i++) sum+=tmp[i];
return sum;
}
int main() {
//_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
//_mm_setcsr(_mm_getcsr() | 0x8040);
double dtime, peak, flops, sum;
int repeat = 1<<18;
const int n = 2048;
double *a = (double*)_mm_malloc(sizeof(double)*n,64);
double *b = (double*)_mm_malloc(sizeof(double)*n,64);
for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;
dtime = omp_get_wtime();
for(int r=0; r<repeat; r++) {
scalar_product_SSE(a,n);
}
dtime = omp_get_wtime() - dtime;
peak = 2*2.67;
flops = 1.0*n/dtime*1E-9*repeat;
printf("time %f, %f, %f\n", dtime,flops, flops/peak);
//for(int i=0; i<n; i++) a[i] = 1.0*rand()/RAND_MAX;
//sum = 0.0;
dtime = omp_get_wtime();
for(int r=0; r<repeat; r++) {
scalar_sum_SSE(a,n);
}
dtime = omp_get_wtime() - dtime;
peak = 2*2*2.67;
flops = 2.0*n/dtime*1E-9*repeat;
printf("time %f, %f, %f\n", dtime,flops, flops/peak);
//printf("sum %f\n", sum);
}

Replicating BLAS matrix multiplication performance: Can I match it?

Background
If you have been following my posts, I am attempting to replicate the results found in Kazushige Goto's seminal paper on square matrix multiplication C = AB. My last post regarding this topic can be found here. In that version of my code, I follow the memory layering and packing strategy of Goto with an inner kernel computing 2x8 blocks of C using 128 bit SSE3 intrinsics. My CPU is i5-540M with hyperthreading off. Additional info about my hardware can be found in another post and is repeated below.
My Hardware
My CPU is an Intel i5 - 540M. You can find the relevant CPUID information on cpu-world.com. The microarchitecture is Nehalem (westmere), so it can theoretically compute 4 double precision flops per core per cycle. I will be using just one core (no OpenMP), so with hyperthreading off and 4-step Intel Turbo Boost, I should be seeing a peak of ( 2.533 Ghz + 4*0.133 Ghz ) * ( 4 DP flops/core/cycle ) * ( 1 core ) = 12.27 DP Gflops. For reference, with both cores running at peak, Intel Turbo Boost gives a 2-step speed up and I should get a theoretical peak of 22.4 DP Gflops.
My Software
Windows7 64 bit, but MinGW/GCC 32 bit due to restrictions on my computer.
What's new this time?
I compute 2x4 blocks of C. This gives better performance and is in line with what Goto says (half the registers should be used to compute C). I have tried many sizes: 1x8, 2x8, 2x4, 4x2, 2x2, 4x4.
My inner kernel is hand-coded x86 assembly, optimized to the best of my ability (it matches some of the kernels that Goto wrote), which gives a rather large performance boost over SIMD intrinsics. This code is unrolled 8 times inside the inner kernel (defined as a macro for convenience), which gave the best performance of the unrolling strategies I tried.
I use the Windows performance counters to time the codes, rather than clock().
I time the inner kernel independently of the total code, to see how well my hand-coded assembly is doing.
I report the best result from some number of trials, rather than an average over the trials.
No more OpenMP (single-core performance only).
NOTE I will recompile OpenBLAS tonight to use only 1 core so I can compare.
Some Preliminary Results
N is the dimension of the square matrices, Total Gflops/s is the Gflops/s of the entire code, and Kernel Gflops/s is the Gflops/s of the inner kernel. You can see that with a 12.26 Gflops/s peak on one core, the inner kernel is getting around 75% efficiency, while the overall code is about 60% efficient.
I would like to get closer to 95% efficiency for the kernel and 80% for the overall code. What else can I do to improve the performance of at least the inner kernel?
N Total Gflops/s Kernel Gflops/s
256 7.778089 9.756284
512 7.308523 9.462700
768 7.223283 9.253639
1024 7.197375 9.132235
1280 7.142538 8.974122
1536 7.114665 8.967249
1792 7.060789 8.861958
Source Code
If you are feeling particularly magnanimous, please test my code on your machine. Compiled with gcc -std=c99 -O2 -m32 -msse3 -mincoming-stack-boundary=2 -masm=intel somatmul2.c -o somatmul2.exe. Feel free to try other flags, but I have found these to work best on my machine.
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <xmmintrin.h>
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <windows.h>
#ifndef min
#define min( a, b ) ( ((a) < (b)) ? (a) : (b) )
#endif
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
double tmp[mc*kc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < mc; ++i)
for (int j = 0; j < kc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
//const int inc_dst = mr*kc;
for (int k = 0; k < mc; k+=mr)
for (int j = 0; j < kc; ++j)
for (int i = 0; i < mr*kc; i+=kc)
*dst++ = *(ptr + k*kc + j + i);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{ //nc cols, kc rows, nr size of register strips
double tmp[kc*nc] __attribute__ ((aligned(64)));
double* restrict ptr = &tmp[0];
for (int i = 0; i < kc; ++i)
for (int j = 0; j < nc; j+=4) {
*ptr++ = *(src + i*n + j );
*ptr++ = *(src + i*n + j+1);
*ptr++ = *(src + i*n + j+2);
*ptr++ = *(src + i*n + j+3);
}
ptr = &tmp[0];
// const int inc_k = nc/nr;
for (int k = 0; k < nc; k+=nr)
for (int j = 0; j < kc*nc; j+=nc)
for (int i = 0; i < nr; ++i)
*dst++ = *(ptr + k + i + j);
}
#define KERNEL0(add0,add1,add2,add3) \
"mulpd xmm4, xmm6 \n\t" \
"addpd xmm0, xmm4 \n\t" \
"movapd xmm4, XMMWORD PTR [edx+"#add2"] \n\t" \
"mulpd xmm7, xmm4 \n\t" \
"addpd xmm1, xmm7 \n\t" \
"movddup xmm5, QWORD PTR [eax+"#add0"] \n\t" \
"mulpd xmm6, xmm5 \n\t" \
"addpd xmm2, xmm6 \n\t" \
"movddup xmm7, QWORD PTR [eax+"#add1"] \n\t" \
"mulpd xmm4, xmm5 \n\t" \
"movapd xmm6, XMMWORD PTR [edx+"#add3"] \n\t" \
"addpd xmm3, xmm4 \n\t" \
"movapd xmm4, xmm7 \n\t" \
" \n\t"
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) dgemm_2x4_asm_j
(
const int mc, const int nc, const int kc,
const double* restrict locA, const int cs_a, // mr
const double* restrict locB, const int rs_b, // nr
double* restrict C, const int rs_c
)
{
const double* restrict a1 = locA;
for (int i = 0; i < mc ; i+=cs_a) {
const double* restrict b1 = locB;
double* restrict c11 = C + i*rs_c;
for (int j = 0; j < nc ; j+=rs_b) {
__asm__ __volatile__
(
"mov eax, %[a1] \n\t"
"mov edx, %[b1] \n\t"
"mov edi, %[c11] \n\t"
"mov ecx, %[kc] \n\t"
"pxor xmm0, xmm0 \n\t"
"movddup xmm7, QWORD PTR [eax] \n\t" // a1
"pxor xmm1, xmm1 \n\t"
"movapd xmm6, XMMWORD PTR [edx] \n\t" // b1
"pxor xmm2, xmm2 \n\t"
"movapd xmm4, xmm7 \n\t" // a1
"pxor xmm3, xmm3 \n\t"
"sar ecx, 3 \n\t" // divide by 2^num
"L%=: \n\t" // start loop
KERNEL0( 8, 16, 16, 32)
KERNEL0( 24, 32, 48, 64)
KERNEL0( 40, 48, 80, 96)
KERNEL0( 56, 64, 112, 128)
KERNEL0( 72, 80, 144, 160)
KERNEL0( 88, 96, 176, 192)
KERNEL0( 104, 112, 208, 224)
KERNEL0( 120, 128, 240, 256)
"add eax, 128 \n\t"
"add edx, 256 \n\t"
" \n\t"
"dec ecx \n\t"
"jne L%= \n\t" // end loop
" \n\t"
"mov esi, %[rs_c] \n\t" // don't need cs_a anymore
"sal esi, 3 \n\t" // times 8
"lea ebx, [edi+esi] \n\t" // don't need b1 anymore
"addpd xmm0, XMMWORD PTR [edi] \n\t" // c11
"addpd xmm1, XMMWORD PTR [edi+16] \n\t" // c11 + 2
"addpd xmm2, XMMWORD PTR [ebx] \n\t" // c11
"addpd xmm3, XMMWORD PTR [ebx+16] \n\t" // c11 + 2
"movapd XMMWORD PTR [edi], xmm0 \n\t"
"movapd XMMWORD PTR [edi+16], xmm1 \n\t"
"movapd XMMWORD PTR [ebx], xmm2 \n\t"
"movapd XMMWORD PTR [ebx+16], xmm3 \n\t"
: // no outputs
: // inputs
[kc] "m" (kc),
[a1] "m" (a1),
[cs_a] "m" (cs_a),
[b1] "m" (b1),
[rs_b] "m" (rs_b),
[c11] "m" (c11),
[rs_c] "m" (rs_c)
: //register clobber
"memory",
"eax","ebx","ecx","edx","esi","edi",
"xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"
);
b1 += rs_b*kc;
c11 += rs_b;
}
a1 += cs_a*kc;
}
}
double blis_dgemm_ref(
const int n,
const double* restrict A,
const double* restrict B,
double* restrict C,
const int mc,
const int nc,
const int kc
)
{
int mr = 2;
int nr = 4;
double locA[mc*kc] __attribute__ ((aligned(64)));
double locB[kc*nc] __attribute__ ((aligned(64)));
LARGE_INTEGER frequency, time1, time2;
double time3 = 0.0;
QueryPerformanceFrequency(&frequency);
// zero C
memset(C, 0, n*n*sizeof(double));
int ii,jj,kk;
//#pragma omp parallel num_threads(2) shared(A,B,C) private(ii,jj,kk,locA,locB)
{//use all threads in parallel
//#pragma omp for
for ( jj = 0; jj < n; jj+=nc) {
for ( kk = 0; kk < n; kk+=kc) {
cpack(locB, B + kk*n + jj, nc, kc, nr, n);
for ( ii = 0; ii < n; ii+=mc) {
rpack(locA, A + ii*n + kk, kc, mc, mr, n);
QueryPerformanceCounter(&time1);
dgemm_2x4_asm_j( mc, nc, kc,
locA , mr,
locB , nr,
C + ii*n + jj, n );
QueryPerformanceCounter(&time2);
time3 += (double) (time2.QuadPart - time1.QuadPart);
}
}
}
}
return time3 / frequency.QuadPart;
}
double compute_gflops(const double time, const int n)
{
// computes the gigaflops for a square matrix-matrix multiplication
double gflops;
gflops = (double) (2.0*n*n*n)/time/1.0e9;
return(gflops);
}
void main() {
LARGE_INTEGER frequency, time1, time2;
double time3, best_time;
double kernel_time, best_kernel_time;
QueryPerformanceFrequency(&frequency);
int best_flag;
double gflops, kernel_gflops;
const int trials = 100;
int nmax = 4096;
printf("%16s %16s %16s\n","N","Total Gflops/s","Kernel Gflops/s");
int mc = 256;
int kc = 256;
int nc = 128;
for (int n = kc; n <= nmax; n+=kc) {
double *A = NULL;
double *B = NULL;
double *C = NULL;
A = _mm_malloc (n*n * sizeof(*A),64); if (!A) {printf("A failed\n"); return;}
B = _mm_malloc (n*n * sizeof(*B),64); if (!B) {printf("B failed\n"); return;}
C = _mm_malloc (n*n * sizeof(*C),64); if (!C) {printf("C failed\n"); return;}
srand(time(NULL));
// Create the matrices
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
*(A + i*n + j) = (double) rand()/RAND_MAX;
*(B + i*n + j) = (double) rand()/RAND_MAX;
}
}
// warmup
blis_dgemm_ref(n,A,B,C,mc,nc,kc);
for (int count = 0; count < trials; count++){
QueryPerformanceCounter(&time1);
kernel_time = blis_dgemm_ref(n,A,B,C,mc,nc,kc);
QueryPerformanceCounter(&time2);
time3 = (double)(time2.QuadPart - time1.QuadPart) / frequency.QuadPart;
if (count == 0) {
best_time = time3;
best_kernel_time = kernel_time;
}
else {
best_flag = ( time3 < best_time ? 1 : 0 );
if (best_flag) {
best_time = time3;
best_kernel_time = kernel_time;
}
}
}
gflops = compute_gflops(best_time, n);
kernel_gflops = compute_gflops(best_kernel_time, n);
printf("%16d %16f %16f\n",n,gflops,kernel_gflops);
_mm_free(A);
_mm_free(B);
_mm_free(C);
}
printf("tests are done\n");
}
EDIT
Replace the packing functions with the following new and faster versions:
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) rpack( double* restrict dst,
const double* restrict src,
const int kc, const int mc, const int mr, const int n)
{
for (int i = 0; i < mc/mr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < mr; ++k)
*dst++ = *(src + i*n*mr + k*n + j);
}
inline void
__attribute__ ((gnu_inline))
__attribute__ ((aligned(64))) cpack(double* restrict dst,
const double* restrict src,
const int nc,
const int kc,
const int nr,
const int n)
{
for (int i = 0; i < nc/nr; ++i)
for (int j = 0; j < kc; ++j)
for (int k = 0; k < nr; ++k)
*dst++ = *(src + i*nr + j*n + k);
}
Results With New Packing Functions
Nice boost to the overall performance:
N Total Gflops/s Kernel Gflops/s
256 7.915617 8.794849
512 8.466467 9.350920
768 8.354890 9.135575
1024 8.168944 8.884611
1280 8.174249 8.825920
1536 8.285458 8.938712
1792 7.988038 8.581001
LINPACK 32-bit with 1 thread
CPU frequency: 2.792 GHz
Number of CPUs: 1
Number of cores: 2
Number of threads: 1
Performance Summary (GFlops)
Size LDA Align. Average Maximal
128 128 4 4.7488 5.0094
256 256 4 6.0747 6.9652
384 384 4 6.5208 7.2767
512 512 4 6.8329 7.5706
640 640 4 7.4278 7.8835
768 768 4 7.7622 8.0677
896 896 4 7.8860 8.4737
1024 1024 4 7.7292 8.1076
1152 1152 4 8.0411 8.4738
1280 1280 4 8.1429 8.4863
1408 1408 4 8.2284 8.7073
1536 1536 4 8.3753 8.6437
1664 1664 4 8.6993 8.9108
1792 1792 4 8.7576 8.9176
1920 1920 4 8.7945 9.0678
2048 2048 4 8.5490 8.8827
2176 2176 4 9.0138 9.1161
2304 2304 4 8.1402 9.1446
2432 2432 4 9.0003 9.2082
2560 2560 4 8.8560 9.1197
2688 2688 4 9.1008 9.3144
2816 2816 4 9.0876 9.3089
2944 2944 4 9.0771 9.4191
3072 3072 4 8.9402 9.2920
3200 3200 4 9.2259 9.3699
3328 3328 4 9.1224 9.3821
3456 3456 4 9.1354 9.4082
3584 3584 4 9.0489 9.3351
3712 3712 4 9.3093 9.5108
3840 3840 4 9.3307 9.5324
3968 3968 4 9.3895 9.5352
4096 4096 4 9.3269 9.3872
EDIT 2
Here are some results from single-threaded OpenBLAS, which took 4 hours to compile last night. As you can see, it is getting close to 95% CPU usage. Max single-threaded performance with both cores on is 11.2 Gflops (2-step Intel Turbo Boost). I need to turn off the other core to get 12.26 Gflops (4-step Intel Turbo Boost). Assume that the packing functions in OpenBLAS generate no additional overhead. Then the OpenBLAS kernel must be running at least as fast as the total OpenBLAS code. So I need to get my kernel running at that speed. I have yet to figure out how to make my assembly faster. I will be focusing on this over the next few days.
Ran the tests below from Windows command line with: start /realtime /affinity 1
My Code:
N Total Gflops/s Kernel Gflops/s
256 7.927740 8.832366
512 8.427591 9.347094
768 8.547722 9.352993
1024 8.597336 9.351794
1280 8.588663 9.296724
1536 8.589808 9.271710
1792 8.634201 9.306406
2048 8.527889 9.235653
OpenBLAS:
N Total Gflops/s
256 10.599065
512 10.622686
768 10.745133
1024 10.762757
1280 10.832540
1536 10.793132
1792 10.848356
2048 10.819986
It's theoretically possible to look at that code and reason through whether it could be arranged to make better use of microarchitectural resources - but even the performance architects at Intel might not recommend doing it that way. It might help to use a tool like VTune or Intel Performance Counter Monitor to find out how much of your workload is memory versus front-end versus back-end bound. Intel Architecture Code Analyzer might also be a quick source of help narrowing down which of the potential issues listed below to follow up on first.
Nominal Animal is probably on the right track in the comments talking about interleaving instructions that access memory and those that do computation. A few other possibilities:
Using other instructions for some of the computation might reduce pressure on one of the execution ports (see section 3.3.4 of this presentation). In particular, mulpd is always going to dispatch to port 1 on Westmere. Maybe if there are any cycles where port 0 isn't getting used, you could sneak in a scalar FP multiply there.
One or another of the hardware prefetchers could be saturating the bus early or polluting the cache with lines you don't end up using.
On the other hand, there's a slim possibility that the ordering of memory references or the memory layout implied in dgemm_2x4_asm_j is faking out the prefetchers.
Changing the relative ordering of pairs of instructions that don't have any data dependencies might lead to better use of front-end or back-end resources.
