I found this post that explains how to transpose an 8x8 byte matrix with 24 operations, and a few scrolls later there's the code that implements the transpose. However, this method does not exploit the fact that we can block the 8x8 transpose into four 4x4 transposes, each of which can be done with a single shuffle instruction (this post is the reference). So I came up with this solution:
__m128i transpose4x4mask = _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
__m128i shuffle8x8Mask = _mm_setr_epi8(0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15);
void TransposeBlock8x8(uint8_t *src, uint8_t *dst, int srcStride, int dstStride) {
__m128i load0 = _mm_set_epi64x(*(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride));
__m128i load1 = _mm_set_epi64x(*(uint64_t*)(src + 3 * srcStride), *(uint64_t*)(src + 2 * srcStride));
__m128i load2 = _mm_set_epi64x(*(uint64_t*)(src + 5 * srcStride), *(uint64_t*)(src + 4 * srcStride));
__m128i load3 = _mm_set_epi64x(*(uint64_t*)(src + 7 * srcStride), *(uint64_t*)(src + 6 * srcStride));
__m128i shuffle0 = _mm_shuffle_epi8(load0, shuffle8x8Mask);
__m128i shuffle1 = _mm_shuffle_epi8(load1, shuffle8x8Mask);
__m128i shuffle2 = _mm_shuffle_epi8(load2, shuffle8x8Mask);
__m128i shuffle3 = _mm_shuffle_epi8(load3, shuffle8x8Mask);
__m128i block0 = _mm_unpacklo_epi64(shuffle0, shuffle1);
__m128i block1 = _mm_unpackhi_epi64(shuffle0, shuffle1);
__m128i block2 = _mm_unpacklo_epi64(shuffle2, shuffle3);
__m128i block3 = _mm_unpackhi_epi64(shuffle2, shuffle3);
__m128i transposed0 = _mm_shuffle_epi8(block0, transpose4x4mask);
__m128i transposed1 = _mm_shuffle_epi8(block1, transpose4x4mask);
__m128i transposed2 = _mm_shuffle_epi8(block2, transpose4x4mask);
__m128i transposed3 = _mm_shuffle_epi8(block3, transpose4x4mask);
__m128i store0 = _mm_unpacklo_epi32(transposed0, transposed2);
__m128i store1 = _mm_unpackhi_epi32(transposed0, transposed2);
__m128i store2 = _mm_unpacklo_epi32(transposed1, transposed3);
__m128i store3 = _mm_unpackhi_epi32(transposed1, transposed3);
*((uint64_t*)(dst + 0 * dstStride)) = _mm_extract_epi64(store0, 0);
*((uint64_t*)(dst + 1 * dstStride)) = _mm_extract_epi64(store0, 1);
*((uint64_t*)(dst + 2 * dstStride)) = _mm_extract_epi64(store1, 0);
*((uint64_t*)(dst + 3 * dstStride)) = _mm_extract_epi64(store1, 1);
*((uint64_t*)(dst + 4 * dstStride)) = _mm_extract_epi64(store2, 0);
*((uint64_t*)(dst + 5 * dstStride)) = _mm_extract_epi64(store2, 1);
*((uint64_t*)(dst + 6 * dstStride)) = _mm_extract_epi64(store3, 0);
*((uint64_t*)(dst + 7 * dstStride)) = _mm_extract_epi64(store3, 1);
}
Excluding load/store operations this procedure consists of only 16 instructions instead of 24.
What am I missing?
Apart from the loads, stores and pinsrq-s to read from and write to memory, with possibly a stride not equal to 8 bytes,
you can do the transpose with only 12 instructions (this code can easily be used in combination with Z boson's test code):
void tran8x8b_SSE_v2(char *A, char *B) {
__m128i pshufbcnst = _mm_set_epi8(15,11,7,3, 14,10,6,2, 13,9,5,1, 12,8,4,0);
__m128i B0, B1, B2, B3, T0, T1, T2, T3;
B0 = _mm_loadu_si128((__m128i*)&A[ 0]);
B1 = _mm_loadu_si128((__m128i*)&A[16]);
B2 = _mm_loadu_si128((__m128i*)&A[32]);
B3 = _mm_loadu_si128((__m128i*)&A[48]);
T0 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b10001000));
T1 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b10001000));
T2 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B0),_mm_castsi128_ps(B1),0b11011101));
T3 = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(B2),_mm_castsi128_ps(B3),0b11011101));
B0 = _mm_shuffle_epi8(T0,pshufbcnst);
B1 = _mm_shuffle_epi8(T1,pshufbcnst);
B2 = _mm_shuffle_epi8(T2,pshufbcnst);
B3 = _mm_shuffle_epi8(T3,pshufbcnst);
T0 = _mm_unpacklo_epi32(B0,B1);
T1 = _mm_unpackhi_epi32(B0,B1);
T2 = _mm_unpacklo_epi32(B2,B3);
T3 = _mm_unpackhi_epi32(B2,B3);
_mm_storeu_si128((__m128i*)&B[ 0], T0);
_mm_storeu_si128((__m128i*)&B[16], T1);
_mm_storeu_si128((__m128i*)&B[32], T2);
_mm_storeu_si128((__m128i*)&B[48], T3);
}
Here we use the 32-bit floating point shuffle, which is more flexible than the epi32 shuffle.
The casts do not generate extra instructions (code generated with gcc 5.4):
tran8x8b_SSE_v2:
.LFB4885:
.cfi_startproc
vmovdqu 48(%rdi), %xmm5
vmovdqu 32(%rdi), %xmm2
vmovdqu 16(%rdi), %xmm0
vmovdqu (%rdi), %xmm1
vshufps $136, %xmm5, %xmm2, %xmm4
vshufps $221, %xmm5, %xmm2, %xmm2
vmovdqa .LC6(%rip), %xmm5
vshufps $136, %xmm0, %xmm1, %xmm3
vshufps $221, %xmm0, %xmm1, %xmm1
vpshufb %xmm5, %xmm3, %xmm3
vpshufb %xmm5, %xmm1, %xmm0
vpshufb %xmm5, %xmm4, %xmm4
vpshufb %xmm5, %xmm2, %xmm1
vpunpckldq %xmm4, %xmm3, %xmm5
vpunpckldq %xmm1, %xmm0, %xmm2
vpunpckhdq %xmm4, %xmm3, %xmm3
vpunpckhdq %xmm1, %xmm0, %xmm0
vmovups %xmm5, (%rsi)
vmovups %xmm3, 16(%rsi)
vmovups %xmm2, 32(%rsi)
vmovups %xmm0, 48(%rsi)
ret
.cfi_endproc
On some, but not all, older cpus there might be a small bypass delay (between 0 and 2 cycles) for moving data between the
integer and the floating point units. This increases the latency of the function, but it does not necessarily affect the
throughput of the code.
A simple latency test with 1e9 transpositions:
for (int i=0;i<500000000;i++){
tran8x8b_SSE(A,C);
tran8x8b_SSE(C,A);
}
print8x8b(A);
This takes about 5.5 seconds (19.7e9 cycles) with tran8x8b_SSE and 4.5 seconds (16.0e9 cycles) with tran8x8b_SSE_v2 (Intel Core i5-6500). Note that
the loads and stores were not eliminated by the compiler, although the functions were inlined into the for loop.
Update: AVX2-128 / SSE 4.1 solution with blends.
The 'shuffles' (unpack, shuffle) are handled by port 5, with 1 instruction per cpu cycle on modern cpus.
Sometimes it pays off to replace one 'shuffle' with two blends. On Skylake the 32 bit blend instructions can run on either port 0, 1 or 5.
Unfortunately, _mm_blend_epi32 is only AVX2-128. An efficient SSE 4.1 alternative is _mm_blend_ps in combination
with a few casts (which are usually free). The 12 'shuffles' are replaced by
8 shuffles in combination with 8 blends.
The simple latency test now runs in about 3.6 seconds (13e9 CPU cycles), which is 18% faster than the results with tran8x8b_SSE_v2.
Code:
/* AVX2-128 version, sse 4.1 version see ----------------> SSE 4.1 version of tran8x8b_AVX2_128() */
void tran8x8b_AVX2_128(char *A, char *B) { /* void tran8x8b_SSE4_1(char *A, char *B) { */
__m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3,
13, 5, 9, 1, 14, 6,10, 2, 12, 4, 8, 0); /* __m128i pshufbcnst_0 = _mm_set_epi8(15, 7,11, 3, 13, 5, 9, 1, 14, 6,10, 2, 12, 4, 8, 0); */
__m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1,
15, 7,11, 3, 12, 4, 8, 0, 14, 6,10, 2); /* __m128i pshufbcnst_1 = _mm_set_epi8(13, 5, 9, 1, 15, 7,11, 3, 12, 4, 8, 0, 14, 6,10, 2); */
__m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7,
9, 1,13, 5, 10, 2,14, 6, 8, 0,12, 4); /* __m128i pshufbcnst_2 = _mm_set_epi8(11, 3,15, 7, 9, 1,13, 5, 10, 2,14, 6, 8, 0,12, 4); */
__m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5,
11, 3,15, 7, 8, 0,12, 4, 10, 2,14, 6); /* __m128i pshufbcnst_3 = _mm_set_epi8( 9, 1,13, 5, 11, 3,15, 7, 8, 0,12, 4, 10, 2,14, 6); */
__m128i B0, B1, B2, B3, T0, T1, T2, T3; /* __m128 B0, B1, B2, B3, T0, T1, T2, T3; */
/* */
B0 = _mm_loadu_si128((__m128i*)&A[ 0]); /* B0 = _mm_loadu_ps((float*)&A[ 0]); */
B1 = _mm_loadu_si128((__m128i*)&A[16]); /* B1 = _mm_loadu_ps((float*)&A[16]); */
B2 = _mm_loadu_si128((__m128i*)&A[32]); /* B2 = _mm_loadu_ps((float*)&A[32]); */
B3 = _mm_loadu_si128((__m128i*)&A[48]); /* B3 = _mm_loadu_ps((float*)&A[48]); */
/* */
B1 = _mm_shuffle_epi32(B1,0b10110001); /* B1 = _mm_shuffle_ps(B1,B1,0b10110001); */
B3 = _mm_shuffle_epi32(B3,0b10110001); /* B3 = _mm_shuffle_ps(B3,B3,0b10110001); */
T0 = _mm_blend_epi32(B0,B1,0b1010); /* T0 = _mm_blend_ps(B0,B1,0b1010); */
T1 = _mm_blend_epi32(B2,B3,0b1010); /* T1 = _mm_blend_ps(B2,B3,0b1010); */
T2 = _mm_blend_epi32(B0,B1,0b0101); /* T2 = _mm_blend_ps(B0,B1,0b0101); */
T3 = _mm_blend_epi32(B2,B3,0b0101); /* T3 = _mm_blend_ps(B2,B3,0b0101); */
/* */
B0 = _mm_shuffle_epi8(T0,pshufbcnst_0); /* B0 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T0),pshufbcnst_0)); */
B1 = _mm_shuffle_epi8(T1,pshufbcnst_1); /* B1 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T1),pshufbcnst_1)); */
B2 = _mm_shuffle_epi8(T2,pshufbcnst_2); /* B2 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T2),pshufbcnst_2)); */
B3 = _mm_shuffle_epi8(T3,pshufbcnst_3); /* B3 = _mm_castsi128_ps(_mm_shuffle_epi8(_mm_castps_si128(T3),pshufbcnst_3)); */
/* */
T0 = _mm_blend_epi32(B0,B1,0b1010); /* T0 = _mm_blend_ps(B0,B1,0b1010); */
T1 = _mm_blend_epi32(B0,B1,0b0101); /* T1 = _mm_blend_ps(B0,B1,0b0101); */
T2 = _mm_blend_epi32(B2,B3,0b1010); /* T2 = _mm_blend_ps(B2,B3,0b1010); */
T3 = _mm_blend_epi32(B2,B3,0b0101); /* T3 = _mm_blend_ps(B2,B3,0b0101); */
T1 = _mm_shuffle_epi32(T1,0b10110001); /* T1 = _mm_shuffle_ps(T1,T1,0b10110001); */
T3 = _mm_shuffle_epi32(T3,0b10110001); /* T3 = _mm_shuffle_ps(T3,T3,0b10110001); */
/* */
_mm_storeu_si128((__m128i*)&B[ 0], T0); /* _mm_storeu_ps((float*)&B[ 0], T0); */
_mm_storeu_si128((__m128i*)&B[16], T1); /* _mm_storeu_ps((float*)&B[16], T1); */
_mm_storeu_si128((__m128i*)&B[32], T2); /* _mm_storeu_ps((float*)&B[32], T2); */
_mm_storeu_si128((__m128i*)&B[48], T3); /* _mm_storeu_ps((float*)&B[48], T3); */
} /* } */
Posting this as an answer. I'm also going to change the title of the question from "... with SSE" to "... with SIMD" due to some answers and comments received so far.
I succeeded in transposing the matrix with AVX2 using only 8 instructions, 10 including load/store (excluding mask loads). EDIT: I found a shorter version. See below. This is the case where the matrices are all contiguous in memory, so direct loads/stores can be used.
Here's the C code:
void tran8x8b_AVX2(char *src, char *dst) {
__m256i perm = _mm256_set_epi8(
0, 0, 0, 7,
0, 0, 0, 5,
0, 0, 0, 3,
0, 0, 0, 1,
0, 0, 0, 6,
0, 0, 0, 4,
0, 0, 0, 2,
0, 0, 0, 0
);
__m256i tm = _mm256_set_epi8(
15, 11, 7, 3,
14, 10, 6, 2,
13, 9, 5, 1,
12, 8, 4, 0,
15, 11, 7, 3,
14, 10, 6, 2,
13, 9, 5, 1,
12, 8, 4, 0
);
__m256i load0 = _mm256_loadu_si256((__m256i*)&src[ 0]);
__m256i load1 = _mm256_loadu_si256((__m256i*)&src[32]);
__m256i perm0 = _mm256_permutevar8x32_epi32(load0, perm);
__m256i perm1 = _mm256_permutevar8x32_epi32(load1, perm);
__m256i transpose0 = _mm256_shuffle_epi8(perm0, tm);
__m256i transpose1 = _mm256_shuffle_epi8(perm1, tm);
__m256i unpack0 = _mm256_unpacklo_epi32(transpose0, transpose1);
__m256i unpack1 = _mm256_unpackhi_epi32(transpose0, transpose1);
perm0 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(unpack0), _mm256_castsi256_ps(unpack1), 32));
perm1 = _mm256_castps_si256(_mm256_permute2f128_ps(_mm256_castsi256_ps(unpack0), _mm256_castsi256_ps(unpack1), 49));
_mm256_storeu_si256((__m256i*)&dst[ 0], perm0);
_mm256_storeu_si256((__m256i*)&dst[32], perm1);
}
GCC was smart enough to perform the permutation directly on the memory operand during the AVX load, saving two instructions. Here's the compiler output:
tran8x8b_AVX2(char*, char*):
vmovdqa ymm1, YMMWORD PTR .LC0[rip]
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpermd ymm0, ymm1, YMMWORD PTR [rdi]
vpermd ymm1, ymm1, YMMWORD PTR [rdi+32]
vpshufb ymm0, ymm0, ymm2
vpshufb ymm1, ymm1, ymm2
vpunpckldq ymm2, ymm0, ymm1
vpunpckhdq ymm0, ymm0, ymm1
vinsertf128 ymm1, ymm2, xmm0, 1
vperm2f128 ymm0, ymm2, ymm0, 49
vmovdqu YMMWORD PTR [rsi], ymm1
vmovdqu YMMWORD PTR [rsi+32], ymm0
vzeroupper
ret
It emitted the vzeroupper instruction with -O3, but going down to -O1 removes it.
In the case of my original problem (a large matrix where I'm zooming in on an 8x8 part of it), handling strides makes the generated output quite a bit worse:
void tran8x8b_AVX2(char *src, char *dst, int srcStride, int dstStride) {
__m256i load0 = _mm256_set_epi64x(*(uint64_t*)(src + 3 * srcStride), *(uint64_t*)(src + 2 * srcStride), *(uint64_t*)(src + 1 * srcStride), *(uint64_t*)(src + 0 * srcStride));
__m256i load1 = _mm256_set_epi64x(*(uint64_t*)(src + 7 * srcStride), *(uint64_t*)(src + 6 * srcStride), *(uint64_t*)(src + 5 * srcStride), *(uint64_t*)(src + 4 * srcStride));
// ... the same as before, however we can skip the final permutations because we need to handle the destination stride...
*((uint64_t*)(dst + 0 * dstStride)) = _mm256_extract_epi64(unpack0, 0);
*((uint64_t*)(dst + 1 * dstStride)) = _mm256_extract_epi64(unpack0, 1);
*((uint64_t*)(dst + 2 * dstStride)) = _mm256_extract_epi64(unpack1, 0);
*((uint64_t*)(dst + 3 * dstStride)) = _mm256_extract_epi64(unpack1, 1);
*((uint64_t*)(dst + 4 * dstStride)) = _mm256_extract_epi64(unpack0, 2);
*((uint64_t*)(dst + 5 * dstStride)) = _mm256_extract_epi64(unpack0, 3);
*((uint64_t*)(dst + 6 * dstStride)) = _mm256_extract_epi64(unpack1, 2);
*((uint64_t*)(dst + 7 * dstStride)) = _mm256_extract_epi64(unpack1, 3);
}
Here's the compiler output:
tran8x8b_AVX2(char*, char*, int, int):
movsx rdx, edx
vmovq xmm5, QWORD PTR [rdi]
lea r9, [rdi+rdx]
vmovdqa ymm3, YMMWORD PTR .LC0[rip]
movsx rcx, ecx
lea r11, [r9+rdx]
vpinsrq xmm0, xmm5, QWORD PTR [r9], 1
lea r10, [r11+rdx]
vmovq xmm4, QWORD PTR [r11]
vpinsrq xmm1, xmm4, QWORD PTR [r10], 1
lea r8, [r10+rdx]
lea rax, [r8+rdx]
vmovq xmm7, QWORD PTR [r8]
vmovq xmm6, QWORD PTR [rax+rdx]
vpinsrq xmm2, xmm7, QWORD PTR [rax], 1
vinserti128 ymm1, ymm0, xmm1, 0x1
vpinsrq xmm0, xmm6, QWORD PTR [rax+rdx*2], 1
lea rax, [rsi+rcx]
vpermd ymm1, ymm3, ymm1
vinserti128 ymm0, ymm2, xmm0, 0x1
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpshufb ymm1, ymm1, ymm2
vpermd ymm0, ymm3, ymm0
vpshufb ymm0, ymm0, ymm2
vpunpckldq ymm2, ymm1, ymm0
vpunpckhdq ymm0, ymm1, ymm0
vmovdqa xmm1, xmm2
vmovq QWORD PTR [rsi], xmm1
vpextrq QWORD PTR [rax], xmm1, 1
vmovdqa xmm1, xmm0
add rax, rcx
vextracti128 xmm0, ymm0, 0x1
vmovq QWORD PTR [rax], xmm1
add rax, rcx
vpextrq QWORD PTR [rax], xmm1, 1
add rax, rcx
vextracti128 xmm1, ymm2, 0x1
vmovq QWORD PTR [rax], xmm1
add rax, rcx
vpextrq QWORD PTR [rax], xmm1, 1
vmovq QWORD PTR [rax+rcx], xmm0
vpextrq QWORD PTR [rax+rcx*2], xmm0, 1
vzeroupper
ret
However, this doesn't seem like a big deal compared against the output of my original code.
EDIT: I found a shorter version. 4 instructions in total, 8 counting loads/stores. This is possible because I read the matrix in a different way, hiding some "shuffles" in the "gather" instruction during the load. Also, note that the final permutation is needed to perform the store because AVX2 doesn't have a "scatter" instruction. Having a scatter instruction would bring everything down to 2 instructions only. Also, note that I can handle the src stride without hassle by changing the content of the vindex vector (a sketch of that follows the compiler output below).
Unfortunately this AVX2 v2 version seems to be slower than the previous one. Here's the code:
void tran8x8b_AVX2_v2(char *src1, char *dst1) {
__m256i tm = _mm256_set_epi8(
15, 11, 7, 3,
14, 10, 6, 2,
13, 9, 5, 1,
12, 8, 4, 0,
15, 11, 7, 3,
14, 10, 6, 2,
13, 9, 5, 1,
12, 8, 4, 0
);
__m256i vindex = _mm256_setr_epi32(0, 8, 16, 24, 32, 40, 48, 56);
__m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
__m256i load0 = _mm256_i32gather_epi32((int*)src1, vindex, 1);
__m256i load1 = _mm256_i32gather_epi32((int*)(src1 + 4), vindex, 1);
__m256i transpose0 = _mm256_shuffle_epi8(load0, tm);
__m256i transpose1 = _mm256_shuffle_epi8(load1, tm);
__m256i final0 = _mm256_permutevar8x32_epi32(transpose0, perm);
__m256i final1 = _mm256_permutevar8x32_epi32(transpose1, perm);
_mm256_storeu_si256((__m256i*)&dst1[ 0], final0);
_mm256_storeu_si256((__m256i*)&dst1[32], final1);
}
And here's the output of the compiler:
tran8x8b_AVX2_v2(char*, char*):
vpcmpeqd ymm3, ymm3, ymm3
vmovdqa ymm2, YMMWORD PTR .LC0[rip]
vmovdqa ymm4, ymm3
vpgatherdd ymm0, DWORD PTR [rdi+4+ymm2*8], ymm3
vpgatherdd ymm1, DWORD PTR [rdi+ymm2*8], ymm4
vmovdqa ymm2, YMMWORD PTR .LC1[rip]
vpshufb ymm1, ymm1, ymm2
vpshufb ymm0, ymm0, ymm2
vmovdqa ymm2, YMMWORD PTR .LC2[rip]
vpermd ymm1, ymm2, ymm1
vpermd ymm0, ymm2, ymm0
vmovdqu YMMWORD PTR [rsi], ymm1
vmovdqu YMMWORD PTR [rsi+32], ymm0
vzeroupper
ret
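For reference, a sketch of the stride handling mentioned above: vindex just holds the byte offset of each row, so a strided source only needs different indices (the rows / srcStride names here are illustrative and not part of the function above; with srcStride == 8 this reduces to the constant used above):
__m256i rows   = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
__m256i vindex = _mm256_mullo_epi32(rows, _mm256_set1_epi32(srcStride)); // row i starts at i * srcStride
__m256i load0  = _mm256_i32gather_epi32((int*)src1,       vindex, 1);    // first 4 bytes of each row
__m256i load1  = _mm256_i32gather_epi32((int*)(src1 + 4), vindex, 1);    // last 4 bytes of each row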
A simplified one
void tp128_8x8(char *A, char *B) {
__m128i sv = _mm_set_epi8(15, 7, 14, 6, 13, 5, 12, 4, 11, 3, 10, 2, 9, 1, 8, 0);
__m128i iv[4], ov[4];
ov[0] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)A), sv);
ov[1] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+16)), sv);
ov[2] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+32)), sv);
ov[3] = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(A+48)), sv);
iv[0] = _mm_unpacklo_epi16(ov[0], ov[1]);
iv[1] = _mm_unpackhi_epi16(ov[0], ov[1]);
iv[2] = _mm_unpacklo_epi16(ov[2], ov[3]);
iv[3] = _mm_unpackhi_epi16(ov[2], ov[3]);
_mm_storeu_si128((__m128i*)B, _mm_unpacklo_epi32(iv[0], iv[2]));
_mm_storeu_si128((__m128i*)(B+16), _mm_unpackhi_epi32(iv[0], iv[2]));
_mm_storeu_si128((__m128i*)(B+32), _mm_unpacklo_epi32(iv[1], iv[3]));
_mm_storeu_si128((__m128i*)(B+48), _mm_unpackhi_epi32(iv[1], iv[3]));
}
Benchmark: i5-5300U 2.3GHz (cycles per byte)
tran8x8b : 2.140
tran8x8b_SSE : 1.602
tran8x8b_SSE_v2 : 1.551
tp128_8x8 : 1.535
tran8x8b_AVX2 : 1.563
tran8x8b_AVX2_v2 : 1.731
Normally when load and store instructions are not counted it's because the code is working with a matrix in register e.g. doing multiple operations in addition to the transpose in a loop. The loads and stores in this case are not counted because they are not part of the main loop.
But in your code the loads and stores (or rather sets and extracts) are doing part of the transpose.
GCC implements _mm_set_epi64x for SSE4.1 in your code with _mm_insert_epi64 and _mm_loadl_epi64. The insert instruction is doing part of the transpose i.e. the transpose starts at load0,1,2,3 not at shuffle0,1,2,3. And then your final store0,1,2,3 values don't contain the transpose either. You have to use eight _mm_extract_epi64 instructions to finish the transpose in memory. So it does not really make sense to not count the set and extract intrinsics.
In any case, it turns out you can do the transpose from registers with only 16 instructions using just SSSE3, like this:
//__m128i B0, __m128i B1, __m128i B2, __m128i B3
__m128i mask = _mm_setr_epi8(0x0,0x04,0x01,0x05, 0x02,0x06,0x03,0x07, 0x08,0x0c,0x09,0x0d, 0x0a,0x0e,0x0b,0x0f);
__m128i T0, T1, T2, T3;
T0 = _mm_unpacklo_epi8(B0,B1);
T1 = _mm_unpackhi_epi8(B0,B1);
T2 = _mm_unpacklo_epi8(B2,B3);
T3 = _mm_unpackhi_epi8(B2,B3);
B0 = _mm_unpacklo_epi16(T0,T2);
B1 = _mm_unpackhi_epi16(T0,T2);
B2 = _mm_unpacklo_epi16(T1,T3);
B3 = _mm_unpackhi_epi16(T1,T3);
T0 = _mm_unpacklo_epi32(B0,B2);
T1 = _mm_unpackhi_epi32(B0,B2);
T2 = _mm_unpacklo_epi32(B1,B3);
T3 = _mm_unpackhi_epi32(B1,B3);
B0 = _mm_shuffle_epi8(T0,mask);
B1 = _mm_shuffle_epi8(T1,mask);
B2 = _mm_shuffle_epi8(T2,mask);
B3 = _mm_shuffle_epi8(T3,mask);
I'm not sure if it makes sense to exclude the loads and stores here either, because I'm not sure how convenient it is to work with an 8x8 byte matrix in four 128-bit registers.
Here is code testing this:
#include <stdio.h>
#include <x86intrin.h>
void print8x8b(char *A) {
for(int i=0; i<8; i++) {
for(int j=0; j<8; j++) {
printf("%2d ", A[i*8+j]);
} puts("");
} puts("");
}
void tran8x8b(char *A, char *B) {
for(int i=0; i<8; i++) {
for(int j=0; j<8; j++) {
B[j*8+i] = A[i*8+j];
}
}
}
void tran8x8b_SSE(char *A, char *B) {
__m128i mask = _mm_setr_epi8(0x0,0x04,0x01,0x05, 0x02,0x06,0x03,0x07, 0x08,0x0c,0x09,0x0d, 0x0a,0x0e,0x0b,0x0f);
__m128i B0, B1, B2, B3, T0, T1, T2, T3;
B0 = _mm_loadu_si128((__m128i*)&A[ 0]);
B1 = _mm_loadu_si128((__m128i*)&A[16]);
B2 = _mm_loadu_si128((__m128i*)&A[32]);
B3 = _mm_loadu_si128((__m128i*)&A[48]);
T0 = _mm_unpacklo_epi8(B0,B1);
T1 = _mm_unpackhi_epi8(B0,B1);
T2 = _mm_unpacklo_epi8(B2,B3);
T3 = _mm_unpackhi_epi8(B2,B3);
B0 = _mm_unpacklo_epi16(T0,T2);
B1 = _mm_unpackhi_epi16(T0,T2);
B2 = _mm_unpacklo_epi16(T1,T3);
B3 = _mm_unpackhi_epi16(T1,T3);
T0 = _mm_unpacklo_epi32(B0,B2);
T1 = _mm_unpackhi_epi32(B0,B2);
T2 = _mm_unpacklo_epi32(B1,B3);
T3 = _mm_unpackhi_epi32(B1,B3);
B0 = _mm_shuffle_epi8(T0,mask);
B1 = _mm_shuffle_epi8(T1,mask);
B2 = _mm_shuffle_epi8(T2,mask);
B3 = _mm_shuffle_epi8(T3,mask);
_mm_storeu_si128((__m128i*)&B[ 0], B0);
_mm_storeu_si128((__m128i*)&B[16], B1);
_mm_storeu_si128((__m128i*)&B[32], B2);
_mm_storeu_si128((__m128i*)&B[48], B3);
}
int main(void) {
char A[64], B[64], C[64];
for(int i=0; i<64; i++) A[i] = i;
print8x8b(A);
tran8x8b(A,B);
print8x8b(B);
tran8x8b_SSE(A,C);
print8x8b(C);
}
This was really interesting to me, and I was looking to do exactly this, but for various reasons, I ended up needing to do it in Go, instead of C, and I didn't have vector intrinsics, so I thought "well, I'll just write something and see how it does".
My reported times, on a ~3.6GHz CPU, are about 28ns per 64-byte block transposed for a naive implementation, and about 19ns each for one done using bit shifts. I used perf to confirm the numbers, which seemed a bit unlikely to me, and they seem to add up. The fancy bit shift implementation is a bit over 250 instructions, and gets about 3.6 instructions per cycle, so it comes out to about 69-70 cycles per operation.
This is Go, but honestly it should be trivial to implement; it's just treating the input array of 64 bytes as 8 uint64_t.
You can get another nanosecond or so by declaring some of these things as new variables to hint to the register allocator.
import (
"unsafe"
)
const (
hi16 = uint64(0xFFFF0000FFFF0000)
lo16 = uint64(0x0000FFFF0000FFFF)
hi8 = uint64(0xFF00FF00FF00FF00)
lo8 = uint64(0x00FF00FF00FF00FF)
)
// Okay, this might take some explaining. We are working on a logical
// 8x8 matrix of bytes, which we get as a 64-byte array. We want to transpose
// it (row/column).
//
// start:
// [[00 08 16 24 32 40 48 56]
// [01 09 17 25 33 41 49 57]
// [02 10 18 26 34 42 50 58]
// [03 11 19 27 35 43 51 59]
// [04 12 20 28 36 44 52 60]
// [05 13 21 29 37 45 53 61]
// [06 14 22 30 38 46 54 62]
// [07 15 23 31 39 47 55 63]]
//
// First, let's make sure everything under 32 is in the top four rows,
// and everything over 32 is in the bottom four rows. We do this by
// swapping pairs of 32-bit words.
// swap32:
// [[00 08 16 24 04 12 20 28]
// [01 09 17 25 05 13 21 29]
// [02 10 18 26 06 14 22 30]
// [03 11 19 27 07 15 23 31]
// [32 40 48 56 36 44 52 60]
// [33 41 49 57 37 45 53 61]
// [34 42 50 58 38 46 54 62]
// [35 43 51 59 39 47 55 63]]
//
// Next, let's make sure everything over 16 or 48 is in the bottom two
// rows of the two four-row sections, and everything under 16 or 48 is
// in the top two rows of the section. We do this by swapping masked
// pairs in much the same way:
// swap16:
// [[00 08 02 10 04 12 06 14]
// [01 09 03 11 05 13 07 15]
// [16 24 18 26 20 28 22 30]
// [17 25 19 27 21 29 23 31]
// [32 40 34 42 36 44 38 46]
// [33 41 35 43 37 45 39 47]
// [48 56 50 58 52 60 54 62]
// [49 57 51 59 53 61 55 63]]
//
// Now, we will do the same thing to each pair -- but because of
// clever choices in the specific arrangement leading up to this, that's
// just one more byte swap, where each 2x2 block has its upper right
// and lower left corners swapped, and that turns out to be an easy
// shift and mask.
func UnswizzleLazy(m *[64]uint8) {
// m32 treats the 8x8 array as a 2x8 array, because
// it turns out we only need to swap a handful of the
// bits...
m32 := (*[16]uint32)(unsafe.Pointer(&m[0]))
m32[1], m32[8] = m32[8], m32[1]
m32[3], m32[10] = m32[10], m32[3]
m32[5], m32[12] = m32[12], m32[5]
m32[7], m32[14] = m32[14], m32[7]
m64 := (*[8]uint64)(unsafe.Pointer(&m[0]))
// we're now at the state described above as "swap32"
tmp0, tmp1, tmp2, tmp3 :=
(m64[0]&lo16)|(m64[2]&lo16)<<16,
(m64[1]&lo16)|(m64[3]&lo16)<<16,
(m64[0]&hi16)>>16|(m64[2]&hi16),
(m64[1]&hi16)>>16|(m64[3]&hi16)
tmp4, tmp5, tmp6, tmp7 :=
(m64[4]&lo16)|(m64[6]&lo16)<<16,
(m64[5]&lo16)|(m64[7]&lo16)<<16,
(m64[4]&hi16)>>16|(m64[6]&hi16),
(m64[5]&hi16)>>16|(m64[7]&hi16)
// now we're at "swap16".
lo8 := lo8
hi8 := hi8
m64[0], m64[1] = (tmp0&lo8)|(tmp1&lo8)<<8, (tmp0&hi8)>>8|tmp1&hi8
m64[2], m64[3] = (tmp2&lo8)|(tmp3&lo8)<<8, (tmp2&hi8)>>8|tmp3&hi8
m64[4], m64[5] = (tmp4&lo8)|(tmp5&lo8)<<8, (tmp4&hi8)>>8|tmp5&hi8
m64[6], m64[7] = (tmp6&lo8)|(tmp7&lo8)<<8, (tmp6&hi8)>>8|tmp7&hi8
}
What this is doing is, I hope, reasonably obvious: shuffle the half-words around, so the first four words have all the values that belong in them, and the last four have all the values that belong in them. Then do a similar thing to each set of four words, so you end up with the things that belong in the top two words in the top two, etcetera.
I wasn't going to comment until I realized that, if the cycles/byte numbers above are right, this actually outperforms the shuffle/unpack solution.
(Note that this is an in-place transpose, but it's easy to use temps for the intermediate steps and do the final store somewhere else. It's actually probably faster.)
UPDATE: I originally described my algorithm slightly incorrectly, then I realized that I could actually do what I'd described. This one's running about 65.7 cycles per 64 bits.
EDIT #2: Tried one of the above AVX versions on this machine. On my hardware (Xeon E3-1505M, nominally 3GHz), I get a little over 10 cycles per 64-byte block, so, about 6 bytes per cycle. That seems a lot more reasonable to me than 1.5 cycles per byte did.
EDIT #3: Got down a bit further, about 45 cycles per 64 bits, by just writing the first part as shifts and masks on uint64 instead of trying to be "smart" and just move the 32 bits I cared about.
AVX512VBMI (Cascade Lake / Ice Lake)
AVX512VBMI introduces vpermb, a 64-byte lane-crossing shuffle with byte granularity.
_mm512_permutexvar_epi8( __m512i idx, __m512i a);
Existing CPUs that support it run it as a single uop, with 1/clock throughput. (https://www.uops.info/html-tp/CNL/VPERMB_ZMM_ZMM_M512-Measurements.html)
That trivializes the problem, making it possible with 1 instruction (at least for the stride=8 case where the whole 8x8 block is contiguous). Otherwise you should look at vpermt2b to shuffle together bytes from 2 sources. But that's 3 uops on CannonLake.
// TODO: strided loads / stores somehow for stride != 8
// AVX512VBMI
void TransposeBlock8x8_contiguous(uint8_t *src, uint8_t *dst)
{
const __m512i trans8x8shuf = _mm512_set_epi8(
63, 63-8*1, 63-8*2, 63-8*3, 63-8*4, ...
...
57, 49, 41, 33, 25, 17, 9, 1,
56, 48, 40, 32, 24, 16, 8, 0
);
__m512i vsrc = _mm512_loadu_si512(src);
__m512i shuffled = _mm512_permutexvar_epi8(trans8x8shuf, vsrc);
_mm512_storeu_si512(dst, shuffled);
}
https://godbolt.org/z/wrfyy3
Apparently _mm512_setr_epi8 doesn't exist for gcc/clang (only the 256 and 128 versions) so you have to define the constant in last-to-first order, opposite of C array initializer order.
vpermb even works with the data as a memory source operand, so it can load+shuffle in a single instruction. But according to https://uops.info/, it doesn't micro-fuse on CannonLake: unlike vpermd zmm, zmm, [r14], which decodes to 1 fused-domain uop (note "retire_slots: 1.0"), vpermb zmm, zmm, [r14] decodes to 2 separate uops for the front-end / fused-domain ("retire_slots: 2.0"). This is from experimental testing with perf counters on a real CannonLake CPU. uops.info doesn't have a Cascade Lake or Ice Lake available yet, so it's possible it will be even more efficient there.
The uops.info tables uselessly count total number of unfused-domain uops, so you have to click on an instruction to see if it micro-fuses or not.
Source or dst strided, not contiguous, data
I guess you'd want to do qword (8-byte) loads into XMM registers and shuffle together pairs of inputs, or concatenate them with movhps or pinsrq. It's possibly worth it to use a qword-gather load with strided indices, but that's often not worth it.
I'm not sure if it's worth combining as far as YMM registers, let alone ZMM, or if it's best to only get as wide as XMM registers so we can efficiently scatter qwords back to memory manually with vmovq and vmovhps (which don't need a shuffle uop, just a store, on Intel CPUs). If the dst is contiguous, merging a non-contiguous strided src makes a lot more sense.
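As an illustration of that qword-pair idea (a sketch only; the helper names and stride parameters are hypothetical, not from the question): two strided rows can be concatenated into one XMM register on load, and written back to two strided rows with plain movq / movhps stores, no shuffle uop needed.
#include <immintrin.h>
#include <stdint.h>

// Load rows r and r+1 (8 bytes each, srcStride apart) into one XMM register.
static inline __m128i load_row_pair(const uint8_t *src, int srcStride) {
    __m128i lo = _mm_loadl_epi64((const __m128i*)src);               // row r   -> low qword
    __m128i hi = _mm_loadl_epi64((const __m128i*)(src + srcStride)); // row r+1
    return _mm_unpacklo_epi64(lo, hi);                               // concatenate (punpcklqdq)
}

// Store the two qwords of v to rows r and r+1 of a strided destination:
// movq for the low half, movhps for the high half -- just stores, no shuffle uop.
static inline void store_row_pair(uint8_t *dst, int dstStride, __m128i v) {
    _mm_storel_epi64((__m128i*)dst, v);
    _mm_storeh_pd((double*)(dst + dstStride), _mm_castsi128_pd(v));
}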
AVX512VBMI vpermt2b ymm looks like it would be useful for shuffle+merge like a lane-crossing punpcklbw, selecting any 32 bytes from the concatenation of two other 32-byte YMM registers. (Or 64 from 2x 64-byte regs for the ZMM version). But unfortunately on CannonLake it costs 3 uops, like vpermt2w on Skylake-X and Cannon Lake.
If we can worry about bytes later, vpermt2d is efficient on CPUs that support it (single uop)! Skylake-X and later.
Ice Lake has one per 2-cycle throughput for vpermt2b (instlat), perhaps because it has an extra shuffle unit that can run some (but not all) shuffle uops. Notice for example that vpshufb xmm and ymm is 0.5c throughput, but vpshufb zmm is 1c throughput. But vpermb is always just 1c throughput.
I wonder if we can take advantage of merge-masking? Like maybe vpmovzxbq to zero-extend input bytes to qwords. (one 8-byte row -> 64-byte ZMM register). Then maybe dword left-shift with merge-masking into another register? No, that doesn't help, the useful data is in the same dword elements for both inputs unless you do something to one register first, defeating the purpose.
Overlapped byte-masked stores (vmovdqu8 [rdi + 0..7]{k1}, zmm0..7) of vpmovzxbq load results are also possible, but probably not efficient. All but one of them would be misaligned, at best. The store buffer and/or cache hardware might be able to efficiently commit 8x masked stores, though.
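For concreteness, a sketch of that vpmovzxbq + byte-masked-store idea for a contiguous 8x8 block (illustrative only and, as noted, probably not efficient; the function name is made up):
#include <immintrin.h>
#include <stdint.h>

// AVX-512BW sketch: row i of src ends up as column i of a contiguous dst.
void transpose8x8_masked_stores(const uint8_t *src, uint8_t *dst)
{
    for (int i = 0; i < 8; i++) {
        __m128i row    = _mm_loadl_epi64((const __m128i*)(src + 8*i)); // one 8-byte row
        __m512i spread = _mm512_cvtepu8_epi64(row);   // vpmovzxbq: byte k -> low byte of qword k
        // Keep only the low byte of each qword (vector bytes 0, 8, 16, ..., 56),
        // so src[8*i + k] is written to dst[i + 8*k]. All but one store is misaligned.
        _mm512_mask_storeu_epi8(dst + i, 0x0101010101010101ULL, spread);
    }
}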
A hybrid strategy doing some moving around in registers and some masked-stores might be interesting to balance shuffle/blend vs. store work for a contiguous dst. Especially if all the stores can be aligned, which would require moving data around in each vector so it's in the right place.
Ice Lake has 2 store execution units. (IDK if L1d cache commit can keep up with that, or if merging in the store buffer usually helps, or if that just helps with bursts of work.)
Most answers here use a combination of different-sized shuffles and permutations via _mm_shuffle_epi8, which is only available with SSSE3 and above.
A pure SSE2 implementation with a 12*-instruction kernel can be formed by interleaving the first 32 elements with the last 32 elements three times in a row:
void systolic_kernel(__m128i a[4]) {
__m128i a0 = _mm_unpacklo_epi8(a[0], a[2]);
__m128i a1 = _mm_unpackhi_epi8(a[0], a[2]);
__m128i a2 = _mm_unpacklo_epi8(a[1], a[3]);
__m128i a3 = _mm_unpackhi_epi8(a[1], a[3]);
a[0] = a0;
a[1] = a1;
a[2] = a2;
a[3] = a3;
}
void transpose(__m128i a[4]) {
systolic_kernel(a);
systolic_kernel(a);
systolic_kernel(a);
}
*Without VEX encoding (which provides three-operand instructions), there will be 6 potentially zero-cost movdqa instructions added.
The same strategy can be more easily applied to 4x4, 16x16 transposes and more, as the calculation of the indices to be permuted and the block sizes is factored out from the equation.
Related
Related: bitpack ascii string into 7-bit binary blob using ARM-v8 Neon SIMD - same question specialized for AArch64 intrinsics. This question covers portable C and x86-64 intrinsics.
I would like to encode a char string as a 7-bit blob to gain a 12.5% reduction in memory.
I want to do it as fast as possible, i.e. with minimal latency when encoding large strings.
Here is the plain implementation of the algo:
void ascii_pack(const char* ascii, size_t len, uint8_t* bin) {
uint64_t val;
const char* end = ascii + len;
while (ascii + 8 <= end) {
memcpy(&val, ascii, 8);
uint64_t dest = (val & 0xFF);
// Compiler will perform loop unrolling
for (unsigned i = 1; i <= 7; ++i) {
val >>= 1;
dest |= (val & (0x7FUL << 7 * i));
}
memcpy(bin, &dest, 7);
bin += 7;
ascii += 8;
}
// epilog - we do not pack since we have less than 8 bytes.
while (ascii < end) {
*bin++ = *ascii++;
}
}
Now, I would like to speed it up with SIMD. I came up with the SSE2 algo below.
My question:
is it possible to optimize the internal loop that is sequential?
will it improve the throughput when running on large strings?
// The algo - do in parallel what ascii_pack does on two uint64_t integers
void ascii_pack_simd(const char* ascii, size_t len, uint8_t* bin) {
__m128i val;
__m128i mask = _mm_set1_epi64x(0x7FU); // two uint64_t masks
// I leave out 16 bytes in addition to 16 that we load in the loop
// because we store into "bin" full 16 bytes instead of 14. To prevent out of bound
// writes we finish one iteration earlier.
const char* end = ascii + len - 32;
while (ascii <= end) {
val = _mm_loadu_si128(reinterpret_cast<const __m128i*>(ascii));
__m128i dest = _mm_and_si128(val, mask);
// Compiler unrolls it
for (unsigned i = 1; i <= 7; ++i) {
val = _mm_srli_epi64(val, 1); // shift right both integers
__m128i shmask = _mm_slli_epi64(mask, 7 * i); // mask both
dest = _mm_or_si128(dest, _mm_and_si128(val, shmask)); // add another 7bit part.
}
// dest contains two 7 byte blobs. Lets copy them to bin.
_mm_storeu_si128(reinterpret_cast<__m128i*>(bin), dest);
memmove(bin + 7, bin + 8, 7);
bin += 14;
ascii += 16;
}
end += 32; // Bring back end.
DCHECK(ascii < end);
ascii_pack(ascii, end - ascii, bin);
}
The scalar trick (without requiring PEXT) which I referred to in the comments could be implemented like this:
uint64_t compress8x7bit(uint64_t x)
{
x = ((x & 0x7F007F007F007F00) >> 1) | (x & 0x007F007F007F007F);
x = ((x & 0x3FFF00003FFF0000) >> 2) | (x & 0x00003FFF00003FFF);
x = ((x & 0x0FFFFFFF00000000) >> 4) | (x & 0x000000000FFFFFFF);
return x;
}
The idea here is to concatenate together adjacent pairs, first concatenate 7-bit elements into 14-bit elements, then concatenate them into 28-bit elements, and finally concatenate them into one 56-bit chunk (which is the result).
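As a usage sketch (not part of the original answer), this helper drops straight into the scalar loop from the question in place of the unrolled shift-and-or sequence; it produces the same 56-bit layout:
#include <stdint.h>
#include <string.h>

// Hypothetical wrapper: same I/O scheme as the question's ascii_pack().
void ascii_pack_scalar_v2(const char* ascii, size_t len, uint8_t* bin) {
    const char* end = ascii + len;
    while (ascii + 8 <= end) {
        uint64_t val;
        memcpy(&val, ascii, 8);
        uint64_t packed = compress8x7bit(val);  // 56 useful bits in the low bytes
        memcpy(bin, &packed, 7);
        bin += 7;
        ascii += 8;
    }
    while (ascii < end)   // tail: fewer than 8 bytes left, store unpacked
        *bin++ = *ascii++;
}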
With SSSE3, you could use pshufb to concatenate two of those 56-bit parts (before storing them) too.
SSE2 (and AVX2) can do the same thing as that scalar code with 64-bit elements, but this approach does not take advantage of any techniques that may be possible with special operations (which SSE2+ has plenty of, more with every version), there are probably better things to do than just implementing the scalar trick in SIMD.
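For completeness, a direct (and deliberately unclever) SSE2 translation of that scalar trick could look like this sketch, packing one 56-bit chunk per 64-bit lane:
#include <immintrin.h>

static inline __m128i compress8x7bit_sse2(__m128i x)
{
    x = _mm_or_si128(_mm_srli_epi64(_mm_and_si128(x, _mm_set1_epi64x(0x7F007F007F007F00)), 1),
                     _mm_and_si128(x, _mm_set1_epi64x(0x007F007F007F007F)));
    x = _mm_or_si128(_mm_srli_epi64(_mm_and_si128(x, _mm_set1_epi64x(0x3FFF00003FFF0000)), 2),
                     _mm_and_si128(x, _mm_set1_epi64x(0x00003FFF00003FFF)));
    x = _mm_or_si128(_mm_srli_epi64(_mm_and_si128(x, _mm_set1_epi64x(0x0FFFFFFF00000000)), 4),
                     _mm_and_si128(x, _mm_set1_epi64x(0x000000000FFFFFFF)));
    return x;   // each 64-bit lane now holds a 56-bit packed chunk in its low bits
}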
For example just to throw something wild out there, gf2p8affineqb(0x8040201008040201, x) would put all the "discarded" bits in one place (namely the top byte of the result) and makes a solid 56-bit chunk out of the bits that we want to keep. But the bits do end up in a strange order (the first byte would contain bits 56, 48, 40, 32, 24, 16, 8, 0, in that order, listing the least significant bit first).
That order, strange as it is, can be easily unpacked using pshufb to reverse the bytes (you can also use this to insert the two zeroes) and then gf2p8affineqb(0x0102040810204080, reversedBytes) shuffles the bits back into the original order.
Here's a sketch of how that could work with actual AVX2+GFNI intrinsics. I'm not bothering to handle the extra parts at the end here, just the "main" loop, so the input text had better be a multiple of 32 bytes. Works on my PC ✔️
void compress8x7bit(const char* ascii, size_t len, uint8_t* bin)
{
const char* end = ascii + len;
while (ascii + 31 < end) {
__m256i text = _mm256_loadu_si256((__m256i*)ascii);
__m256i transposed = _mm256_gf2p8affine_epi64_epi8(_mm256_set1_epi64x(0x8040201008040201), text, 0);
__m256i compressed = _mm256_shuffle_epi8(transposed,
_mm256_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0,
-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0));
_mm_storeu_si128((__m128i*)bin, _mm256_castsi256_si128(compressed));
_mm_storeu_si128((__m128i*)(bin + 14), _mm256_extracti128_si256(compressed, 1));
bin += 28;
ascii += 32;
}
}
void uncompress8x7bit(char* ascii, size_t len, const uint8_t* bin)
{
const char* end = ascii + len;
while (ascii + 31 < end) {
__m256i raw = _mm256_inserti128_si256(_mm256_castsi128_si256(_mm_loadu_si128((__m128i*)bin)), _mm_loadu_si128((__m128i*)(bin + 14)), 1);
__m256i rev_with_zeroes = _mm256_shuffle_epi8(raw,
_mm256_set_epi8(7, 8, 9, 10, 11, 12, 13, -1, 0, 1, 2, 3, 4, 5, 6, -1,
7, 8, 9, 10, 11, 12, 13, -1, 0, 1, 2, 3, 4, 5, 6, -1));
__m256i decompressed = _mm256_gf2p8affine_epi64_epi8(_mm256_set1_epi64x(0x0102040810204080), rev_with_zeroes, 0);
_mm256_storeu_si256((__m256i*)ascii, decompressed);
bin += 28;
ascii += 32;
}
}
Perhaps there is a nicer solution than using two 128-bit stores in the compressor and two 128-bit loads in the uncompressor. With AVX512 that would be easy, since it has full-register byte-granular permutes, but AVX2 only has vpshufb, which is not able to move bytes between the two 128-bit halves that make up a 256-bit vector. The uncompressor could do a funny load that starts 2 bytes before the start of the data it wants, like this: _mm256_loadu_si256((__m256i*)(bin - 2)) (and a slightly different shuffle vector), at the cost of having to avoid a potential out-of-bounds error with either padding or a special first iteration. The compressor cannot (not cheaply) use a trick like that with a store that starts 2 bytes earlier, since that would destroy two bytes of the result.
By the way I have some test code here that you can use to verify that your bit-compression functions do the right thing (well sort of - as long as the function is a bit-permutation where some of the bits may be zeroed this works as a check, but this would not detect every possible bug in general):
uint64_t bitindex[7];
bitindex[6] = compress8x7bit(0xFFFFFFFFFFFFFFFF);
bitindex[5] = compress8x7bit(0xFFFFFFFF00000000);
bitindex[4] = compress8x7bit(0xFFFF0000FFFF0000);
bitindex[3] = compress8x7bit(0xFF00FF00FF00FF00);
bitindex[2] = compress8x7bit(0xF0F0F0F0F0F0F0F0);
bitindex[1] = compress8x7bit(0xCCCCCCCCCCCCCCCC);
bitindex[0] = compress8x7bit(0xAAAAAAAAAAAAAAAA);
for (size_t i = 0; i < 64; i++)
{
if (i != 0)
std::cout << ", ";
if (bitindex[6] & (1uLL << i))
{
int index = 0;
for (size_t j = 0; j < 6; j++)
{
if (bitindex[j] & (1uLL << i))
index |= 1 << j;
}
std::cout << index;
}
else
std::cout << "_";
}
std::cout << "\n";
You can improve the solution by @harold if you replace the first two mask-and-shift steps by a vpmaddubsw and a vpmaddwd (each using 1 instead of 4 uops). The next step can be replaced by shifting every other 32-bit element 4 to the left and afterwards shifting all 64-bit elements 4 to the right. Of course, by using AVX2 instead of SSE, you can again double the throughput.
The final step of joining the lower and upper lane is likely most efficiently done by two separate stores which extract each lane directly to memory.
void ascii_pack32(char const* ascii, char* bin)
{
const __m256i control = _mm256_set_epi8(-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0,
-1, -1, 14, 13, 12, 11, 10, 9, 8, 6, 5, 4, 3, 2, 1, 0);
__m256i input = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ascii));
// only necessary if high bits of input might be set:
input = _mm256_and_si256(input, _mm256_set1_epi8(0x7f));
__m256i t1 = _mm256_maddubs_epi16(_mm256_set1_epi16(0x8001), input);
__m256i t2 = _mm256_madd_epi16(_mm256_set1_epi32(0x40000001), t1);
__m256i t3 = _mm256_srli_epi64(_mm256_sllv_epi32(t2, _mm256_set1_epi64x(4)), 4);
__m256i val = _mm256_shuffle_epi8(t3, control);
_mm_storeu_si128(reinterpret_cast<__m128i*>(bin), _mm256_castsi256_si128(val));
_mm_storeu_si128(reinterpret_cast<__m128i*>(bin+14), _mm256_extracti128_si256(val, 1));
}
Godbolt link with short testcode:
https://godbolt.org/z/hs7477h5W
SIMD unpack can benefit from blend instructions instead of and/andn/or because we can blend at dword / word / byte boundaries. We only need to AND once at the end to clear the high bit of each byte.
#include <immintrin.h>
static inline
__m128i ascii_unpack7x8_sse4(__m128i v)
{
__m128i separate_7B_halves = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, -1,
7, 8, 9,10,11,12,13, -1);
v = _mm_shuffle_epi8(v, separate_7B_halves);
// separate each u64 qword into 2 u32 halves, with the good bits at the bottom
__m128i shifted = _mm_slli_epi64(v, 4);
#ifdef __AVX2__
v = _mm_blend_epi32(v, shifted, 0b1010); // vpblendd is very efficient, 1 uop any port
#else
v = _mm_castps_si128(_mm_blend_ps( // blendps has extra bypass latency between integer insns, but is single-uop
_mm_castsi128_ps(v), _mm_castsi128_ps(shifted), 0b1010) );
#endif
// Separate each u32 into u16
shifted = _mm_slli_epi32(v, 2);
v = _mm_blend_epi16(v, shifted, 0b10101010); // high halves of pairs from shifted
// Separate each u16 into bytes, with one of two strategies
#if 0 // this strategy is simpler but worse
// shifted = _mm_add_epi16(v, v); // v<<1
// v = _mm_blendv_epi8(v, shifted, _mm_set1_epi16(0xff00));
// v = _mm_and_si128(v, _mm_set1_epi8(0x7f)); // clear garbage from high bits
#else
__m128i hi = _mm_and_si128(v, _mm_set1_epi16(0x3f80)); // isolate hi half
v = _mm_and_si128(v, _mm_set1_epi16(0x007f)); // clear high garbage
v = _mm_add_epi16(v, hi); // high halves left 1 (x+=x), low halves stay (x+=0)
// both ways need two vector constants and 3 instructions, but pblendvb can be slower and has an awkward requirement of having the control vector in XMM0
#endif
return v;
}
With AVX2 available, clang compiles it to this nice asm. Godbolt
# clang -O3 -march=x86-64-v3 (implies AVX2+BMI2, basically Haswell with generic tuning)
ascii_unpack7x8_sse4(long long __vector(2)):
vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI0_0] # xmm0 = xmm0[0,1,2,3,4,5,6],zero,xmm0[7,8,9,10,11,12,13],zero
vpsllq xmm1, xmm0, 4
vpblendd xmm0, xmm0, xmm1, 10 # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
vpslld xmm1, xmm0, 2
vpblendw xmm0, xmm0, xmm1, 170 # xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI0_1] # in a loop, these constants would be in registers
vpand xmm0, xmm0, xmmword ptr [rip + .LCPI0_2]
vpaddw xmm0, xmm0, xmm1
ret
With just SSE4.1, compilers need several movdqa instructions, as in GCC's output. And out-of-order exec will have an extra 1 or 2 cycles of latency to hide because of bypass-forwarding delays for integer shifts forwarding to an FP blendps, on Intel CPUs. (https://agner.org/optimize/). But that's fine, we're doing this in a loop over an array, modern CPUs have deep enough out-of-order exec.
# gcc -O3 -march=x86-64-v2 # SSE4.2, Nehalem. Actually only using SSE4.1
ascii_unpack7x8_sse4(long long __vector(2)):
movdqa xmm1, xmm0 # silly compiler wastes a MOV
pshufb xmm1, XMMWORD PTR .LC0[rip]
movdqa xmm0, xmm1 # save unshifted v
psllq xmm0, 4
blendps xmm1, xmm0, 10 # 0b1010 = 0xA
movdqa xmm0, xmm1
pslld xmm0, 2
pblendw xmm1, xmm0, 170 # 0b10101010 = 0xAA
movdqa xmm0, XMMWORD PTR .LC1[rip] # after inlining, probably a reg-copy
pand xmm0, xmm1 # and two PAND xmm,xmm
pand xmm1, XMMWORD PTR .LC2[rip]
paddw xmm0, xmm1
ret
If AVX2 is available, an __m256i version of this is straightforward and wouldn't need the blendps fallback. That may be better than scalar pdep (BMI2). AVX2 vpsrlvd or q (per-element shift counts) seem like they should help, but we find ourselves needing to move bits across dword boundaries, and it can only be left or right, not alternating directions. (AVX512 has variable-count rotates (32 and 64-bit), and 16-bit variable-count shifts. Rotates let you go right or left with the same instruction.)
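A rough sketch of what such an __m256i version could look like (an illustration, not tested code from this answer; it assumes the caller has already placed one 14-byte group in the low 14 bytes of each 128-bit lane, e.g. from two 128-bit loads):
#include <immintrin.h>

static inline __m256i ascii_unpack7x8_avx2(__m256i v)
{
    // Same mask in both lanes: vpshufb can't cross the 128-bit halves anyway.
    __m256i separate_7B_halves = _mm256_setr_epi8(0, 1, 2, 3, 4, 5, 6, -1,
                                                  7, 8, 9,10,11,12,13, -1,
                                                  0, 1, 2, 3, 4, 5, 6, -1,
                                                  7, 8, 9,10,11,12,13, -1);
    v = _mm256_shuffle_epi8(v, separate_7B_halves);
    __m256i shifted = _mm256_slli_epi64(v, 4);
    v = _mm256_blend_epi32(v, shifted, 0b10101010);   // dword blend, no blendps fallback needed
    shifted = _mm256_slli_epi32(v, 2);
    v = _mm256_blend_epi16(v, shifted, 0b10101010);   // same 8-bit immediate applies to both lanes
    __m256i hi = _mm256_and_si256(v, _mm256_set1_epi16(0x3f80)); // isolate hi half of each u16
    v = _mm256_and_si256(v, _mm256_set1_epi16(0x007f));          // clear high garbage
    return _mm256_add_epi16(v, hi);                   // shift the high halves left by 1 via x+=x
}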
The shift element size could be 64 each time; our blends drop bits that would get shifted into the low element of a pair. For the final step, paddw is 1 byte smaller than psllw/d/q because it has no immediate operand. And can run on more ports on most CPUs. Especially Haswell, where shifts can only run on port 0, but paddw can run on port 1 or 5. (This code has no instruction-level parallelism within one iteration, so we rely on out-of-order exec to overlap execution of multiple iterations.)
Skylake through Alder Lake run SIMD shifts on p01, SIMD inter adds on p015, blendps on p015, pblendw on p5 (p15 for Alder Lake), pblendvb as 1 uop for p015. (Only the non-AVX encoding; vpblendvb is 2 uops for p015). Zen 3 for example has plenty of throughput for all of these.
The final step avoiding _mm_blendv_epi8 has several advantages:
Both ways need two vector constants and 3 instructions. (And no difference in the minimum number of movdqa register-copies a compiler has to invent without non-destructive AVX instructions.)
The AND/AND/ADD version has better ILP; two ANDs in parallel.
SSE4.1 pblendvb can be slower (e.g. Haswell runs it as 2 uops for port 5) and has an awkward requirement of having the control vector in XMM0. Some compilers may waste instructions with hard-reg constraints. (Maybe even when inlining into a loop, unlike when we look at how this helper function would compile on its own.)
vpblendvb (the AVX encoding of it) is 2 uops (for any port) on newer Intel, or 3 on Alder Lake, presumably as the price for having 4 operands (3 inputs and a separate output). Also the AVX version is slow on Alder Lake E-cores (4 uops, 3.23 cycle throughput) https://uops.info/.
AMD CPUs don't have this problem; for example Zen 3 runs vpblendvb as 1 uop for either of two ports.
The only possible upside to the blend version is that the constants are easier to construct on the fly. GCC12 has started preferring to construct some constants on the fly when AVX is available, but does a rather bad job of it, using 10-byte mov r64, imm64 / vmovq / vpunpcklqdq instead of 5-byte mov reg, imm32 / ... / vpbroadcastd or pshufd v,v,0. Or instead of starting with an all-ones vector and shifting.
Actually, the constants for the non-blend way can be generated from an all-ones vector with psrlw xmm, 9 to get 0x007f, and then left shifting that 7-bit mask left by 7. So with AVX, 3 total instructions for both masks, without memory access. Unfortunately compilers don't know how to do this optimization so it's a moot point.
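In intrinsics, that constant generation would be something like this sketch (which, as noted, you would have to write by hand because compilers won't do it for you):
#include <immintrin.h>

static inline void make_7bit_masks(__m128i *lo7, __m128i *hi7)
{
    __m128i ones = _mm_set1_epi32(-1);        // compilers materialize this as pcmpeqd xmm,xmm
    *lo7 = _mm_srli_epi16(ones, 9);           // 0x007f in each 16-bit element
    *hi7 = _mm_slli_epi16(*lo7, 7);           // 0x3f80 in each 16-bit element
}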
AVX-512F / BW, without AVX-512VBMI / AVX-512GFNI
If you have Ice Lake / Zen4 features, you want @harold's answer; as I commented there, it's slightly better than AVX-512 vpmultishiftqb (parallel bitfield-extract within a qword).
But if not, with Skylake-X / Cascade Lake features (AVX-512BW and F) you have masking and variable-count rotates. This saves 2 instructions vs. the SSE4 version (built with AVX2); it feels like there should be room to save more, especially at the final step within 16-bit elements. But masking has byte granularity, and there is no vprolvw, and still no byte shift, unlike AArch64 which can shift elements in 2 directions at byte granularity.
Splitting things apart and doing different things, then merging with a merge-masking vmovdqa could work, but I don't think would help.
#ifdef __AVX512BW__
// pre-Ice Lake, without AVX-512VBMI or AVX512-GFNI
__m128i ascii_unpack7x8_avx512bw(__m128i v)
{
// for YMM or ZMM, use VPERMW, or VPERMB if we have AVX512VBMI since unfortunately VPERMW isn't single-uop on Intel CPUs that support both.
__m128i separate_7B_halves = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, -1,
7, 8, 9,10,11,12,13, -1);
v = _mm_shuffle_epi8(v, separate_7B_halves);
v = _mm_slli_epi64(v, 4); // 00000HGFE | DCBA0000 // dword halves
v = _mm_rolv_epi32(v, _mm_set_epi32(2, 32-2, 2, 32-2));
// 00HG|FE00 | 00DC|BA00 // u16 chunks of a u64
v = _mm_mask_srli_epi16(v, (__mmask8)0b0101'0101, v, 2); // 00HG | 00FE | 00DC | 00BA
// Separate each u16 into bytes
__m128i hi = _mm_and_si128(v, _mm_set1_epi16(0x3f80)); // isolate hi half
v = _mm_add_epi16(v, hi); // high halves left 1 (x+=x), low halves stay (x+=0)
// 0H0G | 0F0E | 0D0C | 0B0A in each qword.
return v;
}
#endif
Clang (Godbolt) optimizes the masked right-shift to a variable-count right shift, which is a good idea for a stand-alone function not in a loop especially when we're loading other constants.
This uses more non-immediate constants, but fewer uops. A wider version of this using vpermw to unpack 14-byte chunks to 16-byte lanes might have to do something to introduce zero bits where they're needed, perhaps using zero-masking on the shuffle. But I think we'd still need vpshufb within lanes, so it can zero those high bits.
Having those known zeros that we move around with shifts and rotates is what lets us only use one and and add at the end, unlike the blending version where elements end up with high garbage so we need to mask both ways.
# clang -O3 -march=x86-64-v4
ascii_unpack7x8_avx512bw(long long __vector(2)):
vpshufb xmm0, xmm0, xmmword ptr [rip + .LCPI1_0] # xmm0 = xmm0[0,1,2,3,4,5,6],zero,xmm0[7,8,9,10,11,12,13],zero
vpsllq xmm0, xmm0, 4
vprolvd xmm0, xmm0, xmmword ptr [rip + .LCPI1_1]
vpsrlvw xmm0, xmm0, xmmword ptr [rip + .LCPI1_2]
vpand xmm1, xmm0, xmmword ptr [rip + .LCPI1_3]
vpaddw xmm0, xmm1, xmm0
ret
These constants would of course be loaded into registers.
Just 6 uops; shifts run on port 0 or 1, shuffles on port 5, on Skylake, with VPAND and VPADD able to run on any of the 3 vector ALU ports. So it's a good balance, not running into back-end throughput bottlenecks on a specific port. (vs. 8 uops with clang's AVX build of the SSE4 version)
GCC using masking as requested, again the constant init will get hoisted out of loops, including k1.
# gcc -O3 -march=x86-64-v4
ascii_unpack7x8_avx512bw(long long __vector(2)):
vpshufb xmm0, xmm0, XMMWORD PTR .LC0[rip]
mov eax, 85 # 0x55
vpsllq xmm0, xmm0, 4
kmovb k1, eax
movabs rax, 4575727041462157184 # 0x3F803F803F803F80 silly to use a 64-bit immediate
vprolvd xmm0, xmm0, XMMWORD PTR .LC3[rip]
vpbroadcastq xmm1, rax
vpsrlw xmm0{k1}, xmm0, 2
vpand xmm1, xmm0, xmm1
vpaddw xmm0, xmm0, xmm1
ret
Same instructions doing the work, just setting up constants differently. (Except for vpsrlw xmm0{k1}, xmm0, 2 to shift some elements but not others.)
Backporting my arm64 answer to SSE2, we can simulate variadic shifts by mullo_epu16 and mulhi_epu16; first pack adjacent 7+7-bit values as consecutive:
// 0b'0aaaaaaa'0bbbbbbb + 0b'00000000'0bbbbbbb = 0b'0aaaaaaa'bbbbbbb0
a0 = _mm_add_epi16(a, _mm_and_si128(a, _mm_set1_epi16(0x7f)));
// a0 = 0aaaaaaabbbbbbb0'0cccccccddddddd0'0eeeeeeefffffff0'0ggggggghhhhhhh0
a1 = _mm_mulhi_epu16(a0, kShift);   // kShift = 1 << {9,11,13,15} per 16-bit lane
a2 = _mm_mullo_epu16(a0, kShift);
a3 = _mm_bsrli_si128(a2, 2);
// a1 = 00000000aaaaaaab'000000cccccccddd'0000eeeeeeefffff'00ggggggghhhhhhh
// a2 = bbbbbb0000000000'dddd000000000000'ff00000000000000'0000000000000000
// a3 = 0000000000000000'bbbbbb0000000000'dddd000000000000'ff00000000000000
return _mm_or_si128(a1, a3);
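kShift is not defined in the fragment above; going by the bit-layout comments (most-significant 16-bit lane written first), it would presumably be a per-lane power of two along these lines (an assumption, not the author's definition):
// Assumed: within each 64-bit group the lowest 16-bit lane is multiplied by 1<<15
// and the highest by 1<<9, matching the layouts shown above.
const __m128i kShift = _mm_set_epi16((short)(1 <<  9), (short)(1 << 11),
                                     (short)(1 << 13), (short)(1 << 15),
                                     (short)(1 <<  9), (short)(1 << 11),
                                     (short)(1 << 13), (short)(1 << 15));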
I'm trying to translate some scalar code (calc_offsets below) into the AVX2 equivalent. It takes a series of 'counts' and generates a table of offset positions, starting from some provided base value.
My attempt at converting this to AVX2 (avx2_calc_offsets), which I think is correct, seems to be about half the speed of the simple array approach. This is part of an effort to translate a larger hot section of (bottlenecked) code into AVX2 instructions and I'm looking to process the offsets further as vectors. I'd like to avoid jumping between AVX2 and scalar code for operations like this.
There's some example and simple benchmarking code provided. I'm getting around 2.15 seconds runtime for the array version and 4.41 seconds for the AVX2 version (on Ryzen Zen v1).
Is there a better approach using AVX2 to make this operation faster? I need to consider older AVX2 CPUs such as Haswell and original Ryzen series.
#include <immintrin.h>
#include <inttypes.h>
#include <stdio.h>
typedef uint32_t u32;
typedef uint64_t u64;
void calc_offsets (const u32 base, const u32 *counts, u32 *offsets)
{
offsets[0] = base;
offsets[1] = offsets[0] + counts[0];
offsets[2] = offsets[1] + counts[1];
offsets[3] = offsets[2] + counts[2];
offsets[4] = offsets[3] + counts[3];
offsets[5] = offsets[4] + counts[4];
offsets[6] = offsets[5] + counts[5];
offsets[7] = offsets[6] + counts[6];
}
__m256i avx2_calc_offsets (const u32 base, const __m256i counts)
{
const __m256i shuff = _mm256_set_epi32 (6, 5, 4, 3, 2, 1, 0, 7);
__m256i v, t;
// shift whole vector `v` 4 bytes left and insert `base`
v = _mm256_permutevar8x32_epi32 (counts, shuff);
v = _mm256_insert_epi32 (v, base, 0);
// accumulate running total within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// add highest value in right-hand lane to each value in left
t = _mm256_set1_epi32 (_mm256_extract_epi32 (v, 3));
v = _mm256_blend_epi32 (_mm256_add_epi32 (v, t), v, 0x0F);
return v;
}
int main (void)
{
u32 base = 900000000;
u32 counts[8] = { 5, 50, 500, 5000, 50000, 500000, 5000000, 50000000 };
u32 offsets[8];
calc_offsets (base, &counts[0], &offsets[0]);
printf ("calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
__m256i v, t;
v = _mm256_loadu_si256 ((__m256i *) &counts[0]);
t = avx2_calc_offsets (base, v);
_mm256_storeu_si256 ((__m256i *) &offsets[0], t);
printf ("avx2_calc_offsets: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
// --- benchmarking ---
#define ITERS 1000000000
// uncomment to benchmark AVX2 version
// #define AVX2_BENCH
#ifdef AVX2_BENCH
// benchmark AVX2 version
for (u64 i = 0; i < ITERS; i++) {
v = avx2_calc_offsets (base, v);
}
_mm256_storeu_si256 ((__m256i *) &offsets[0], v);
#else
// benchmark array version
u32 *c = &counts[0];
u32 *o = &offsets[0];
for (u64 i = 0; i < ITERS; i++) {
calc_offsets (base, c, o);
// feedback results to prevent optimizer 'cleverness'
u32 *tmp = c;
c = o;
o = tmp;
}
#endif
printf ("offsets after benchmark: ");
for (int i = 0; i < 8; i++) printf (" %u", offsets[i]);
printf ("\n-----\n");
}
I'm using gcc -O2 -mavx2 ... to build. Godbolt link.
Eliminating the upfront _mm256_permutevar8x32_epi32 (vpermd) seems to make a massive difference here. This is likely because of its large latency (8 cycles on Ryzen?) and the immediate dependency upon it of all subsequent instructions.
Instead of feeding in the base value upfront, I'm folding it in during the addition that carries the prefix sum between 128-bit lanes.
__m256i avx2_calc_offsets_2 (const u32 base, const __m256i counts)
{
__m256i b, t, v;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// extract highest value in right-hand lane and combine with base offset
t = _mm256_set1_epi32 (_mm256_extract_epi32 (v, 3));
b = _mm256_set1_epi32 (base);
t = _mm256_blend_epi32 (_mm256_add_epi32 (b, t), b, 0x0F);
// combine with shifted running totals
v = _mm256_add_epi32 (_mm256_slli_si256 (v, 4), t);
return v;
}
Godbolt link
Assembly comparison between the two versions:
avx2_calc_offsets:
vmovdqa ymm1, YMMWORD PTR .LC0[rip]
vpermd ymm0, ymm1, ymm0
vpinsrd xmm1, xmm0, edi, 0
vinserti128 ymm0, ymm0, xmm1, 0x0
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpsrldq xmm1, xmm0, 12
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm0
vpblendd ymm0, ymm1, ymm0, 15
ret
avx2_calc_offsets_2:
vpslldq ymm1, ymm0, 4
vmovd xmm2, edi
vpaddd ymm1, ymm1, ymm0
vpbroadcastd ymm2, xmm2
vpslldq ymm0, ymm1, 8
vpaddd ymm1, ymm1, ymm0
vpsrldq xmm0, xmm1, 12
vpslldq ymm1, ymm1, 4
vpbroadcastd ymm0, xmm0
vpaddd ymm0, ymm2, ymm0
vpblendd ymm0, ymm0, ymm2, 15
vpaddd ymm0, ymm0, ymm1
ret
Overall the same number of instructions, just less expensive in uops/latency I suppose.
The benchmark using avx2_calc_offsets_2 now runs in 2.7 seconds, which is around 63% faster than the previous version.
Update 1: GCC's inlining of avx2_calc_offsets_2 into the benchmark loop further explains the increased performance. As Peter predicts, the vmovd / vpbroadcastd instructions corresponding to _mm256_set1_epi32 (base) are indeed hoisted out into a single load outside of the loop.
Loop assembly:
...
// loop setup
vmovdqa ymm2, YMMWORD PTR .LC5[rip] // hoisted load of broadcasted base
vmovdqa ymm0, YMMWORD PTR [rbp-176]
vmovdqa ymm1, YMMWORD PTR [rbp-144]
mov eax, 1000000000
jmp .L10
.L17: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpsrldq xmm1, xmm0, 12
vpslldq ymm0, ymm0, 4
vpbroadcastd ymm1, xmm1
vpaddd ymm1, ymm1, ymm2
vpblendd ymm1, ymm1, ymm2, 15
.L10: // loop entry
vpaddd ymm0, ymm1, ymm0
sub rax, 1
jne .L17
...
.LC5: // broadcasted `base`
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
.long 900000000
Update 2:
Focusing on the inlining case and replacing the _mm256_blend_epi32 / vpblendd with an __m128i insertion into the high lane of a zeroed __m256i, then adding to the final vector yields further performance and code-size improvements (thanks Peter).
__m256i avx2_calc_offsets_3 (const u32 base, const __m256i counts)
{
const __m256i z = _mm256_setzero_si256 ();
const __m256i b = _mm256_set1_epi32 (base);
__m256i v, t;
__m128i lo;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// capture the max total in low-lane and broadcast into high-lane
lo = _mm_shuffle_epi32 (_mm256_castsi256_si128 (v), _MM_SHUFFLE (3, 3, 3, 3));
t = _mm256_inserti128_si256 (z, lo, 1);
// shift totals, add base and low-lane max
v = _mm256_slli_si256 (v, 4);
v = _mm256_add_epi32 (v, b);
v = _mm256_add_epi32 (v, t);
return v;
}
Godbolt link
The assembly for the inlined version in the loop now looks like:
// compiled with GCC version 10.3: gcc -O2 -mavx2 ...
// loop setup
vmovdqa ymm2, YMMWORD PTR .LC5[rip] // load broadcasted base
vmovdqa ymm0, YMMWORD PTR [rbp-176]
vmovdqa ymm1, YMMWORD PTR [rbp-144]
mov eax, 1000000000
vpxor xmm3, xmm3, xmm3
jmp .L12
.L20: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpshufd xmm1, xmm0, 255
vpslldq ymm0, ymm0, 4
vinserti128 ymm1, ymm3, xmm1, 0x1
.L12: // loop entry
vpaddd ymm0, ymm0, ymm1
vpaddd ymm0, ymm0, ymm2
sub rax, 1
jne .L20
The loop body is down to only 9 vector instructions :).
There's an optimization bug in GCC when using -O3 where an extraneous vmovdqa ymm0, ymm1 is inserted at the end of the loop body, reducing the benchmark performance by a couple of percent. (At least for GCC versions 11.x, 10.x, and 9.x).
Update 3: Another slight performance gain. If we add in the low-lane's max total using an SSE/128-bit instruction before the 128-bit insertion, we shorten the critical path for v, allowing better use of the shuffle port.
__m256i avx2_calc_offsets_4 (const u32 base, const __m256i counts)
{
const __m256i b = _mm256_set1_epi32 (base);
__m256i v, t;
__m128i lo;
v = counts;
// accumulate running totals within 128-bit sub-lanes
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 4));
v = _mm256_add_epi32 (v, _mm256_slli_si256 (v, 8));
// capture the max total in low-lane, broadcast into high-lane and add to base
lo = _mm_shuffle_epi32 (_mm256_castsi256_si128 (v), _MM_SHUFFLE (3, 3, 3, 3));
lo = _mm_add_epi32 (_mm256_castsi256_si128 (b), lo);
t = _mm256_inserti128_si256 (b, lo, 1);
// shift totals, add base and low-lane max
v = _mm256_slli_si256 (v, 4);
v = _mm256_add_epi32 (v, t);
return v;
}
Godbolt link
.L23: // loop body
vpslldq ymm1, ymm0, 4
vpaddd ymm0, ymm0, ymm1
vpslldq ymm1, ymm0, 8
vpaddd ymm0, ymm0, ymm1
vpshufd xmm2, xmm0, 255
vpslldq ymm1, ymm0, 4
.L14: // loop entry
vpaddd xmm0, xmm2, xmm3
vinserti128 ymm0, ymm4, xmm0, 0x1
vpaddd ymm0, ymm1, ymm0
sub rax, 1
jne .L23
This looks fairly optimal to my (non-expert) eye, at least for early AVX2 chips. Benchmark time is brought down to ~2.17 seconds.
Strangely, if I reduce the size of the source code by deleting one of the previous function definitions, GCC 10 and 11 go a bit haywire and insert three (!) additional vmovdqa instructions into the loop (Godbolt). The result is a slowdown of ~18% in my benchmark. GCC 9.x seems unaffected. I'm not sure what's going on here, but it seems like a pretty nasty bug in GCC's optimizer. I'll try to reduce it and file a bug.
The benchmark using avx2_calc_offsets_3 now runs at effectively the same speed as the scalar version, which is a win in my case since it removes the need to jump to scalar code for performance reasons.
I have implemented programs to compare the AVX2 vpsadbw instruction with the SSE2 psadbw instruction. The following code is the SSE2 version:
#define MAX1 4096
#define MAX2 MAX1
#define MAX3 MAX1
#define NUM_LOOP 1000000000
double pTime = 0, mTime = 5;
// global data for sequential matrix operations
unsigned char a_char[MAX1][MAX2] __attribute__(( aligned(16)));
unsigned char b_char[MAX2][MAX3] __attribute__(( aligned(16)));
unsigned char c_char[MAX1][MAX3] __attribute__(( aligned(16)));
unsigned short int temp[8] __attribute__(( aligned(16)));
int main()
{
int i, j, w=0, sad=0;
struct timespec tStart, tEnd;
double tTotal , tBest=10000;
__m128i vec1, vec2, vecT, sad_total;
sad_total= _mm_setzero_si128();
do{
clock_gettime(CLOCK_MONOTONIC,&tStart);
for(i=0; i<MAX1; i++){
for(j=0; j<MAX2; j+=16){
vec1 = _mm_load_si128((__m128i *)&a_char[i][j]);
vec2 = _mm_load_si128((__m128i *)&b_char[i][j]);
vecT = _mm_sad_epu8( vec1 , vec2);
sad_total = _mm_add_epi64(vecT, sad_total);
}
}
_mm_store_si128((__m128i *)&temp[0], sad_total);
sad=temp[0]+temp[2]+temp[4]+temp[6];
clock_gettime(CLOCK_MONOTONIC,&tEnd);
tTotal = (tEnd.tv_sec - tStart.tv_sec);
tTotal += (tEnd.tv_nsec - tStart.tv_nsec) / 1000000000.0;
if(tTotal<tBest)
tBest=tTotal;
pTime += tTotal;
} while(w++ < NUM_LOOP && pTime < mTime);
printf(" The best time: %lf sec in %d repetition for %dX result is %d matrix\n",tBest,w, MAX1, sad);
return 0;
}
I use GCC on Skylake, under Linux Mint.
When I generate the assembly code, the SSE2 inner loop contains some unwanted move operations:
.L26:
vmovdqa xmm1, XMMWORD PTR a_char[rcx+rax]
vpsadbw xmm1, xmm1, XMMWORD PTR b_char[rcx+rax]
add rax, 16
vpaddq xmm3, xmm1, XMMWORD PTR [rsp]
cmp rax, 4096
vmovaps XMMWORD PTR [rsp], xmm3
jne .L26
while the AVX2 version generates this assembly code:
.L26:
vmovdqa ymm1, YMMWORD PTR a_char[rcx+rax]
vpsadbw ymm1, ymm1, YMMWORD PTR b_char[rcx+rax]
add rax, 32
vpaddq ymm2, ymm2, ymm1
cmp rax, 4096
jne .L26
I don't know the reason for those two extra move instructions, which hurt performance significantly.
The reason is this:
_mm_store_si128((__m128i *)&temp[0], sad_total);
Clang doesn't mind and makes nice code regardless, but GCC didn't like it (failed heuristics, perhaps?).
With that replaced by something that doesn't trigger the "this should be on the stack all the time" heuristic, GCC makes nicer code, for example (not tested):
__m128i sad_total = _mm_setzero_si128();
for(i = 0; i < MAX1; i++) {
for(j = 0; j < MAX2; j += 16) {
__m128i vec1 = _mm_load_si128((__m128i *)&a_char[i][j]);
__m128i vec2 = _mm_load_si128((__m128i *)&b_char[i][j]);
__m128i vecT = _mm_sad_epu8( vec1 , vec2);
sad_total = _mm_add_epi64(sad_total, vecT);
}
}
__m128i hsum = _mm_add_epi64(sad_total, _mm_bsrli_si128(sad_total, 8));
sad = _mm_cvtsi128_si32(hsum);
The inner loop now looks like
.L2:
vmovdqa xmm1, XMMWORD PTR a_char[rdx+rax]
vpsadbw xmm1, xmm1, XMMWORD PTR b_char[rdx+rax]
add rax, 16
vpaddq xmm2, xmm1, xmm2
cmp rax, 4096
jne .L2
You're directly bypassing the compiler and telling it to use movdqa via _mm_load_si128, so it's doing exactly what you're telling it to do. What is the problem here? I also noticed that you're aligning to a 16-byte boundary. Feel free to correct me if I'm wrong (I'm not sure how the attribute is implemented on your compiler), but you may get padding as a result so that each element is aligned on a 16-byte boundary; if so, this will affect the impact of your unrolling.
I have to extract the non-zero values from an __m128i register.
For example I have a vector with eight unsigned shorts.
__m128i vector = _mm_setr_epi16(40, 0, 22, 0, 0, 0, 0, 8);
I want to extract the 40, 22 and 8 with a minimal amount of SSE instructions.
The non-zero values will then be stored in an array:
{40, 22, 8, more values from different vectors ... }
Is it possible to shuffle them or is there a good intrinsic to extract and store?
If you look at this paper, the authors describe how to use the _mm_cmpestrm instruction to do basically what you want. The core of their algorithm is this (which I've modified slightly to do what you want, instead of what they want):
__m128i res_v = _mm_cmpestrm(
vector,
8,
_mm_setzero_si128(),
8,
_SIDD_UWORD_OPS|_SIDD_CMP_EQUAL_ANY|_SIDD_BIT_MASK|_SIDD_NEGATIVE_POLARITY);
int r = _mm_extract_epi32(res_v, 0);
__m128i p = _mm_shuffle_epi8(vector, sh_mask[r]);
If you build the look-up table sh_mask as described in the paper, then p should have the non-zero elements (without any re-ordering) followed by the zero elements. The number of bits set in r will tell you the number of non-zero elements.
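The paper itself isn't reproduced here, so as a rough sketch (mine, not the paper's) of how such a sh_mask table could be generated, assuming bit i of r is set exactly when word i of the vector is non-zero (double-check the polarity flags against the paper):
#include <emmintrin.h>   // _mm_loadu_si128
static __m128i sh_mask[256];
static void build_sh_mask(void) {
    for (int r = 0; r < 256; r++) {
        char m[16];
        int out = 0;
        for (int i = 0; i < 8; i++) {
            if (r & (1 << i)) {                 // word i is non-zero: keep it
                m[2*out]     = (char)(2*i);
                m[2*out + 1] = (char)(2*i + 1);
                out++;
            }
        }
        for (int k = 2*out; k < 16; k++)
            m[k] = (char)0x80;                  // 0x80 makes pshufb write a zero byte
        sh_mask[r] = _mm_loadu_si128((const __m128i *)m);
    }
}
Each entry packs the surviving words to the front in their original order, which matches the "non-zero elements followed by zero elements" behaviour described above.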
_mm_cmpestrm requires SSE4.2, unfortunately.
Based on anjruu's answer, here's an SSSE3 version that has not been tested in any way:
; xmm0 = input
pxor xmm1, xmm1
pcmpeqb xmm1, xmm0
pmovmskb eax, xmm1
shl eax, 4
pshufb xmm0, [table + eax]
The table is different, of course, but not that hard to work out; just keep in mind that the index is "inverted": e.g. index 0 corresponds to having no zeros, and 0xFFFF corresponds to all zeros.
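For what it's worth, here's an untested intrinsics rendering of the same idea. compress_nonzero_u16 and table are my own names; table stands for the hypothetical LUT of 65536 precomputed 16-byte shuffle masks (64 Ki entries x 16 bytes = 1 MiB), indexed the same "inverted" way as above.
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8
extern const __m128i table[65536];   // hypothetical LUT, built offline
static inline __m128i compress_nonzero_u16(__m128i v) {
    __m128i eq0  = _mm_cmpeq_epi8(v, _mm_setzero_si128());   // 0xFF for every zero byte
    int     mask = _mm_movemask_epi8(eq0);                    // 16-bit table index
    return _mm_shuffle_epi8(v, table[mask]);                  // pack the non-zero words to the front
}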
Quick Summary:
I have an array of 24-bit values. Any suggestions on how to quickly expand the individual 24-bit array elements into 32-bit elements?
Details:
I'm processing incoming video frames in realtime using Pixel Shaders in DirectX 10. A stumbling block is that my frames are coming in from the capture hardware with 24-bit pixels (either as YUV or RGB images), but DX10 takes 32-bit pixel textures. So, I have to expand the 24-bit values to 32-bits before I can load them into the GPU.
I really don't care what I set the remaining 8 bits to, or where the incoming 24-bits are in that 32-bit value - I can fix all that in a pixel shader. But I need to do the conversion from 24-bit to 32-bit really quickly.
I'm not terribly familiar with SIMD SSE operations, but from my cursory glance it doesn't look like I can do the expansion using them, given my reads and writes aren't the same size. Any suggestions? Or am I stuck sequentially massaging this data set?
This feels so very silly - I'm using the pixel shaders for parallelism, but I have to do a sequential per-pixel operation before that. I must be missing something obvious...
The code below should be pretty fast. It copies 4 pixels in each iteration, using only 32-bit read/write instructions. The source and destination pointers should be aligned to 32 bits.
uint32_t *src = ...;
uint32_t *dst = ...;
for (int i=0; i<num_pixels; i+=4) {
uint32_t sa = src[0];
uint32_t sb = src[1];
uint32_t sc = src[2];
dst[i+0] = sa;
dst[i+1] = (sa>>24) | (sb<<8);
dst[i+2] = (sb>>16) | (sc<<16);
dst[i+3] = sc>>8;
src += 3;
}
Edit:
Here is a way to do this using the SSSE3 instructions PSHUFB and PALIGNR. The code is written using compiler intrinsics, but it shouldn't be hard to translate to assembly if needed. It copies 16 pixels in each iteration. The source and destination pointers must be aligned to 16 bytes, or it will fault. If they aren't aligned, you can make it work by replacing _mm_load_si128 with _mm_loadu_si128 and _mm_store_si128 with _mm_storeu_si128, but this will be slower.
#include <emmintrin.h>
#include <tmmintrin.h>
__m128i *src = ...;
__m128i *dst = ...;
__m128i mask = _mm_setr_epi8(0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1);
for (int i=0; i<num_pixels; i+=16) {
__m128i sa = _mm_load_si128(src);
__m128i sb = _mm_load_si128(src+1);
__m128i sc = _mm_load_si128(src+2);
__m128i val = _mm_shuffle_epi8(sa, mask);
_mm_store_si128(dst, val);
val = _mm_shuffle_epi8(_mm_alignr_epi8(sb, sa, 12), mask);
_mm_store_si128(dst+1, val);
val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sb, 8), mask);
_mm_store_si128(dst+2, val);
val = _mm_shuffle_epi8(_mm_alignr_epi8(sc, sc, 4), mask);
_mm_store_si128(dst+3, val);
src += 3;
dst += 4;
}
SSSE3 (not to be confused with SSE3) will require a relatively new processor: Core 2 or newer, and I believe AMD doesn't support it yet. Performing this with SSE2 instructions only will take a lot more operations, and may not be worth it.
SSSE3 is awesome, but for those who can't use it for whatever reason, here's the conversion in x86 assembler, hand-optimized by yours truly. For completeness, I give the conversion in both directions: RGB32->RGB24 and RGB24->RGB32.
Note that interjay's C code leaves trash in the MSB (the alpha channel) of the destination pixels. This might not matter in some applications, but it matters in mine, hence my RGB24->RGB32 code forces the MSB to zero. Similarly, my RGB32->RGB24 code ignores the MSB; this avoids garbage output if the source data has a non-zero alpha channel. These features cost almost nothing in terms of performance, as verified by benchmarks.
For RGB32->RGB24 I was able to beat the VC++ optimizer by about 20%. For RGB24->RGB32 the gain was insignificant. Benchmarking was done on an i5 2500K. I omit the benchmarking code here, but if anyone wants it I'll provide it. The most important optimization was bumping the source pointer as soon as possible (see the ASAP comment). My best guess is that this increases parallelism by allowing the instruction pipeline to prefetch sooner. Other than that I just reordered some instructions to reduce dependencies and overlap memory accesses with bit-bashing.
void ConvRGB32ToRGB24(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
for (UINT i = 0; i < Pixels; i += 4) {
UINT sa = Src[i + 0] & 0xffffff;
UINT sb = Src[i + 1] & 0xffffff;
UINT sc = Src[i + 2] & 0xffffff;
UINT sd = Src[i + 3];
Dst[0] = sa | (sb << 24);
Dst[1] = (sb >> 8) | (sc << 16);
Dst[2] = (sc >> 16) | (sd << 8);
Dst += 3;
}
#else
__asm {
mov ecx, Pixels
shr ecx, 2 // 4 pixels at once
jz ConvRGB32ToRGB24_$2
mov esi, Src
mov edi, Dst
ConvRGB32ToRGB24_$1:
mov ebx, [esi + 4] // sb
and ebx, 0ffffffh // sb & 0xffffff
mov eax, [esi + 0] // sa
and eax, 0ffffffh // sa & 0xffffff
mov edx, ebx // copy sb
shl ebx, 24 // sb << 24
or eax, ebx // sa | (sb << 24)
mov [edi + 0], eax // Dst[0]
shr edx, 8 // sb >> 8
mov eax, [esi + 8] // sc
and eax, 0ffffffh // sc & 0xffffff
mov ebx, eax // copy sc
shl eax, 16 // sc << 16
or eax, edx // (sb >> 8) | (sc << 16)
mov [edi + 4], eax // Dst[1]
shr ebx, 16 // sc >> 16
mov eax, [esi + 12] // sd
add esi, 16 // Src += 4 (ASAP)
shl eax, 8 // sd << 8
or eax, ebx // (sc >> 16) | (sd << 8)
mov [edi + 8], eax // Dst[2]
add edi, 12 // Dst += 3
dec ecx
jnz SHORT ConvRGB32ToRGB24_$1
ConvRGB32ToRGB24_$2:
}
#endif
}
void ConvRGB24ToRGB32(const UINT *Src, UINT *Dst, UINT Pixels)
{
#if !USE_ASM
for (UINT i = 0; i < Pixels; i += 4) {
UINT sa = Src[0];
UINT sb = Src[1];
UINT sc = Src[2];
Dst[i + 0] = sa & 0xffffff;
Dst[i + 1] = ((sa >> 24) | (sb << 8)) & 0xffffff;
Dst[i + 2] = ((sb >> 16) | (sc << 16)) & 0xffffff;
Dst[i + 3] = sc >> 8;
Src += 3;
}
#else
__asm {
mov ecx, Pixels
shr ecx, 2 // 4 pixels at once
jz SHORT ConvRGB24ToRGB32_$2
mov esi, Src
mov edi, Dst
push ebp
ConvRGB24ToRGB32_$1:
mov ebx, [esi + 4] // sb
mov edx, ebx // copy sb
mov eax, [esi + 0] // sa
mov ebp, eax // copy sa
and ebx, 0ffffh // sb & 0xffff
shl ebx, 8 // (sb & 0xffff) << 8
and eax, 0ffffffh // sa & 0xffffff
mov [edi + 0], eax // Dst[0]
shr ebp, 24 // sa >> 24
or ebx, ebp // (sa >> 24) | ((sb & 0xffff) << 8)
mov [edi + 4], ebx // Dst[1]
shr edx, 16 // sb >> 16
mov eax, [esi + 8] // sc
add esi, 12 // Src += 12 (ASAP)
mov ebx, eax // copy sc
and eax, 0ffh // sc & 0xff
shl eax, 16 // (sc & 0xff) << 16
or eax, edx // (sb >> 16) | ((sc & 0xff) << 16)
mov [edi + 8], eax // Dst[2]
shr ebx, 8 // sc >> 8
mov [edi + 12], ebx // Dst[3]
add edi, 16 // Dst += 16
dec ecx
jnz SHORT ConvRGB24ToRGB32_$1
pop ebp
ConvRGB24ToRGB32_$2:
}
#endif
}
And while we're at it, here are the same conversions in actual SSSE3 assembly. This only works if you have an assembler (FASM is free) and have a CPU that supports SSSE3 (likely, but it's better to check). Note that the intrinsics don't necessarily output something this efficient; it totally depends on the tools you use and what platform you're compiling for. Here, it's straightforward: what you see is what you get. This code generates the same output as the x86 code above, and it's about 1.5x faster (on an i5 2500K).
format MS COFF
section '.text' code readable executable
public _ConvRGB32ToRGB24SSE3
; ebp + 8 Src (*RGB32, 16-byte aligned)
; ebp + 12 Dst (*RGB24, 16-byte aligned)
; ebp + 16 Pixels
_ConvRGB32ToRGB24SSE3:
push ebp
mov ebp, esp
mov eax, [ebp + 8]
mov edx, [ebp + 12]
mov ecx, [ebp + 16]
shr ecx, 4
jz done1
movupd xmm7, [mask1]
top1:
movupd xmm0, [eax + 0] ; sa = Src[0]
pshufb xmm0, xmm7 ; sa = _mm_shuffle_epi8(sa, mask)
movupd xmm1, [eax + 16] ; sb = Src[1]
pshufb xmm1, xmm7 ; sb = _mm_shuffle_epi8(sb, mask)
movupd xmm2, xmm1 ; sb1 = sb
pslldq xmm1, 12 ; sb = _mm_slli_si128(sb, 12)
por xmm0, xmm1 ; sa = _mm_or_si128(sa, sb)
movupd [edx + 0], xmm0 ; Dst[0] = sa
psrldq xmm2, 4 ; sb1 = _mm_srli_si128(sb1, 4)
movupd xmm0, [eax + 32] ; sc = Src[2]
pshufb xmm0, xmm7 ; sc = _mm_shuffle_epi8(sc, mask)
movupd xmm1, xmm0 ; sc1 = sc
pslldq xmm0, 8 ; sc = _mm_slli_si128(sc, 8)
por xmm0, xmm2 ; sc = _mm_or_si128(sb1, sc)
movupd [edx + 16], xmm0 ; Dst[1] = sc
psrldq xmm1, 8 ; sc1 = _mm_srli_si128(sc1, 8)
movupd xmm0, [eax + 48] ; sd = Src[3]
pshufb xmm0, xmm7 ; sd = _mm_shuffle_epi8(sd, mask)
pslldq xmm0, 4 ; sd = _mm_slli_si128(sd, 4)
por xmm0, xmm1 ; sd = _mm_or_si128(sc1, sd)
movupd [edx + 32], xmm0 ; Dst[2] = sd
add eax, 64
add edx, 48
dec ecx
jnz top1
done1:
pop ebp
ret
public _ConvRGB24ToRGB32SSE3
; ebp + 8 Src (*RGB24, 16-byte aligned)
; ebp + 12 Dst (*RGB32, 16-byte aligned)
; ebp + 16 Pixels
_ConvRGB24ToRGB32SSE3:
push ebp
mov ebp, esp
mov eax, [ebp + 8]
mov edx, [ebp + 12]
mov ecx, [ebp + 16]
shr ecx, 4
jz done2
movupd xmm7, [mask2]
top2:
movupd xmm0, [eax + 0] ; sa = Src[0]
movupd xmm1, [eax + 16] ; sb = Src[1]
movupd xmm2, [eax + 32] ; sc = Src[2]
movupd xmm3, xmm0 ; sa1 = sa
pshufb xmm0, xmm7 ; sa = _mm_shuffle_epi8(sa, mask)
movupd [edx], xmm0 ; Dst[0] = sa
movupd xmm4, xmm1 ; sb1 = sb
palignr xmm1, xmm3, 12 ; sb = _mm_alignr_epi8(sb, sa1, 12)
pshufb xmm1, xmm7 ; sb = _mm_shuffle_epi8(sb, mask);
movupd [edx + 16], xmm1 ; Dst[1] = sb
movupd xmm3, xmm2 ; sc1 = sc
palignr xmm2, xmm4, 8 ; sc = _mm_alignr_epi8(sc, sb1, 8)
pshufb xmm2, xmm7 ; sc = _mm_shuffle_epi8(sc, mask)
movupd [edx + 32], xmm2 ; Dst[2] = sc
palignr xmm3, xmm3, 4 ; sc1 = _mm_alignr_epi8(sc1, sc1, 4)
pshufb xmm3, xmm7 ; sc1 = _mm_shuffle_epi8(sc1, mask)
movupd [edx + 48], xmm3 ; Dst[3] = sc1
add eax, 48
add edx, 64
dec ecx
jnz top2
done2:
pop ebp
ret
section '.data' data readable writeable align 16
label mask1 dqword
db 0,1,2,4, 5,6,8,9, 10,12,13,14, -1,-1,-1,-1
label mask2 dqword
db 0,1,2,-1, 3,4,5,-1, 6,7,8,-1, 9,10,11,-1
The different input/output sizes are not a barrier to using SIMD, just a speed bump. You would need to chunk the data so that you read and write in full SIMD words (16 bytes).
In this case, you would read 3 SIMD words (48 bytes == 16 RGB pixels), do the expansion, then write 4 SIMD words.
I'm just saying you can use SIMD; I'm not saying you should. The middle bit, the expansion, is still tricky since you have non-uniform shift sizes in different parts of the word.
SSE 4.1 .ASM:
PINSRD XMM0, DWORD PTR[ESI], 0    ; pixel 0 (low 3 bytes) plus one stray byte
PINSRD XMM0, DWORD PTR[ESI+3], 1  ; pixel 1
PINSRD XMM0, DWORD PTR[ESI+6], 2  ; pixel 2
PINSRD XMM0, DWORD PTR[ESI+9], 3  ; pixel 3 (reads 3 bytes past the last pixel)
PSLLD XMM0, 8                     ; shift out the stray high byte...
PSRLD XMM0, 8                     ; ...and shift back, zero-filling it
MOVNTDQ [EDI], XMM0               ; non-temporal store of 4 RGB32 pixels
add ESI, 12
add EDI, 16
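For anyone who prefers intrinsics, here is a rough, untested C equivalent of that SSE4.1 fragment (expand4 is my own name; like the assembly, it reads 3 bytes past the last source pixel and assumes dst is 16-byte aligned for the non-temporal store):
#include <stdint.h>
#include <smmintrin.h>   // SSE4.1: _mm_insert_epi32
static inline void expand4(const uint8_t *src, uint8_t *dst) {
    __m128i v = _mm_cvtsi32_si128(*(const int *)(src + 0));  // pixel 0
    v = _mm_insert_epi32(v, *(const int *)(src + 3), 1);     // pixel 1
    v = _mm_insert_epi32(v, *(const int *)(src + 6), 2);     // pixel 2
    v = _mm_insert_epi32(v, *(const int *)(src + 9), 3);     // pixel 3
    v = _mm_srli_epi32(_mm_slli_epi32(v, 8), 8);             // zero the high byte of each dword
    _mm_stream_si128((__m128i *)dst, v);                     // MOVNTDQ store of 4 RGB32 pixels
}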