I need to compare two buffers chunk-wise for equality. I don't need any ordering information about the two buffers, just whether each pair of chunks is equal or not. My Intel machine supports up to SSE4.2.
The naive approach is:
const size_t CHUNK_SIZE = 16; //128bit for SSE2 integer registers
const int ARRAY_SIZE = 200000000;
char* array_1 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
char* array_2 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
for (size_t i = 0; i < ARRAY_SIZE; )
{
volatile bool result = memcmp(array_1+i, array_2+i, CHUNK_SIZE);
i += CHUNK_SIZE;
}
Compared to my first try using SSE ever:
union U
{
__m128i m;
volatile int i[4];
} res;
for (size_t i = 0; i < ARRAY_SIZE; )
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
res.m = _mm_cmpeq_epi32(*pa1, *pa2);
volatile bool result = ( (res.i[0]==0) || (res.i[1]==0) || (res.i[2]==0) || (res.i[3]==0) );
i += CHUNK_SIZE;
}
The gain in speed is about 33%. Could I do any better?
You really shouldn't be using scalar code and unions to test all the individual vector elements - do something like this instead:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
const __m128i a1 = _mm_load_si128((const __m128i *)(array_1 + i));
const __m128i a2 = _mm_load_si128((const __m128i *)(array_2 + i));
const __m128i vcmp = _mm_cmpeq_epi32(a1, a2);
const int vmask = _mm_movemask_epi8(vcmp);
const bool result = (vmask == 0xffff);
// you probably want to break here if you get a mismatch ???
}
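If all you need is a single equal/not-equal answer for the whole buffers, you can wrap this up with an early exit. A minimal sketch (the function name and the byte-wise compare are my own choices; the mask test is the same idea as above, and it assumes the size is a multiple of 16 and the buffers are 16-byte aligned):

#include <emmintrin.h> // SSE2
#include <stdbool.h>
#include <stddef.h>

static bool buffers_equal_sse2(const char *a, const char *b, size_t size)
{
    for (size_t i = 0; i < size; i += 16)
    {
        const __m128i va = _mm_load_si128((const __m128i *)(a + i));
        const __m128i vb = _mm_load_si128((const __m128i *)(b + i));
        if (_mm_movemask_epi8(_mm_cmpeq_epi8(va, vb)) != 0xffff)
            return false; // at least one byte in this chunk differs
    }
    return true;
}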
Since you can use SSE 4.1, there is another alternative that might be faster:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
__m128i temp = _mm_xor_si128(*pa1, *pa2);
bool result = (bool)_mm_testz_si128(temp, temp);
}
_mm_testz_si128(a, b) returns 0 if a & b != 0 and it returns 1 if a & b == 0. The advantage is that you can use this version with the new AVX instructions as well, where the chunk size is 32 bytes.
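For reference, a 32-byte-chunk sketch along those lines; note that while _mm256_testz_si256 itself only needs AVX, the 256-bit integer XOR needs AVX2, so this assumes an AVX2-capable CPU and build:

#include <immintrin.h> // AVX2

for (size_t i = 0; i < ARRAY_SIZE; i += 32)
{
    // unaligned loads; use _mm256_load_si256 instead if the buffers are 32-byte aligned
    const __m256i a1 = _mm256_loadu_si256((const __m256i *)(array_1 + i));
    const __m256i a2 = _mm256_loadu_si256((const __m256i *)(array_2 + i));
    const __m256i diff = _mm256_xor_si256(a1, a2);
    const bool result = _mm256_testz_si256(diff, diff) != 0; // true if the two 32-byte chunks are equal
}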
I would like to DSP-optimize a simple multiply-accumulate for-loop for the QC Hexagon. From the manual, it's not perfectly clear to me how to do that, both for the vector version and the non-vector version.
Assume my loop has a length which is a multiple of 4 (e.g., 64), i.e., I want to unroll the loop by a factor of 4. How would I do that? I can use either C intrinsics or asm code, but I don't understand how to do the 4x memory load in the first place.
Here is what my loop might look like in C:
Word32 sum = 0;
Word16 *pointer1; Word16 *pointer2;
for (i=0; i<64; i++)
{
sum += pointer1[i] * pointer2[i];
}
Any suggestions?
Here is a FIR filter implementation that demonstrates how to use Q6_P_vrmpyhacc_PP, the multiply halfword/accumulate. This instruction is described as 'big mac' in the PRM 😉
This instruction is in the scalar core so it does not require the HVX vector coprocessor.
void FIR08(short_8B_align Input[],
short_8B_align Coeff[],
short_8B_align Output[],
int unused, int ntaps,
int nsamples)
{
Word64 * vInput = (Word64*)Input;
Word64 * vCoeff = (Word64*)Coeff;
Word64 *__restrict vOutput = (Word64*)Output;
int i, j;
Word64 sum0, sum1, sum2, sum3;
for (i = 0; i < nsamples/4; i++)
{
sum0 = sum1 = sum2 = sum3 = 0;
for (j = 0; j < ntaps/4; j++)
{
Word64 vIn1 = vInput[i+j];
Word64 vIn2 = vInput[i+j+1];
Word64 curCoeff = vCoeff[j];
Word64 curIn;
curIn = vIn1;
sum0 = Q6_P_vrmpyhacc_PP(sum0, curIn, curCoeff); // four halfword products accumulated into sum0
curIn = Q6_P_valignb_PPI(vIn2, vIn1, 2); // input window shifted by one halfword (2 bytes)
sum1 = Q6_P_vrmpyhacc_PP(sum1, curIn, curCoeff);
curIn = Q6_P_valignb_PPI(vIn2, vIn1, 4); // shifted by two halfwords
sum2 = Q6_P_vrmpyhacc_PP(sum2, curIn, curCoeff);
curIn = Q6_P_valignb_PPI(vIn2, vIn1, 6); // shifted by three halfwords
sum3 = Q6_P_vrmpyhacc_PP(sum3, curIn, curCoeff);
}
Word64 curOut = Q6_P_combine_RR(Q6_R_combine_RhRh(sum3, sum2), Q6_R_combine_RhRh(sum1, sum0));
vOutput[i + 1] = Q6_P_vasrh_PI(curOut, 2);
}
}
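For the plain multiply-accumulate loop in the question, a minimal sketch using the same scalar instruction could look like this. It assumes the usual hexagon_types.h/hexagon_protos.h headers, 8-byte-aligned input arrays, and a fixed length of 64 samples; it is untested and only illustrates the 4x unrolling via 64-bit loads:

#include <hexagon_protos.h>
#include <hexagon_types.h>

Word32 mac64(const Word16 *pointer1, const Word16 *pointer2)
{
    // each Word64 load brings in four Word16 samples
    const Word64 *v1 = (const Word64 *)pointer1;
    const Word64 *v2 = (const Word64 *)pointer2;
    Word64 sum = 0;
    for (int i = 0; i < 64 / 4; i++)
    {
        // vrmpyh: multiply the four halfword pairs and accumulate all four products into sum
        sum = Q6_P_vrmpyhacc_PP(sum, v1[i], v2[i]);
    }
    return (Word32)sum;
}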
I am currently playing with ARM NEON and have written the following functions, one in C and one with NEON intrinsics, to compare their speeds. The functions compare two arrays. The parameter cb is the number of bytes divided by 8:
inline uint32_t is_not_zero(uint32x4_t v)
{
uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v));
return vget_lane_u32(vpmax_u32(tmp, tmp), 0);
}
uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
const uint32_t *s1_cmp = (uint32_t *)s1;
const uint32_t *s2_cmp = (uint32_t *)s2;
cb *= 2;
while (cb--)
{
uint32x4x2_t cmp1 = vld2q_u32(s1_cmp);
uint32x4x2_t cmp2 = vld2q_u32(s2_cmp);
uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]);
uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]);
if (!is_not_zero(res1)) return 1;
if (!is_not_zero(res2)) return 1;
s1_cmp += 8;
s2_cmp += 8;
}
return 0;
}
uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
const uint64_t *p1 = (uint64_t *)s1;
const uint64_t *p2 = (uint64_t *)s2;
uint32_t n = 0;
while (cb--) {
if ((p1[n ] != p2[n ]) ||
(p1[n+1] != p2[n+1]) ||
(p1[n+2] != p2[n+2]) ||
(p1[n+3] != p2[n+3])) return 1;
++n;
}
return 0;
}
I don't understand why the C implementation is WAY faster than the NEON variant. The code is compiled on a Raspberry Pi using
-O3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard as CFLAGS.
#include <stdio.h>
#include <stdlib.h>
#include <intrin.h> // MSVC; provides the SSE intrinsics and the .m128i_i8 accessor used below
#define Size 50000
void main()
{
unsigned char *arry1 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned char *arry2 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned int *result = (unsigned int*)malloc(sizeof(unsigned int)* Size);
for (int i = 0; i < 16; i++)
{
arry1[i] = i;
arry2[i] = i;
}
__m128i Z = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
//__m128i dummy = _mm_setzero_si128();
for (int j = 0; j < 16; j += 16)
{
//printf("%d\n\n", j);
__m128i test1 = _mm_setzero_si128();
test1 = _mm_loadu_si128((__m128i*)&arry1[j]);
__m128i test2 = _mm_setzero_si128();
test2 = _mm_loadu_si128((__m128i*)&arry2[j]);
__m128i s16L = _mm_unpacklo_epi8(test1, Z);
__m128i s16H = _mm_unpackhi_epi8(test1, Z);
__m128i s32LL = _mm_unpacklo_epi16(s16L, Z);
__m128i s32LH = _mm_unpackhi_epi16(s16L, Z);
__m128i s32HL = _mm_unpacklo_epi16(s16H, Z);
__m128i s32HH = _mm_unpackhi_epi16(s16H, Z);
__m128i t16L = _mm_unpacklo_epi8(test2, Z);
__m128i t16H = _mm_unpackhi_epi8(test2, Z);
__m128i t32LL = _mm_unpacklo_epi16(t16L, Z);
__m128i t32LH = _mm_unpackhi_epi16(t16L, Z);
__m128i t32HL = _mm_unpacklo_epi16(t16H, Z);
__m128i t32HH = _mm_unpackhi_epi16(t16H, Z);
__m128 s1 = _mm_cvtepi32_ps(s32LL);
__m128 s2 = _mm_cvtepi32_ps(s32LH);
__m128 s3 = _mm_cvtepi32_ps(s32HL);
__m128 s4 = _mm_cvtepi32_ps(s32HH);
__m128 t1 = _mm_cvtepi32_ps(t32LL);
__m128 t2 = _mm_cvtepi32_ps(t32LH);
__m128 t3 = _mm_cvtepi32_ps(t32HL);
__m128 t4 = _mm_cvtepi32_ps(t32HH);
s1 = _mm_mul_ps(s1, t1);
s2 = _mm_mul_ps(s2, t2);
s3 = _mm_mul_ps(s3, t3);
s4 = _mm_mul_ps(s4, t4);
s1 = _mm_hadd_ps(s1, s2);//41,13
s3 = _mm_hadd_ps(s3, s4); //313,221
vsum = _mm_cvtps_epi32(s3);
for (int k = 0; k < 16; k++)
{
printf("%u\n", (unsigned char)vsum.m128i_i8[k]);
}
s1 = _mm_hadd_ps(s1, s3); //734, 14
s1 = _mm_hadd_ps(s1, s1); //1100,140
s1 = _mm_hadd_ps(s1, s1); //1240
}
}
I am computing a dot product using SSE. I'm using the _mm_mul_ps and _mm_hadd_ps instructions, not _mm_dp_ps.
If a value after the _mm_hadd_ps step is over 255, a wrong value is displayed.
For example, the correct value of s3 is {0,0,0,421,0,0,0,313,0,0,0,221,0,0,0,145},
but {0,0,1,165,0,0,1,57,0,0,0,221,0,0,0,145} is printed. Is this because I declared arry1 and arry2 as unsigned char? I know 255 is the maximum value for 8 bits.
I can see some problems here:
1) If you want to calculate the dot product of 50000 uint8_t values, that is OK, but the dot product of 70000 values can overflow a uint32_t. Therefore using uint64_t is the better solution.
2) To calculate the dot product of integer vectors it is not necessary to use floating-point numbers. It is more efficient to use only integers for the calculation.
Here is an example of an SSE2 function which calculates the dot product of two uint8_t vectors:
#include <algorithm>
#include <cstdint>
#include <emmintrin.h>
const __m128i Z = _mm_setzero_si128();
const size_t A = sizeof(__m128i);
const size_t B = 0x10000;
inline __m128i DotProduct32(const uint8_t * a, const uint8_t * b)
{
__m128i _a = _mm_loadu_si128((__m128i*)a);
__m128i _b = _mm_loadu_si128((__m128i*)b);
__m128i lo = _mm_madd_epi16(_mm_unpacklo_epi8(_a, Z), _mm_unpacklo_epi8(_b, Z));
__m128i hi = _mm_madd_epi16(_mm_unpackhi_epi8(_a, Z), _mm_unpackhi_epi8(_b, Z));
return _mm_add_epi32(lo, hi);
}
inline __m128i HorizontalSum32(__m128i a)
{
return _mm_add_epi64(_mm_unpacklo_epi32(a, Z), _mm_unpackhi_epi32(a, Z));
}
inline uint64_t ExtractSum64(__m128i a)
{
uint64_t _a[2];
_mm_storeu_si128((__m128i*)_a, a);
return _a[0] + _a[1];
}
void DotProduct(const uint8_t * a, const uint8_t * b, size_t size, uint64_t * sum)
{
size_t blockNumber = (size + B - 1)/B;
size_t alignedSize = size/A*A;
size_t i = 0;
__m128i sum64 = Z;
for (size_t block = 0; block < blockNumber; ++block)
{
__m128i sum32 = Z;
for (size_t blockEnd = std::min(alignedSize, i + B); i < blockEnd; i += A)
sum32 = _mm_add_epi32(sum32, DotProduct32(a + i, b + i));
sum64 = _mm_add_epi64(sum64, HorizontalSum32(sum32));
}
*sum = ExtractSum64(sum64);
for (; i < size; ++i)
*sum += a[i] * b[i];
}
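_mm_madd_epi16 is safe here because the zero-extended bytes are at most 255, so each pair of products fits comfortably in a signed 32-bit lane. A minimal usage sketch for the function above (the test data is made up purely for illustration):

#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const size_t size = 50000;
    std::vector<uint8_t> a(size), b(size);
    for (size_t i = 0; i < size; ++i)
    {
        a[i] = uint8_t(i);
        b[i] = uint8_t(3 * i);
    }
    uint64_t sum = 0;
    DotProduct(a.data(), b.data(), size, &sum); // defined above
    std::printf("dot product = %llu\n", (unsigned long long)sum);
    return 0;
}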
I tried to reduce the execution time of this function, and I got the execution time down to
Sys:0.001s
Is there any way to reduce the execution time of this function further?
int function(uint32_t *r, const uint32_t *a, const uint32_t *b, int n)
{
int i;
uint32_t ri, c=0;
for (i = 0; i < n; i ++)
{
ri = a[i] + b[i] + c;
c = ((ri < a[i]) || ((ri == a[i]) && c));
r[i] = ri;
}
return ((int) c);
}
I guess you lose most of the time in your conditional expression: most modern CPUs hate branches they can't predict correctly most of the time. Consequently, the branches introduced by most loops are fine, because they are only mispredicted once for the entire loop. Branching on a carry condition, however, will likely result in 50% of the branches being mispredicted, and each misprediction costs 10 to 20 cycles. Even worse, the && and || operators are sequence points, which are a hindrance to the optimizer.
So, I would try to eliminate these conditionals:
int function(uint32_t *r, const uint32_t *a, const uint32_t *b, int n) {
int i;
uint64_t ri, c=0;
for (i = 0; i < n; i ++) {
ri = (uint64_t)a[i] + (uint64_t)b[i] + c;
c = ri >> 32;
r[i] = (uint32_t)ri;
}
return ((int) c);
}
Here, I have used 64-bit arithmetic, since modern CPUs do 64-bit arithmetic just as fast as 32-bit arithmetic. However, if 64-bit arithmetic is slow on your hardware, you can fall back to 32-bit arithmetic:
int function(uint32_t *r, const uint32_t *a, const uint32_t *b, int n) {
int i;
uint32_t ri, c=0;
for (i = 0; i < n; i ++) {
uint32_t curA = a[i], curB = b[i];
uint32_t lowA = curA & 0xffffu, highA = curA >> 16;
uint32_t lowB = curB & 0xffffu, highB = curB >> 16;
uint32_t lowR = lowA + lowB + c;
uint32_t highR = highA + highB + (lowR >> 16);
c = highR >> 16;
r[i] = (highR << 16) + (lowR & 0xffffu);
}
return ((int) c);
}
Even though this looks like a monster, it's only 12 simple operations, each of which should execute with a latency of one cycle on virtually all hardware, i.e. the entire loop body should take fewer than 12 cycles; consequently, the bottleneck should be the memory bus (and you can't avoid that).
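On x86 you can also let the hardware carry flag do the carry propagation via the ADC intrinsic. A sketch, assuming _addcarry_u32 from <immintrin.h> is available on your compiler (recent GCC, Clang and MSVC provide it):

#include <immintrin.h>
#include <stdint.h>

int function_adc(uint32_t *r, const uint32_t *a, const uint32_t *b, int n) {
    unsigned char c = 0;
    for (int i = 0; i < n; i++) {
        unsigned int ri;
        c = _addcarry_u32(c, a[i], b[i], &ri); // ri = a[i] + b[i] + c, c = carry out
        r[i] = ri;
    }
    return ((int) c);
}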
You can get rid of the subscript notation and use pointer arithmetic instead, which is sometimes said to be faster; however, I don't know how much CPU time that would actually save.
int function(uint32_t *r, const uint32_t *a, const uint32_t *b, int n)
{
int i;
uint32_t ri, c=0;
for (i = 0; i < n; i ++)
{
ri = *(a + i) + *(b + i) + c;
c = ((ri < *(a + i)) || ((ri == *(a +i)) && c));
*(r + i) = ri;
}
return ((int) c);
}
For the reasoning, see: Accessing array values via pointer arithmetic vs. subscripting in C
c = (ri < a[i]) + ((ri-a[i])*c) might be faster than your code, which also tests whether c == 0.
I have n (8-bit) character strings, all of the same length (say m), and another string s of the same length. I need to compute the Hamming distance from s to each of the other strings. In plain C, something like:
unsigned char strings[n][m];
unsigned char s[m];
int distances[n];
for(i=0; i<n; i++) {
distances[i] = 0;
for(j=0; j<m; j++) {
if(strings[i][j] != s[j])
distances[i]++;
}
}
I would like to use SIMD instructions with gcc to perform such computations more efficiently. I have read that PCMPISTRI in SSE 4.2 can be useful, and my target computer supports that instruction set, so I would prefer a solution using SSE 4.2.
EDIT:
I wrote the following function to compute the Hamming distance between two strings:
static inline int popcnt128(__m128i n) {
const __m128i n_hi = _mm_unpackhi_epi64(n, n);
return _mm_popcnt_u64(_mm_cvtsi128_si64(n)) + _mm_popcnt_u64(_mm_cvtsi128_si64(n_hi));
}
int HammingDist(const unsigned char *p1, unsigned const char *p2, const int len) {
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
__m128i smm1 = _mm_loadu_si128 ((__m128i*) p1);
__m128i smm2 = _mm_loadu_si128 ((__m128i*) p2);
__m128i ResultMask;
int iters = len / 16;
int diffs = 0;
int i;
for(i=0; i<iters; i++) {
ResultMask = _mm_cmpestrm (smm1,16,smm2,16,MODE);
diffs += popcnt128(ResultMask);
p1 = p1+16;
p2 = p2+16;
smm1 = _mm_loadu_si128 ((__m128i*)p1);
smm2 =_mm_loadu_si128 ((__m128i*)p2);
}
int mod = len % 16;
if(mod>0) {
ResultMask = _mm_cmpestrm (smm1,mod,smm2,mod,MODE);
diffs += popcnt128(ResultMask);
}
return diffs;
}
So I can solve my problem by means of:
for(i=0; i<n; i++) {
distances[i] = HammingDist(s, strings[i], m);
}
Is this the best I can do or can I use the fact that one of the strings compared is always the same? In addition, should I do some alignment on my arrays to improve performance?
ANOTHER ATTEMPT
Following Harold's recommendation, I have written the following code:
void _SSE_hammingDistances(const ByteP str, const ByteP strings, int *ds, const int n, const int m) {
int iters = m / 16;
__m128i *smm1, *smm2, diffs;
for(int j=0; j<n; j++) {
smm1 = (__m128i*) str;
smm2 = (__m128i*) &strings[j*(m+1)]; // m+1, as strings are '\0' terminated
diffs = _mm_setzero_si128();
for (int i = 0; i < iters; i++) {
diffs = _mm_add_epi8(diffs, _mm_cmpeq_epi8(*smm1, *smm2));
smm1 += 1;
smm2 += 1;
}
int s = m;
signed char *ptr = (signed char *) &diffs;
for(int p=0; p<16; p++) {
s += *ptr;
ptr++;
}
*ds = s;
ds++;
}
}
but I am not able to do the final addition of bytes in __m128i by using psadbw. Can anyone please help me with that?
Here's an improved version of your latest routine, which uses PSADBW (_mm_sad_epu8) to eliminate the scalar code:
void hammingDistances_SSE(const uint8_t * str, const uint8_t * strings, int * const ds, const int n, const int m)
{
const int iters = m / 16;
assert((m & 15) == 0); // m must be a multiple of 16
for (int j = 0; j < n; j++)
{
__m128i diffs = _mm_setzero_si128(); // per-byte match counters (so m must be <= 16 * 255)
for (int i = 0; i < iters; i++)
{
const __m128i smm1 = _mm_loadu_si128((const __m128i*)&str[16 * i]);
const __m128i smm2 = _mm_loadu_si128((const __m128i*)&strings[j * (m + 1) + 16 * i]); // m+1, as strings are '\0' terminated
diffs = _mm_sub_epi8(diffs, _mm_cmpeq_epi8(smm1, smm2)); // cmpeq gives -1 per matching byte, so this counts matches
}
diffs = _mm_sad_epu8(diffs, _mm_setzero_si128());
ds[j] = m - (_mm_extract_epi16(diffs, 0) + _mm_extract_epi16(diffs, 4));
}
}
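A small usage sketch for the routine above; the test strings and sizes are made up for illustration (m must be a multiple of 16, and each string is stored with a trailing '\0' as in your layout). The routine itself needs <emmintrin.h> and <assert.h>:

#include <assert.h>
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    enum { n = 3, m = 16 };
    uint8_t s[m + 1] = "abcdefghijklmnop";
    uint8_t strings[n][m + 1] = { "abcdefghijklmnop",   // distance 0
                                  "abcdefgh12345678",   // distance 8
                                  "XXXXXXXXXXXXXXXX" }; // distance 16
    int ds[n];
    hammingDistances_SSE(s, &strings[0][0], ds, n, m);  // defined above
    for (int j = 0; j < n; j++)
        printf("distance[%d] = %d\n", j, ds[j]);
    return 0;
}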