I am currently playing with ARM NEON and have written the following functions, one in plain C and one with NEON intrinsics, to compare their speeds. The functions compare two arrays. The parameter cb is the number of bytes divided by 8:
inline uint32_t is_not_zero(uint32x4_t v)
{
    uint32x2_t tmp = vorr_u32(vget_low_u32(v), vget_high_u32(v)); /* OR the two 64-bit halves together */
    return vget_lane_u32(vpmax_u32(tmp, tmp), 0);                 /* non-zero iff any lane of v is non-zero */
}

uint32_t sum_neon(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint32_t *s1_cmp = (const uint32_t *)s1;
    const uint32_t *s2_cmp = (const uint32_t *)s2;
    cb *= 2;
    while (cb--)
    {
        uint32x4x2_t cmp1 = vld2q_u32(s1_cmp);
        uint32x4x2_t cmp2 = vld2q_u32(s2_cmp);
        uint32x4_t res1 = vceqq_u32(cmp1.val[0], cmp2.val[0]);
        uint32x4_t res2 = vceqq_u32(cmp1.val[1], cmp2.val[1]);
        if (!is_not_zero(res1)) return 1;
        if (!is_not_zero(res2)) return 1;
        s1_cmp += 8;
        s2_cmp += 8;
    }
    return 0;
}
uint32_t sum_c(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    const uint64_t *p1 = (const uint64_t *)s1;
    const uint64_t *p2 = (const uint64_t *)s2;
    uint32_t n = 0;
    while (cb--) {
        if ((p1[n  ] != p2[n  ]) ||
            (p1[n+1] != p2[n+1]) ||
            (p1[n+2] != p2[n+2]) ||
            (p1[n+3] != p2[n+3])) return 1;
        ++n;
    }
    return 0;
}
I don't understand why the C implementation is WAY faster than the NEON variant. The code is compiled on a Raspberry Pi with
-O3 -mcpu=cortex-a7 -mfpu=neon-vfpv4 -mfloat-abi=hard as CFLAGS.
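For reference, here is what a variant with plain (non-deinterleaving) vld1q loads might look like. This is only a sketch I have not benchmarked, the names cmp_neon_vld1 and all_ones are made up, and unlike sum_neon above it returns 1 as soon as any byte differs (matching sum_c):

#include <arm_neon.h>

/* all_ones: true iff every byte of v is 0xFF */
static inline uint32_t all_ones(uint8x16_t v)
{
    uint32x4_t v32 = vreinterpretq_u32_u8(v);
    uint32x2_t tmp = vand_u32(vget_low_u32(v32), vget_high_u32(v32)); /* AND the two halves */
    tmp = vpmin_u32(tmp, tmp);                                        /* min of the remaining two lanes */
    return vget_lane_u32(tmp, 0) == 0xFFFFFFFFu;
}

uint32_t cmp_neon_vld1(const uint8_t *s1, const uint8_t *s2, uint32_t cb)
{
    uint32_t blocks = cb / 2; /* cb counts 8-byte units, each iteration handles 16 bytes */
    while (blocks--)
    {
        uint8x16_t eq = vceqq_u8(vld1q_u8(s1), vld1q_u8(s2)); /* 0xFF where the bytes are equal */
        if (!all_ones(eq))
            return 1; /* mismatch found */
        s1 += 16;
        s2 += 16;
    }
    return 0;
}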
I'm trying to understand this function sodium_is_zero from the cryptography library libsodium (https://github.com/jedisct1/libsodium):
https://github.com/jedisct1/libsodium/blob/master/src/libsodium/sodium/utils.c#L256-L266
int
sodium_is_zero(const unsigned char *n, const size_t nlen)
{
    size_t                 i;
    volatile unsigned char d = 0U;

    for (i = 0U; i < nlen; i++) {
        d |= n[i];
    }
    /* branchless: (d - 1) >> 8 is all-ones iff d == 0, so this returns 1
       for an all-zero buffer and 0 otherwise */
    return 1 & ((d - 1) >> 8);
}
Could anyone explain why d is declared volatile, and what purpose volatile serves here?
memset() is very fast, as it can use the stosb opcode internally.
Is there a function for 16- and 32-bit values that benefits similarly from stosw and/or stosd?
wmemset() is not portable and does not help with 16-bit values.
There is no such function in standard C, but depending on what you're trying to do, there may be CPU-specific SIMD "intrinsic functions" that you can use to build one. Your mention of stosb makes me think you're using an x86, so review the documentation for the various *mmintrin.h headers and the functions they provide.
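For example, a minimal SSE2 sketch of a 16-bit fill might look like the following (memset16_sse2 is just a name I made up; _mm_set1_epi16 and _mm_storeu_si128 are standard SSE2 intrinsics from emmintrin.h):

#include <emmintrin.h> /* SSE2 */
#include <stdint.h>
#include <stddef.h>

void *memset16_sse2(void *m, uint16_t val, size_t count)
{
    uint16_t *buf = m;
    const __m128i v = _mm_set1_epi16((short)val); /* broadcast val to all 8 lanes */

    while (count >= 8) {                          /* store 8 x 16-bit values per iteration */
        _mm_storeu_si128((__m128i *)buf, v);
        buf += 8;
        count -= 8;
    }
    while (count--)                               /* scalar tail for the remainder */
        *buf++ = val;
    return m;
}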
Yes, such a function can be written in many variants. For example:
void *memset16(void *m, uint16_t val, size_t count)
{
    uint16_t *buf = m;

    while (count--) *buf++ = val;
    return m;
}

void *memset32(void *m, uint32_t val, size_t count)
{
    uint32_t *buf = m;

    while (count--) *buf++ = val;
    return m;
}

void *memsetXX(void *m, void *val, size_t size, size_t count)
{
    char *buf = m;

    while (count--)
    {
        memcpy(buf, val, size);
        buf += size;
    }
    return m;
}
Safer versions, which store byte by byte and therefore make no alignment assumptions about m:
void *memset16safe(void *m, uint16_t val, size_t count)
{
    char *buf = m;
    union
    {
        uint8_t  d8[2];
        uint16_t d16;
    } u16 = {.d16 = val};

    while (count--)
    {
        *buf++ = u16.d8[0];
        *buf++ = u16.d8[1];
    }
    return m;
}
void *memset32safe(void *m, uint32_t val, size_t count)
{
    char *buf = m;
    union
    {
        uint8_t  d8[4];
        uint32_t d32;
    } u32 = {.d32 = val};

    while (count--)
    {
        *buf++ = u32.d8[0];
        *buf++ = u32.d8[1];
        *buf++ = u32.d8[2];
        *buf++ = u32.d8[3];
    }
    return m;
}
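A hypothetical usage example of memset16 (the buffer size and RGB565 color value here are made up for illustration):

uint16_t framebuffer[320 * 240];

/* fill the whole 16-bit framebuffer with red (0xF800 in RGB565) */
memset16(framebuffer, 0xF800, 320 * 240);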
I am trying to measure the time elapsed during a function, but I am not sure if I am doing it correctly. When I stop the SysTick, the value returned is 24, which should not be possible.
#define SYSTICKS 0xFFFFFFu

static volatile uint32_t Count = 0;

void SysTick_Handler(void)
{
    Count++;
}

int doSomething()
{
    int a = 0;
    int b = 8;
    return a + b;
}
int main(int argc, char *argv[])
{
    SysTick_Config(SYSTICKS);

    volatile uint64_t a = (uint64_t)SYSTICKS - SysTick->VAL;
    a += ((uint64_t)SYSTICKS * Count);

    /* some code here */
    doSomething();

    volatile uint64_t b = (uint64_t)SYSTICKS - SysTick->VAL;
    b += ((uint64_t)SYSTICKS * Count);

    volatile uint64_t timeElapsed = a - b;

    printf("a = %llu\r\n", a);
    printf("b = %llu\r\n", b);
    printf("timeElapsed = %llu\n", timeElapsed);
}
And here is the output:
a = 16777215
b = 24
timeElapsed = 16777191
I have n 8-bit character strings, all of the same length (say m), and another string s of the same length. I need to compute the Hamming distance from s to each of the other strings. In plain C, something like:
unsigned char strings[n][m];
unsigned char s[m];
int distances[n];

for (i = 0; i < n; i++) {
    distances[i] = 0;
    for (j = 0; j < m; j++) {
        if (strings[i][j] != s[j])
            distances[i]++;
    }
}
I would like to use SIMD instructions with gcc to perform such computations more efficiently. I have read that PCMPISTRI in SSE 4.2 can be useful, and my target computer supports that instruction set, so I would prefer a solution using SSE 4.2.
EDIT:
I wrote the following function to compute the Hamming distance between two strings:
static inline int popcnt128(__m128i n) {
    const __m128i n_hi = _mm_unpackhi_epi64(n, n);
    return _mm_popcnt_u64(_mm_cvtsi128_si64(n)) + _mm_popcnt_u64(_mm_cvtsi128_si64(n_hi));
}
int HammingDist(const unsigned char *p1, const unsigned char *p2, const int len) {
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
    __m128i smm1 = _mm_loadu_si128((__m128i*) p1);
    __m128i smm2 = _mm_loadu_si128((__m128i*) p2);
    __m128i ResultMask;
    int iters = len / 16;
    int diffs = 0;
    int i;

    for (i = 0; i < iters; i++) {
        /* NEGATIVE_POLARITY + BIT_MASK: one set bit per unequal byte */
        ResultMask = _mm_cmpestrm(smm1, 16, smm2, 16, MODE);
        diffs += popcnt128(ResultMask);
        p1 = p1 + 16;
        p2 = p2 + 16;
        smm1 = _mm_loadu_si128((__m128i*) p1);
        smm2 = _mm_loadu_si128((__m128i*) p2);
    }

    int mod = len % 16;
    if (mod > 0) {
        ResultMask = _mm_cmpestrm(smm1, mod, smm2, mod, MODE);
        diffs += popcnt128(ResultMask);
    }
    return diffs;
}
So I can solve my problem by means of:
for (i = 0; i < n; i++) {
    distances[i] = HammingDist(s, strings[i], m);
}
Is this the best I can do or can I use the fact that one of the strings compared is always the same? In addition, should I do some alignment on my arrays to improve performance?
ANOTHER ATTEMPT
Following Harold's recommendation, I have written the following code:
void _SSE_hammingDistances(const ByteP str, const ByteP strings, int *ds, const int n, const int m) {
    int iters = m / 16;
    __m128i *smm1, *smm2, diffs;

    for (int j = 0; j < n; j++) {
        smm1 = (__m128i*) str;
        smm2 = (__m128i*) &strings[j * (m + 1)]; // m+1, as strings are '\0' terminated
        diffs = _mm_setzero_si128();
        for (int i = 0; i < iters; i++) {
            // each equal byte adds -1 (0xFF) to its lane of diffs
            diffs = _mm_add_epi8(diffs, _mm_cmpeq_epi8(*smm1, *smm2));
            smm1 += 1;
            smm2 += 1;
        }
        // m plus the signed sum of the 16 lanes = number of differing bytes
        int s = m;
        signed char *ptr = (signed char *) &diffs;
        for (int p = 0; p < 16; p++) {
            s += *ptr;
            ptr++;
        }
        *ds = s;
        ds++;
    }
}
but I am not able to do the final addition of the bytes in the __m128i using psadbw. Can anyone please help me with that?
Here's an improved version of your latest routine, which uses PSADBW (_mm_sad_epu8) to eliminate the scalar code:
void hammingDistances_SSE(const uint8_t * str, const uint8_t * strings, int * const ds, const int n, const int m)
{
    const int iters = m / 16;

    assert((m & 15) == 0); // m must be a multiple of 16

    for (int j = 0; j < n; j++)
    {
        const uint8_t *p1 = str;
        const uint8_t *p2 = &strings[j * (m + 1)]; // m+1, as strings are '\0' terminated
        __m128i diffs = _mm_setzero_si128();
        for (int i = 0; i < iters; i++)
        {
            const __m128i smm1 = _mm_loadu_si128((const __m128i *)p1);
            const __m128i smm2 = _mm_loadu_si128((const __m128i *)p2);
            // cmpeq yields -1 per matching byte, so subtracting counts matches per lane
            diffs = _mm_sub_epi8(diffs, _mm_cmpeq_epi8(smm1, smm2));
            p1 += 16;
            p2 += 16;
        }
        // PSADBW against zero sums the 16 byte counters into two 16-bit halves
        diffs = _mm_sad_epu8(diffs, _mm_setzero_si128());
        ds[j] = m - (_mm_extract_epi16(diffs, 0) + _mm_extract_epi16(diffs, 4));
    }
}
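PSADBW (sum of absolute differences against zero) is the idiomatic way to horizontally sum 16 unsigned bytes; it leaves two 16-bit partial sums in lanes 0 and 4, which is why both are extracted and added. Your per-string loop then collapses to a single call, e.g. (assuming the strings are stored as n consecutive '\0'-terminated rows of m+1 bytes, as in your latest routine):

unsigned char strings[n][m + 1]; // each row '\0'-terminated, as assumed above
unsigned char s[m];
int distances[n];

hammingDistances_SSE(s, &strings[0][0], distances, n, m);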
I need to compare two buffers chunk-wise for equality. I don't need information about the relation of the two buffers, just whether each pair of chunks is equal or not. My Intel machine supports up to SSE4.2.
The naive approach is:
const size_t CHUNK_SIZE = 16; // 128 bits, for SSE2 integer registers
const int ARRAY_SIZE = 200000000;

char* array_1 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
char* array_2 = (char*)_aligned_malloc(ARRAY_SIZE, 16);

for (size_t i = 0; i < ARRAY_SIZE; )
{
    volatile bool result = memcmp(array_1 + i, array_2 + i, CHUNK_SIZE);
    i += CHUNK_SIZE;
}
Compare that to my first-ever attempt at using SSE:
union U
{
    __m128i m;
    volatile int i[4];
} res;

for (size_t i = 0; i < ARRAY_SIZE; )
{
    __m128i* pa1 = (__m128i*)(array_1 + i);
    __m128i* pa2 = (__m128i*)(array_2 + i);
    res.m = _mm_cmpeq_epi32(*pa1, *pa2);
    volatile bool result = ((res.i[0] == 0) || (res.i[1] == 0) || (res.i[2] == 0) || (res.i[3] == 0));
    i += CHUNK_SIZE;
}
The gain in speed is about 33%. Could I do any better?
You really shouldn't be using scalar code and unions to test all the individual vector elements - do something like this instead:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
    const __m128i a1 = _mm_load_si128((const __m128i *)(array_1 + i));
    const __m128i a2 = _mm_load_si128((const __m128i *)(array_2 + i));
    const __m128i vcmp = _mm_cmpeq_epi32(a1, a2);
    const int vmask = _mm_movemask_epi8(vcmp);
    const bool result = (vmask == 0xffff); // true iff all 16 bytes matched
    // you probably want to break here if you get a mismatch ???
}
Since you can use SSE 4.1, there is another alternative that might be faster:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
    __m128i* pa1 = (__m128i*)(array_1 + i);
    __m128i* pa2 = (__m128i*)(array_2 + i);
    __m128i temp = _mm_xor_si128(*pa1, *pa2);        // all zero iff the chunks are identical
    bool result = (bool)_mm_testz_si128(temp, temp); // PTEST: 1 iff temp is all zero
}
_mm_testz_si128(a, b) returns 0 if a & b != 0, and 1 if a & b == 0. The advantage is that you can use the same approach with the newer AVX instructions as well, where the chunk size is 32 bytes.
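For example, here is a 32-byte sketch. Note it assumes AVX2, since the 256-bit integer XOR (_mm256_xor_si256) is an AVX2 instruction; _mm256_testz_si256 itself only needs AVX:

for (size_t i = 0; i < ARRAY_SIZE; i += 32)
{
    const __m256i a1 = _mm256_loadu_si256((const __m256i *)(array_1 + i));
    const __m256i a2 = _mm256_loadu_si256((const __m256i *)(array_2 + i));
    const __m256i t  = _mm256_xor_si256(a1, a2);        // all zero iff the chunks are identical
    const bool result = (bool)_mm256_testz_si256(t, t); // VPTEST: 1 iff t is all zero
}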