Dot product using SSE - C

#define Size 50000
// Demo from the question: multiplies the first 16 bytes of arry1 and arry2
// lane by lane as floats, reduces the products with _mm_hadd_ps, and prints
// the bytes of one intermediate vector.
// NOTE(review): no #include lines are shown; malloc/printf and the SSE
// intrinsics need their headers. `.m128i_i8` is presumably the MSVC-specific
// member of __m128i - confirm the target compiler. Standard C also expects
// main to return int.
void main()
{
unsigned char *arry1 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
unsigned char *arry2 = (unsigned char*)malloc(sizeof(unsigned char)* Size);
// result is allocated but never used below; none of the three buffers is freed.
unsigned int *result = (unsigned int*)malloc(sizeof(unsigned int)* Size);
// Only elements 0..15 are initialised (both arrays hold 0,1,...,15).
for (int i = 0; i < 16; i++)
{
arry1[i] = i;
arry2[i] = i;
}
__m128i Z = _mm_setzero_si128();
__m128i vsum = _mm_setzero_si128();
//__m128i dummy = _mm_setzero_si128();
// The loop body runs exactly once (j = 0), i.e. one 16-byte chunk is processed.
for (int j = 0; j < 16; j += 16)
{
//printf("%d\n\n", j);
__m128i test1 = _mm_setzero_si128();
test1 = _mm_loadu_si128((__m128i*)&arry1[j]);
__m128i test2 = _mm_setzero_si128();
test2 = _mm_loadu_si128((__m128i*)&arry2[j]);
// Widen arry1's bytes: 8 -> 16 -> 32 bits by interleaving with zero.
__m128i s16L = _mm_unpacklo_epi8(test1, Z);
__m128i s16H = _mm_unpackhi_epi8(test1, Z);
__m128i s32LL = _mm_unpacklo_epi16(s16L, Z);
__m128i s32LH = _mm_unpackhi_epi16(s16L, Z);
__m128i s32HL = _mm_unpacklo_epi16(s16H, Z);
__m128i s32HH = _mm_unpackhi_epi16(s16H, Z);
// Same widening for arry2.
__m128i t16L = _mm_unpacklo_epi8(test2, Z);
__m128i t16H = _mm_unpackhi_epi8(test2, Z);
__m128i t32LL = _mm_unpacklo_epi16(t16L, Z);
__m128i t32LH = _mm_unpackhi_epi16(t16L, Z);
__m128i t32HL = _mm_unpacklo_epi16(t16H, Z);
__m128i t32HH = _mm_unpackhi_epi16(t16H, Z);
// Convert the 32-bit integer lanes to float.
__m128 s1 = _mm_cvtepi32_ps(s32LL);
__m128 s2 = _mm_cvtepi32_ps(s32LH);
__m128 s3 = _mm_cvtepi32_ps(s32HL);
__m128 s4 = _mm_cvtepi32_ps(s32HH);
__m128 t1 = _mm_cvtepi32_ps(t32LL);
__m128 t2 = _mm_cvtepi32_ps(t32LH);
__m128 t3 = _mm_cvtepi32_ps(t32HL);
__m128 t4 = _mm_cvtepi32_ps(t32HH);
// Element-wise products, four lanes per vector.
s1 = _mm_mul_ps(s1, t1);
s2 = _mm_mul_ps(s2, t2);
s3 = _mm_mul_ps(s3, t3);
s4 = _mm_mul_ps(s4, t4);
s1 = _mm_hadd_ps(s1, s2);//41,13
s3 = _mm_hadd_ps(s3, s4); //313,221
// vsum holds four 32-bit integers, but the loop below prints its 16
// individual bytes. A lane above 255 therefore shows up split across two
// bytes (421 = 1*256 + 165, 313 = 1*256 + 57); the lane values are intact.
vsum = _mm_cvtps_epi32(s3);
for (int k = 0; k < 16; k++)
{
printf("%u\n", (unsigned char)vsum.m128i_i8[k]);
}
// Final horizontal reduction; the resulting s1 is never read afterwards.
s1 = _mm_hadd_ps(s1, s3); //734, 14
s1 = _mm_hadd_ps(s1, s1); //1100,140
s1 = _mm_hadd_ps(s1, s1); //1240
}
}
I am implementing a dot product using SSE. I'm using the _mm_mul_ps and _mm_hadd_ps intrinsics, not _mm_dp_ps.
If a value after the _mm_hadd_ps call is over 255, a wrong value is displayed.
For example, the correct value of s3 is {0,0,0,421,0,0,0,313,0,0,0,221,0,0,0,145}.
But {0,0,1,165,0,0,1,57,0,0,0,221,0,0,0,145} is printed. Is this because I declared arry1 and arry2 as unsigned char? I know 255 is the maximum value for 8 bits.

I can see some problems here:
1) If you want to calculate the dot product of 50000 uint8_t values, it is OK. But a dot product of 70000 values can overflow the uint32_t type (70000 * 255 * 255 is larger than 2^32). Therefore using uint64_t is the better solution.
2) To calculate the dot product of integer vectors it is not necessary to use floating-point numbers. It is more efficient to use only integers for the calculation.
Here is an example of an SSE2 function which calculates the dot product of two uint8_t vectors:
#include <algorithm>
#include <emmintrin.h>
// All-zero vector; interleaved with data to zero-extend lanes, and used as the
// initial value of the accumulators.
const __m128i Z = _mm_setzero_si128();
// Number of bytes consumed per SIMD step (the size of one __m128i).
const size_t A = sizeof(__m128i);
// Number of input bytes DotProduct accumulates in 32-bit lanes before the
// partial sums are folded into the 64-bit accumulator.
const size_t B = 0x10000;
// Multiplies the 16 unsigned bytes at a with the 16 unsigned bytes at b and
// returns the products added up into four 32-bit lanes. Both pointers may be
// unaligned; each must reference at least 16 readable bytes.
inline __m128i DotProduct32(const uint8_t * a, const uint8_t * b)
{
    const __m128i bytesA = _mm_loadu_si128((__m128i*)a);
    const __m128i bytesB = _mm_loadu_si128((__m128i*)b);

    // Zero-extend each half of the inputs from 8-bit to 16-bit lanes.
    const __m128i lowA = _mm_unpacklo_epi8(bytesA, Z);
    const __m128i lowB = _mm_unpacklo_epi8(bytesB, Z);
    const __m128i highA = _mm_unpackhi_epi8(bytesA, Z);
    const __m128i highB = _mm_unpackhi_epi8(bytesB, Z);

    // madd multiplies 16-bit lanes and adds adjacent pairs into 32-bit lanes.
    const __m128i lowSums = _mm_madd_epi16(lowA, lowB);
    const __m128i highSums = _mm_madd_epi16(highA, highB);

    return _mm_add_epi32(lowSums, highSums);
}
// Reduces four unsigned 32-bit lanes to two 64-bit lanes: each 32-bit lane is
// zero-extended to 64 bits, then the lower and upper pairs are added.
inline __m128i HorizontalSum32(__m128i a)
{
    const __m128i lowerPair = _mm_unpacklo_epi32(a, Z);
    const __m128i upperPair = _mm_unpackhi_epi32(a, Z);
    return _mm_add_epi64(lowerPair, upperPair);
}
// Returns the sum of the two 64-bit lanes of a.
inline uint64_t ExtractSum64(__m128i a)
{
    uint64_t lanes[2];
    // Spill the vector to memory so both lanes can be read as scalars.
    _mm_storeu_si128((__m128i*)lanes, a);
    const uint64_t total = lanes[0] + lanes[1];
    return total;
}
// Computes the dot product of the unsigned byte vectors a and b.
//
//   a, b  - input vectors, each with at least `size` readable bytes
//   size  - number of elements
//   sum   - receives the 64-bit result
//
// The input is processed in blocks of at most B bytes. Inside a block the
// products are accumulated in 32-bit lanes: one DotProduct32 step adds at most
// 4 * 255 * 255 to a lane, and a block has B / A = 4096 steps, so a lane stays
// below 2^32. After each block the lanes are widened and added to the 64-bit
// accumulator. Elements past the last full 16-byte chunk are handled by the
// scalar loop at the end.
void DotProduct(const uint8_t * a, const uint8_t * b, size_t size, uint64_t * sum)
{
    size_t blockNumber = (size + B - 1)/B;
    size_t alignedSize = size/A*A;
    size_t i = 0;
    __m128i sum64 = Z;
    // Fix: the loop counter is `block`. The original incremented `i` here,
    // so `block` never advanced and the loop did not terminate for size > 0.
    for (size_t block = 0; block < blockNumber; ++block)
    {
        __m128i sum32 = Z;
        // i carries over from block to block; blockEnd caps the block at B bytes
        // and at the last full 16-byte chunk.
        for (size_t blockEnd = std::min(alignedSize, i + B); i < blockEnd; i += A)
            sum32 = _mm_add_epi32(sum32, DotProduct32(a + i, b + i));
        sum64 = _mm_add_epi64(sum64, HorizontalSum32(sum32));
    }
    *sum = ExtractSum64(sum64);
    // Scalar tail: the remaining size % 16 elements.
    for (; i < size; ++i)
        *sum += a[i] * b[i];
}

Related

Hash value from C function

I have a problem and hope you can help me.
I have a function written in C that returns a hash value. My
headache is when I execute the program from another tool it takes a lot of time to run, probably because inside my function I run a command that hashes my value in SHA256, so I would like to know if there is another way to do it, maybe a function or something like that.
Here is what I have:
/*
 * Intended to return the SHA-256 hex digest of Arg1. It builds the shell
 * pipeline `echo -n <Arg1> | sha256sum | cut -c1-64`, runs it with popen()
 * and reads the pipeline's output into a local buffer.
 *
 * NOTE(review): the returned pointer refers to the local array `result`,
 * which stops existing when the function returns, so the caller reads a
 * dangling pointer.
 */
const char *EncryptSHA256 (char *Arg1) {
char command[128];
char result[512];
//I want to replace from here
/* Arg1 is pasted into the command line unquoted; snprintf cuts the command
   off at 127 characters. */
snprintf(command, sizeof command, "echo -n %s | sha256sum | cut -c1-64",Arg1);
FILE *fpipe;
if (0 == (fpipe = (FILE*)popen(command, "r"))) {
perror("popen() failed.");
exit(1);
}
/* The number of bytes read is not checked and this code does not
   NUL-terminate `result`. */
fread(result, 1, 512, fpipe);
pclose(fpipe);
const char *sha256 = &result[0];
//to here
return sha256;
}
Your code has undefined behavior because you return a pointer to result, a local array with automatic storage. Reading from this array by the caller has undefined behavior.
You should at least make result static so its contents remain readable after EncryptSHA256 returns to its caller.
Regarding the inefficiency of the method, here is a public domain implementation of SHA256 that you can use directly inside your program:
/* public domain sha256 implementation based on fips180-3 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Public API */
/* Hashing context: initialise with sha256_init, feed data with sha256_update,
   finish with sha256_sum. */
struct sha256 {
uint64_t len; /* processed message length */
uint32_t h[8]; /* hash state */
uint8_t buf[64]; /* message block buffer */
};
/* reset state */
void sha256_init(struct sha256 *s);
/* process message */
/* may be called repeatedly; m points to len bytes of input */
void sha256_update(struct sha256 *s, const void *m, size_t len);
/* get message digest */
/* state is ruined after sum, keep a copy if multiple sum is needed */
/* part of the message might be left in s, zero it if secrecy is needed */
/* md receives the 32 digest bytes */
void sha256_sum(struct sha256 *s, uint8_t md[32]);
/* Implementation */
/*
 * Rotates the 32-bit value n right by k bits.
 *
 * The count is reduced modulo 32 and the left-shift amount is masked, so a
 * count of 0 (or any multiple of 32) returns n unchanged instead of executing
 * an undefined shift by 32. Counts 1..31 give the same result as before.
 */
static uint32_t ror(uint32_t n, int k) {
    const unsigned count = (unsigned)k & 31u;
    return (n >> count) | (n << ((32u - count) & 31u));
}
/*
 * Bitwise helper functions used by processblock.
 * Ch and Maj combine three words; S0/S1 and R0/R1 mix rotations and shifts of
 * one word. Every argument is parenthesised so that an expression argument
 * (e.g. `a | b`) is evaluated as a unit; arguments are evaluated more than
 * once, so do not pass expressions with side effects.
 */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
#define S0(x) (ror((x),2) ^ ror((x),13) ^ ror((x),22))
#define S1(x) (ror((x),6) ^ ror((x),11) ^ ror((x),25))
#define R0(x) (ror((x),7) ^ ror((x),18) ^ ((x)>>3))
#define R1(x) (ror((x),17) ^ ror((x),19) ^ ((x)>>10))
/* Per-round constants: K[i] is added to the round-i temporary in the ROUND
   step of processblock. Values are those of the SHA-256 specification this
   implementation is based on (FIPS 180). */
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/*
 * Compresses one 64-byte block into the running hash state s->h.
 * buf must point to at least 64 readable bytes; s->len and s->buf are not
 * touched here.
 */
static void processblock(struct sha256 *s, const uint8_t *buf) {
uint32_t W[64], t1, t2, a, b, c, d, e, f, g, h;
int i;
/* W[0..15]: the block's bytes assembled into big-endian 32-bit words */
for (i = 0; i < 16; i++) {
W[i] = (uint32_t)buf[4 * i + 0] << 24;
W[i] |= (uint32_t)buf[4 * i + 1] << 16;
W[i] |= (uint32_t)buf[4 * i + 2] << 8;
W[i] |= buf[4 * i + 3];
}
/* W[16..63]: each word is derived from four earlier words */
for (; i < 64; i++)
W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
/* the working variables start from the current state */
a = s->h[0];
b = s->h[1];
c = s->h[2];
d = s->h[3];
e = s->h[4];
f = s->h[5];
g = s->h[6];
h = s->h[7];
/* One round. Only the variables passed as d and h are written; instead of
   shuffling all eight variables after every round, the calls below rotate
   the argument order by one position each time. */
#define ROUND(a,b,c,d,e,f,g,h,i) \
t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; \
t2 = S0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2;
/* 64 rounds, eight per loop iteration; after eight rounds the argument
   rotation is back at its starting order. */
for (i = 0; i < 64; ) {
ROUND(a, b, c, d, e, f, g, h, i); i++;
ROUND(h, a, b, c, d, e, f, g, i); i++;
ROUND(g, h, a, b, c, d, e, f, i); i++;
ROUND(f, g, h, a, b, c, d, e, i); i++;
ROUND(e, f, g, h, a, b, c, d, i); i++;
ROUND(d, e, f, g, h, a, b, c, i); i++;
ROUND(c, d, e, f, g, h, a, b, i); i++;
ROUND(b, c, d, e, f, g, h, a, i); i++;
}
#undef ROUND
/* add the working variables back into the state */
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
/*
 * Finishes the message: appends the 0x80 marker byte, zero-fills up to byte
 * 56 of the block buffer, stores the message length in bits as a big-endian
 * 64-bit value in bytes 56..63 and processes the final block. When the marker
 * leaves no room for the length, the current block is zero-filled and
 * processed first. s->len is converted from bytes to bits as a side effect.
 */
static void pad(struct sha256 *s) {
    unsigned used = s->len % 64;
    int pos, shift;

    s->buf[used] = 0x80;
    used++;
    if (used > 56) {
        /* no room for the 8 length bytes: flush this block first */
        memset(s->buf + used, 0, 64 - used);
        processblock(s, s->buf);
        used = 0;
    }
    memset(s->buf + used, 0, 56 - used);

    s->len *= 8;
    /* most significant length byte first */
    for (pos = 56, shift = 56; pos < 64; pos++, shift -= 8)
        s->buf[pos] = s->len >> shift;

    processblock(s, s->buf);
}
/* Resets s for a new message: zero length and the initial hash state. */
void sha256_init(struct sha256 *s) {
    static const uint32_t initial_state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    int word;

    s->len = 0;
    for (word = 0; word < 8; word++)
        s->h[word] = initial_state[word];
}
/*
 * Finalises the hash and writes the digest to md.
 *
 *   s  - context that has received the whole message; it is padded and
 *        consumed here and must be re-initialised before reuse
 *   md - receives 32 bytes: the eight state words, each stored big-endian
 *
 * The parameter is declared md[32] to match the prototype and the 32 bytes
 * actually written (it was md[20], which understated the required size).
 */
void sha256_sum(struct sha256 *s, uint8_t md[32]) {
    int i;
    pad(s);
    for (i = 0; i < 8; i++) {
        md[4 * i + 0] = s->h[i] >> 24;
        md[4 * i + 1] = s->h[i] >> 16;
        md[4 * i + 2] = s->h[i] >> 8;
        md[4 * i + 3] = s->h[i];
    }
}
/*
 * Feeds len bytes starting at m into the hash.
 *
 * Input is consumed in 64-byte blocks. Bytes that do not complete a block are
 * kept in s->buf and combined with the next call's data. May be called any
 * number of times between sha256_init and sha256_sum.
 *
 * len is a size_t, matching the prototype; the definition used unsigned long,
 * which is a conflicting declaration on platforms where the two types differ.
 */
void sha256_update(struct sha256 *s, const void *m, size_t len) {
    const uint8_t *p = m;
    unsigned r = s->len % 64;   /* bytes already waiting in s->buf */

    s->len += len;
    if (r) {
        if (len < 64 - r) {
            /* still not a full block: just append and wait for more */
            memcpy(s->buf + r, p, len);
            return;
        }
        /* complete the buffered block and process it */
        memcpy(s->buf + r, p, 64 - r);
        len -= 64 - r;
        p += 64 - r;
        processblock(s, s->buf);
    }
    /* whole blocks are hashed straight from the caller's memory */
    for (; len >= 64; len -= 64, p += 64)
        processblock(s, p);
    /* keep the remainder (0..63 bytes) for the next call */
    memcpy(s->buf, p, len);
}
You would change your function to this:
const char *EncryptSHA256(char *Arg1) {
struct sha256 s;
unsigned char md[32];
static char result[65];
sha256_init(&s);
sha256_update(&s, Arg1, strlen(Arg1));
sha256_sum(&s, md);
for (int i = 0; i < 32; i++) {
sprintf(result + i * 2, "%02x", md[i]);
}
return result;
}
You could also change the API to pass an array of 32 unsigned characters to get the binary form if it is more convenient.

A int64_t value overflows in a structure in C

I want to create a polynomial with only one term: 6/5 and display it.
It should print this: 6/5X^0
The structure of polynomial is :
/* poly_t is a pointer to a polynomial record.
   NOTE(review): identifiers containing a double underscore are reserved for
   the implementation in C; consider renaming the struct tag. */
typedef struct __poly_struct_t *poly_t;
struct __poly_struct_t{
unsigned int deg; /* degree: polyPrint reads coeffs[0] .. coeffs[deg] */
ratio_t *coeffs; /* coeffs[i] is printed as the coefficient of X^i */
};
Here coeffs is an array of rational numbers of type ratio_t, whose structure is:
/* A rational number num/den. createRatio stores it reduced by the gcd of
   numerator and denominator. */
typedef struct __ratio_struct_t{
int64_t num; /* numerator */
int64_t den; /* denominator */
}ratio_t;
I used two functions to construct this polynomial. polyFromRatioArray works: it prints 6/5X^0
/*
 * Creates a polynomial of the given degree whose coefficients are the array c.
 *
 *   c      - coefficient array; polyPrint reads c[0] .. c[degree]. The array is
 *            NOT copied: only the pointer is stored, so it must stay valid for
 *            as long as the polynomial is used.
 *   degree - degree of the polynomial
 *
 * Returns the new polynomial, or NULL if the allocation fails. The caller
 * releases the record with free(); the coefficient array remains the caller's.
 */
poly_t polyFromRatioArray(ratio_t *c, unsigned int degree){
    poly_t p = malloc(sizeof *p);
    if(p == NULL){
        return NULL;
    }
    p->deg = degree;
    p->coeffs = c;
    return p;
}
The other one makes the denominator look as if it overflowed: polyFromRatio prints 6/140218959144480X^0
/*
 * Builds a degree-0 polynomial from the single coefficient c.
 *
 * NOTE(review): c is this function's own by-value copy of the argument, and
 * polyFromRatioArray stores the pointer it is given without copying. The
 * polynomial therefore keeps a pointer to a variable that stops existing when
 * this function returns; reading the coefficient later is undefined behaviour,
 * which is consistent with the garbage denominator reported in the question.
 * The coefficient would have to be copied into storage that outlives the call.
 */
poly_t polyFromRatio(ratio_t c){
return polyFromRatioArray(&c, 0);
}
Main function:
/* Builds the polynomial 6/5 through both constructors and prints each one. */
int main(){
ratio_t ra = createRatio((int64_t)6,(int64_t)5);
/* p1 points at ra, which lives until main returns. */
poly_t p1 = polyFromRatioArray(&ra, 0); // one that works
polyPrint(p1);
/* ra is passed by value: p2 ends up pointing at polyFromRatio's parameter,
   not at ra. */
poly_t p2 = polyFromRatio(ra); // this doesn't
polyPrint(p2);
free(p1);
free(p2);
return 0;
}
Other functions involved:
/*
 * Returns the fraction a/b with numerator and denominator divided by their
 * greatest common divisor. Prints an error and terminates the program with
 * exit status 1 when b is 0.
 */
ratio_t createRatio(int64_t a, int64_t b){
    ratio_t reduced;
    int64_t divisor;

    if(b == 0){
        printf("Error : a divise by 0 \n");
        exit(1);
    }

    divisor = gcd(a, b); /* gcd works on the absolute values */
    reduced.num = a / divisor;
    reduced.den = b / divisor;
    return reduced;
}
/*
 * Returns the greatest common divisor of |a| and |b|, as computed by
 * ext_eucl_div; the two coefficients that helper also produces are discarded.
 */
int64_t gcd(int64_t a, int64_t b){
    int64_t coef_a;
    int64_t coef_b;
    int64_t divisor;

    ext_eucl_div(&coef_a, &coef_b, &divisor, llabs(a), llabs(b));
    return divisor;
}
/*
 * Extended Euclidean algorithm.
 *
 *   u, v - receive coefficients such that (*u) * a + (*v) * b == *g
 *   g    - receives the greatest common divisor of a and b
 *   a, b - inputs; gcd() passes absolute values, and the loop's stop test
 *          (remainder < 1) is written for non-negative operands
 *
 * When b is 0 the result is g = a with coefficients (1, 0). The original code
 * divided by b unconditionally, which is a division by zero in that case.
 */
void ext_eucl_div(int64_t *u, int64_t *v, int64_t *g, int64_t a, int64_t b){
    /* Invariants: u1*a + u2*b == u3 and v1*a + v2*b == v3. */
    int64_t u1 = 1, u2 = 0, u3 = a;
    int64_t v1 = 0, v2 = 1, v3 = b;

    if(b == 0){
        *u = 1;
        *v = 0;
        *g = a;
        return;
    }

    for(;;){
        const int64_t q  = u3 / v3;
        const int64_t t1 = u1 - q*v1;
        const int64_t t2 = u2 - q*v2;
        const int64_t t3 = u3 % v3;

        if(t3 < 1){
            break; /* v3 divides u3: v3 is the gcd */
        }
        /* shift: (u, v) <- (v, t) */
        u1 = v1; u2 = v2; u3 = v3;
        v1 = t1; v2 = t2; v3 = t3;
    }

    *u = v1;
    *v = v2;
    *g = v3;
}
/*
 * Prints the terms of p from X^0 up to X^deg. A term with a non-zero numerator
 * is printed as "num/denX^i", followed by " + " unless it is the last term,
 * which is followed by a newline. A term with a zero numerator prints "0" and
 * a newline instead.
 */
void polyPrint(poly_t p){
    for(unsigned int k = 0; k <= p->deg; k++){
        if(p->coeffs[k].num == 0){
            printf("0\n");
            continue;
        }
        printRatio(p->coeffs[k]);
        if(k == p->deg){
            printf("X^%u\n", k);
        }else{
            printf("X^%u + ", k);
        }
    }
}
/* Prints a as "num/den" on standard output, without a trailing newline. */
void printRatio(ratio_t a){
    printf("%" PRId64 "/%" PRId64, a.num, a.den);
}
This is very strange: polyFromRatioArray and polyFromRatio seem to be doing the same thing, but they do not give the same result.

How the following following SSE2 code read data

I have found the following SSE2 code written to multiply 2x2 matrices. Can anybody explain to me how this code executes? When I go through the code, I feel it just adds values into two positions of the C (2x2) matrix (C[0], C[3]). lda is the size of the large matrix, and A, B and C are 2x2 matrices.
/*
 * C += A * B for 2x2 blocks stored column by column inside larger matrices.
 *
 *   lda - distance in doubles between the starts of consecutive columns
 *   A   - left operand; each column A + i*lda must be 16-byte aligned because
 *         it is read with the aligned load _mm_load_pd
 *   B   - right operand; element (row i, column j) is at B[i + j*lda]
 *   C   - accumulator block, updated in place (unaligned access is fine)
 *
 * Each vector holds one column (two doubles). For every i the i-th column of
 * A is scaled by B(i,0) and added to column 0 of C, and scaled by B(i,1) and
 * added to column 1 of C.
 *
 * Fix: column 0 is updated with b1 = B(i,0). The original used b2 for both
 * columns, leaving b1 loaded but unused.
 */
static void simd_2x2(int lda, double* A, double* B, double* C)
{
    __m128d a, b1, b2, c1, c2;

    c1 = _mm_loadu_pd( C+0*lda ); /* column 0 of the C block */
    c2 = _mm_loadu_pd( C+1*lda ); /* column 1 of the C block */
    for( int i = 0; i < 2; ++i )
    {
        a  = _mm_load_pd( A+i*lda );      /* i-th column of A (aligned) */
        b1 = _mm_load1_pd( B+i+0*lda );   /* B(i,0) broadcast to both lanes */
        b2 = _mm_load1_pd( B+i+1*lda );   /* B(i,1) broadcast to both lanes */
        c1 = _mm_add_pd( c1, _mm_mul_pd( a, b1 ) ); /* rank-1 update */
        c2 = _mm_add_pd( c2, _mm_mul_pd( a, b2 ) );
    }
    _mm_storeu_pd( C+0*lda, c1 );
    _mm_storeu_pd( C+1*lda, c2 );
}
I'm guessing the source of your confusion is that the double precision intrinsics (_mm_load_pd et al) each process a vector of two double precision values. lda appears to be the stride. So for example:
c1 = _mm_loadu_pd( C+0*lda );
c2 = _mm_loadu_pd( C+1*lda );
loads a 2x2 block of doubles from C, C+1, C+lda, C+lda+1.
You could check the inputs of the function to make sure the matrices are initialized correctly. I use similar code and get the right output:
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <xmmintrin.h>
/*
 * Stand-alone check of the 2x2 SSE2 multiply: fills two 2x2 matrices with 2,
 * computes c = a * b with the rank-1 update loop and prints the four entries
 * of c (each 8.0 for this input).
 *
 * Changes from the first version: the b and c buffers are sized with
 * sizeof(double) rather than sizeof(double *); the i-th column of a is loaded
 * in iteration i (previously column 0 every time, which only went unnoticed
 * because all entries are equal); the load of a is unaligned because malloc
 * does not promise 16-byte alignment everywhere; allocations are checked and
 * freed; unused variables are gone.
 */
int main(void)
{
    const int lda = 2;            /* column stride, equal to the matrix order */
    const int count = lda * lda;  /* elements per matrix */
    double *a = malloc(sizeof *a * count);
    double *b = malloc(sizeof *b * count);
    double *c = malloc(sizeof *c * count);
    int i;

    if (a == NULL || b == NULL || c == NULL)
    {
        free(a);
        free(b);
        free(c);
        return EXIT_FAILURE;
    }

    for (i = 0; i < count; i++)
    {
        a[i] = 2;
        b[i] = 2;
        c[i] = 0;
    }
    printf("Initializing matrices...\n");

    __m128d veca, vecb1, vecb2, c1, c2;
    c1 = _mm_loadu_pd(c + 0 * lda);
    c2 = _mm_loadu_pd(c + 1 * lda);
    for (i = 0; i < 2; i++)
    {
        veca = _mm_loadu_pd(a + i * lda);       /* i-th column of a */
        vecb1 = _mm_load1_pd(b + i + 0 * lda);  /* load i-th row of b */
        vecb2 = _mm_load1_pd(b + i + 1 * lda);
        c1 = _mm_add_pd(c1, _mm_mul_pd(veca, vecb1)); /* rank-1 update */
        c2 = _mm_add_pd(c2, _mm_mul_pd(veca, vecb2));
    }
    _mm_storeu_pd(c + 0 * lda, c1);
    _mm_storeu_pd(c + 1 * lda, c2);

    for (i = 0; i < count; i++)
    {
        printf("c%d :(%f)\n", i, *(c + i));
    }

    free(a);
    free(b);
    free(c);
    return 0;
}

Computing Hamming distances to several strings with SSE

I have n (8 bit) character strings all of them of the same length (say m), and another string s of the same length. I need to compute Hamming distances from s to each of the others strings. In plain C, something like:
/* Reference version: distances[i] is the number of positions at which
   strings[i] differs from s. n strings, each m bytes long. */
unsigned char strings[n][m];
unsigned char s[m];
int distances[n];
for(int i=0; i<n; i++) {
    /* plain assignment: `int distances[i] = 0;` would be a declaration and
       does not compile */
    distances[i] = 0;
    for(int j=0; j<m; j++) {
        if(strings[i][j] != s[j])
            distances[i]++;
    }
}
I would like to use SIMD instructions with gcc to perform such computations more efficiently. I have read that PcmpIstrI in SSE 4.2 can be useful and my target computer supports that instruction set, so I would prefer a solution using SSE 4.2.
EDIT:
I wrote following function to compute Hamming distance between two strings:
/* Returns the number of set bits in the 128-bit value n, counting each 64-bit
   half separately and adding the two counts. */
static inline int popcnt128(__m128i n) {
    const long long low_half = _mm_cvtsi128_si64(n);
    /* move the upper 64 bits into the lower half so they can be extracted */
    const long long high_half = _mm_cvtsi128_si64(_mm_unpackhi_epi64(n, n));
    return _mm_popcnt_u64(low_half) + _mm_popcnt_u64(high_half);
}
/*
 * Returns the number of byte positions in which p1 and p2 differ.
 *
 *   p1, p2 - buffers of at least len bytes each (no alignment required)
 *   len    - number of bytes to compare
 *
 * Full 16-byte chunks are compared with _mm_cmpestrm: with the MODE flags the
 * result is a bit mask with one bit set per differing byte, which popcnt128
 * counts. The remaining len % 16 bytes are compared one by one.
 *
 * Fix: every load now happens inside the loop right before it is used, and
 * the tail is scalar. The original pre-loaded the next chunk after each
 * iteration (and once before the loop), reading up to 16 bytes beyond the end
 * of both buffers.
 */
int HammingDist(const unsigned char *p1, unsigned const char *p2, const int len) {
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
    const int iters = len / 16;
    const int mod = len % 16;
    int diffs = 0;

    for (int i = 0; i < iters; i++) {
        const __m128i smm1 = _mm_loadu_si128((const __m128i *) p1);
        const __m128i smm2 = _mm_loadu_si128((const __m128i *) p2);
        diffs += popcnt128(_mm_cmpestrm(smm1, 16, smm2, 16, MODE));
        p1 += 16;
        p2 += 16;
    }
    /* tail: fewer than 16 bytes left, so a vector load could overrun */
    for (int k = 0; k < mod; k++) {
        diffs += (p1[k] != p2[k]);
    }
    return diffs;
}
So I can solve my problem by means of:
/* One HammingDist call per candidate string; plain assignment, since
   `int distances[i] = ...` would be a declaration and does not compile. */
for(int i=0; i<n; i++) {
    distances[i] = HammingDist(s, strings[i], m);
}
Is this the best I can do or can I use the fact that one of the strings compared is always the same? In addition, should I do some alignment on my arrays to improve performance?
ANOTHER ATTEMPT
Following Harold's recommendation, I have written the following code:
/*
 * Writes to ds[j] the Hamming distance between str and the j-th string in
 * `strings`, for j = 0 .. n-1.
 *
 *   str     - reference string, m bytes
 *   strings - n strings of m bytes stored back to back, each followed by one
 *             terminator byte (stride m + 1)
 *   ds      - receives n distances
 *   n, m    - number of strings and length of each string
 *
 * NOTE(review): ByteP is assumed to be a pointer to a one-byte character
 * type - confirm against its typedef.
 *
 * _mm_cmpeq_epi8 yields -1 in every lane where the bytes match, so adding the
 * results counts matches as negative numbers; the distance is m plus that
 * (negative) total.
 *
 * Fixes over the first attempt:
 *  - data is read with unaligned loads; with a stride of m + 1 the strings
 *    cannot all be 16-byte aligned, so dereferencing __m128i pointers was
 *    invalid;
 *  - the 8-bit lane counters are folded into the total every 127 chunks so
 *    they cannot wrap for long strings;
 *  - the last m % 16 bytes are compared too (they were always counted as
 *    different).
 */
void _SSE_hammingDistances(const ByteP str, const ByteP strings, int *ds, const int n, const int m) {
    const int iters = m / 16;
    const unsigned char *ref = (const unsigned char *) str;

    for(int j=0; j<n; j++) {
        const unsigned char *cand = (const unsigned char *) &strings[j*(m+1)]; // m+1, as strings are '\0' terminated
        int s = m;
        int i = 0;

        while (i < iters) {
            const int stop = (iters - i > 127) ? i + 127 : iters;
            __m128i diffs = _mm_setzero_si128();
            signed char lanes[16];

            for (; i < stop; i++) {
                const __m128i x = _mm_loadu_si128((const __m128i *) (ref + 16 * i));
                const __m128i y = _mm_loadu_si128((const __m128i *) (cand + 16 * i));
                diffs = _mm_add_epi8(diffs, _mm_cmpeq_epi8(x, y));
            }
            /* each lane holds minus the number of matches seen in this batch */
            _mm_storeu_si128((__m128i *) lanes, diffs);
            for(int p=0; p<16; p++) {
                s += lanes[p];
            }
        }
        /* scalar tail */
        for (int k = iters * 16; k < m; k++) {
            if (ref[k] == cand[k]) {
                s--;
            }
        }
        ds[j] = s;
    }
}
but I am not able to do the final addition of bytes in __m128i by using psadbw. Can anyone please help me with that?
Here's an improved version of your latest routine, which uses PSADBW (_mm_sad_epu8) to eliminate the scalar code:
/*
 * Writes to ds[j] the Hamming distance between str and the j-th string in
 * `strings`, for j = 0 .. n-1.
 *
 *   str     - reference string, m bytes
 *   strings - n strings of m bytes stored back to back, each followed by one
 *             terminator byte (stride m + 1)
 *   ds      - receives n distances
 *   n, m    - number of strings and length of each string (any m >= 0)
 *
 * Matching bytes are counted: _mm_cmpeq_epi8 gives 0xFF (-1) per matching
 * lane, so subtracting it increments an 8-bit counter per lane. PSADBW
 * (_mm_sad_epu8) against zero then adds the 16 counters into two 16-bit sums,
 * which are extracted and added. The distance is m minus the match count.
 *
 * Fixes:
 *  - both operands are loaded inside the inner loop at offset 16 * i; before,
 *    they were loaded once, so the first 16 bytes were compared `iters` times
 *    and the rest of the strings was never looked at;
 *  - the 8-bit counters are flushed every 255 chunks, so strings longer than
 *    255 * 16 bytes no longer overflow them;
 *  - a scalar tail handles m % 16 remaining bytes, so m need not be a multiple
 *    of 16 (multiples of 16 behave as the assert-guarded version intended).
 */
void hammingDistances_SSE(const uint8_t * str, const uint8_t * strings, int * const ds, const int n, const int m)
{
    const int iters = m / 16;

    for (int j = 0; j < n; j++)
    {
        const uint8_t *cand = &strings[j*(m+1)]; // m+1, as strings are '\0' terminated
        int matches = 0;
        int i = 0;

        while (i < iters)
        {
            const int stop = (iters - i > 255) ? i + 255 : iters;
            __m128i counts = _mm_setzero_si128();

            for (; i < stop; i++)
            {
                const __m128i x = _mm_loadu_si128((const __m128i *)(str + 16 * i));
                const __m128i y = _mm_loadu_si128((const __m128i *)(cand + 16 * i));
                counts = _mm_sub_epi8(counts, _mm_cmpeq_epi8(x, y));
            }
            /* horizontal add of the 16 byte counters into words 0 and 4 */
            counts = _mm_sad_epu8(counts, _mm_setzero_si128());
            matches += _mm_extract_epi16(counts, 0) + _mm_extract_epi16(counts, 4);
        }
        /* scalar tail */
        for (int k = iters * 16; k < m; k++)
        {
            matches += (str[k] == cand[k]);
        }
        ds[j] = m - matches;
    }
}

compare buffers as fast as possible

I need to compare two buffers chunk-wise for equality. I don't need information about the relation of the two buffers, just if each two chunks are equal or not. My intel machine supports up to SSE4.2
The naive approach is:
// Baseline for the timing comparison: compare the two buffers 16 bytes at a
// time with memcmp.
const size_t CHUNK_SIZE = 16; //128bit for SSE2 integer registers
const int ARRAY_SIZE = 200000000;
// Both buffers are requested with 16-byte alignment; this snippet does not
// initialise their contents.
char* array_1 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
char* array_2 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
for (size_t i = 0; i < ARRAY_SIZE; )
{
// memcmp returns non-zero when the chunks differ, so result is true on a
// mismatch. volatile is presumably there to keep the compiler from dropping
// the otherwise unused result.
volatile bool result = memcmp(array_1+i, array_2+i, CHUNK_SIZE);
i += CHUNK_SIZE;
}
Compared to my first try using SSE ever:
// Overlay that lets the four 32-bit lanes of a vector comparison result be
// read back as ints.
union U
{
__m128i m;
volatile int i[4];
} res;
for (size_t i = 0; i < ARRAY_SIZE; )
{
// Dereferencing __m128i pointers performs 16-byte loads that require aligned
// addresses.
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
// Each 32-bit lane becomes all ones where the inputs match and zero where
// they differ.
res.m = _mm_cmpeq_epi32(*pa1, *pa2);
// true when at least one lane differs, i.e. the chunks are not equal
volatile bool result = ( (res.i[0]==0) || (res.i[1]==0) || (res.i[2]==0) || (res.i[3]==0) );
i += CHUNK_SIZE;
}
The gain in speed is about 33%. Could I do any better?
You really shouldn't be using scalar code and unions to test all the individual vector elements - do something like this instead:
// Compares the buffers one 16-byte chunk at a time without leaving the vector
// unit. array_1 and array_2 must be 16-byte aligned (aligned loads).
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
    // The buffers are char*, so the address has to be cast to the vector
    // pointer type that _mm_load_si128 expects.
    const __m128i a1 = _mm_load_si128((const __m128i *)(array_1 + i));
    const __m128i a2 = _mm_load_si128((const __m128i *)(array_2 + i));
    // all-ones in every 32-bit lane that matches
    const __m128i vcmp = _mm_cmpeq_epi32(a1, a2);
    // one bit per byte of vcmp: 0xffff means all 16 bytes matched
    const int vmask = _mm_movemask_epi8(vcmp);
    const bool result = (vmask == 0xffff);
    // you probably want to break here if you get a mismatch ???
}
Since you can use SSE 4.1, there is another alternative that might be faster:
// SSE4.1 variant: XOR the two chunks and test whether the result is all zero.
// The stray ';' after the increment expression has been removed from the for
// header; with it the loop did not compile.
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
    __m128i* pa1 = (__m128i*)(array_1+i);
    __m128i* pa2 = (__m128i*)(array_2+i);
    // temp is zero exactly when the two chunks are identical
    __m128i temp = _mm_xor_si128(*pa1, *pa2);
    // _mm_testz_si128 returns 1 when (temp & temp) == 0, so result is true
    // for equal chunks
    bool result = (bool)_mm_testz_si128(temp, temp);
}
_mm_testz_si128(a, b) returns 0 if a & b != 0 and it returns 1 if a & b == 0. The advantage is that you can use this version with the new AVX instructions as well, where the chunk size is 32 bytes.

Resources