Computing Hamming distances to several strings with SSE - c

I have n (8 bit) character strings all of them of the same length (say m), and another string s of the same length. I need to compute Hamming distances from s to each of the others strings. In plain C, something like:
unsigned char strings[n][m];
unsigned char s[m];
int distances[n];
for(i=0; i<n; i++) {
int distances[i] = 0;
for(j=0; j<m; j++) {
if(strings[i][j] != s[j])
distances[i]++;
}
}
I would like to use SIMD instructions with gcc to perform such computations more efficiently. I have read that PcmpIstrI in SSE 4.2 can be useful and my target computer supports that instruction set, so I would prefer a solution using SSE 4.2.
EDIT:
I wrote following function to compute Hamming distance between two strings:
static inline int popcnt128(__m128i n) {
const __m128i n_hi = _mm_unpackhi_epi64(n, n);
return _mm_popcnt_u64(_mm_cvtsi128_si64(n)) + _mm_popcnt_u64(_mm_cvtsi128_si64(n_hi));
}
int HammingDist(const unsigned char *p1, unsigned const char *p2, const int len) {
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
__m128i smm1 = _mm_loadu_si128 ((__m128i*) p1);
__m128i smm2 = _mm_loadu_si128 ((__m128i*) p2);
__m128i ResultMask;
int iters = len / 16;
int diffs = 0;
int i;
for(i=0; i<iters; i++) {
ResultMask = _mm_cmpestrm (smm1,16,smm2,16,MODE);
diffs += popcnt128(ResultMask);
p1 = p1+16;
p2 = p2+16;
smm1 = _mm_loadu_si128 ((__m128i*)p1);
smm2 =_mm_loadu_si128 ((__m128i*)p2);
}
int mod = len % 16;
if(mod>0) {
ResultMask = _mm_cmpestrm (smm1,mod,smm2,mod,MODE);
diffs += popcnt128(ResultMask);
}
return diffs;
}
So I can solve my problem by means of:
for(i=0; i<n; i++) {
int distances[i] = HammingDist(s, strings[i], m);
}
Is this the best I can do or can I use the fact that one of the strings compared is always the same? In addition, should I do some alignment on my arrays to improve performance?
ANOTHER ATTEMPT
Following Harold's recomendation, I have written following code:
void _SSE_hammingDistances(const ByteP str, const ByteP strings, int *ds, const int n, const int m) {
int iters = m / 16;
__m128i *smm1, *smm2, diffs;
for(int j=0; j<n; j++) {
smm1 = (__m128i*) str;
smm2 = (__m128i*) &strings[j*(m+1)]; // m+1, as strings are '\0' terminated
diffs = _mm_setzero_si128();
for (int i = 0; i < iters; i++) {
diffs = _mm_add_epi8(diffs, _mm_cmpeq_epi8(*smm1, *smm2));
smm1 += 1;
smm2 += 1;
}
int s = m;
signed char *ptr = (signed char *) &diffs;
for(int p=0; p<16; p++) {
s += *ptr;
ptr++;
}
*ds = s;
ds++;
}
}
but I am not able to do the final addition of bytes in __m128i by using psadbw. Can anyone please help me with that?

Here's an improved version of your latest routine, which uses PSADBW (_mm_sad_epu8) to eliminate the scalar code:
void hammingDistances_SSE(const uint8_t * str, const uint8_t * strings, int * const ds, const int n, const int m)
{
const int iters = m / 16;
const __m128i smm1 = _mm_loadu_si128((__m128i*)str);
assert((m & 15) == 0); // m must be a multiple of 16
for (int j = 0; j < n; j++)
{
__m128i smm2 = _mm_loadu_si128((__m128i*)&strings[j*(m+1)]); // m+1, as strings are '\0' terminated
__m128i diffs = _mm_setzero_si128();
for (int i = 0; i < iters; i++)
{
diffs = _mm_sub_epi8(diffs, _mm_cmpeq_epi8(smm1, smm2));
}
diffs = _mm_sad_epu8(diffs, _mm_setzero_si128());
ds[j] = m - (_mm_extract_epi16(diffs, 0) + _mm_extract_epi16(diffs, 4));
}
}

Related

Why am I getting a segfault in this genetic algorithm problem?

I'm trying to solve a CodeWars problem called "Training on Binary Genetic Algorithms." There is a fitness function that is preloaded. When the program is run, a test function creates a random 35-bit string and it uses my run function which is supposed to return the same 35-bit string. This string is supposed to be found using a genetic algorithm.
Here is my code:
#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
typedef double fitness_t (const char *, ...);
extern fitness_t fitness;
void generate (size_t length, char * s)
{
for (size_t i = 0; i < length; i++)
s[i] = rand() % 2 + 48;
}
double sum(size_t n, double ar[n])
{
double sum = 0;
for (int i = 0; i < n; i++)
sum += ar[i];
return sum;
}
void select (int size, char* population[size], double fitnesses[size])
{
double probabilities[size]; // normalized to 1
double r; // random number
int s1, s2;
int i;
for (i = 0; i < size; i++)
probabilities[i] = fitnesses[i] / sum(size, fitnesses);
// select first chromosome
r = (double)(rand() % 1000000) / 1000000; // generates a random float between 0 and 1
for (i = 0; i < size && r > 0; i++)
r -= probabilities[i];
s1 = i;
// select second chromosome
s2 = s1;
while (s2 == s1) // ensures the two chromosomes aren't the same
{
r = (double)(rand() % 1000000) / 1000000; // generates a random float between 0 and 1
for (i = 0; i < size && r > 0; i++)
r -= probabilities[i];
s2 = i;
}
// places these two chromosomes on top
char * temp = population[0];
population[0] = population[s1];
population[s1] = temp;
temp = population[1];
population[1] = population[s2];
population[s2] = temp;
}
void crossover (size_t n, char* s1, char* s2)
{
int r = rand() % n; // select a random bit to cross over at
char temp;
for (size_t i = r; i < n; i++) // swap every bit from bit r to bit n
{
temp = s1[i];
s1[i] = s2[i];
s2[i] = temp;
}
}
void mutate (size_t n, char* s, double p)
{
double r;
for (size_t i = 0; i < n; i++) // for each bit
{
r = (double)(rand() % 1000000) / 1000000; // random float between 0 and 1
if (r <= p) // if random number is less than probability
{
if (s[i] == '1') s[i] = '0'; // swap 0s and 1s
else s[i] = '1';
}
}
}
void bubbleSortPop(int size, char * population[size], double fitnesses[size])
{
int i, j;
char * temp_chrome;
double temp_fitness;
for (i = 0; i < size - 1; i++)
// Last i elements are already in place
for (j = 0; j < size - i - 1; j++)
if (fitnesses[j] < fitnesses[j + 1])
{
temp_chrome = population[j];
population[j] = population[j+1];
population[j+1] = temp_chrome;
temp_fitness = fitnesses[j];
fitnesses[j] = fitnesses[j+1];
fitnesses[j+1] = temp_fitness;
}
}
// this function changes the population.
// select, crossover, mutate
void evolve(fitness_t f, size_t size, int length, char * population[size],
double fitnesses[size], double p_c, double p_m)
{
char * s1, * s2;
double f1, f2;
char * temp_pop[size+2];
double temp_fit[size+2];
int i;
double r;
// moves two selected parents to the top
select(size, population, fitnesses);
// begin reproduction process; duplicate the chromosomes
s1 = population[0];
s2 = population[1];
// crossover
r = (double)(rand() % 1000000) / 1000000; // random float between 0 and 1
if (r < p_c) // probability of crossing over
crossover(length, s1, s2); // commences with crossover
// mutate
mutate(length, s1, p_m);
mutate(length, s2, p_m);
// calculate fitnesses
f1 = f(s1);
f2 = f(s2);
// merge fitneses
// copy original fitnesses into temp_fit
for (i = 0; i < size; i++)
temp_fit[i] = fitnesses[i];
// add new fitnesses
temp_fit[size] = f1;
temp_fit[size+1] = f2;
// merge children into population
// copy original population into temp_pop
for (i = 0; i < size; i++)
temp_pop[i] = population[i];
// add two children to temp_pop
temp_pop[size] = s1;
temp_pop[size+1] = s2;
// sort fitnesses and population
bubbleSortPop(size+2, temp_pop, temp_fit);
// add first 100 elements of temp_pop and fit to population and fitnesses
for (i = 0; i < size; i++)
{
population[i] = temp_pop[i];
fitnesses[i] = temp_fit[i];
}
}
char* runN (fitness_t f, int length, double p_c, double p_m, size_t iterations) {
}
char* run (fitness_t f, int length, double p_c, double p_m)
{
size_t size = 100;
char * population[size];
double fitnesses[size];
size_t i;
int r;
srand(time(0));
// initialize population array
for (i = 0; i < size; i++)
population[i] = malloc((length+1) * sizeof(char));
// generate original population
for (i = 0; i < size; i++)
{
generate(length, population[i]);
fitnesses[i] = f(population[i]);
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
}
// evolve the population
for (i = 0; i < 10; i++)
evolve(f, size, length, population, fitnesses, p_c, p_m);
// print result
printf("\nAFTER EVOLUTION\n");
for (i = 0; i < size; i++) // generates original population
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
// store best chromosome and free memory
char ret[length+1];
strcpy(ret, population[0]);
for (i = 0; i < size; i++)
free(population[i]);
return ret;
}
The issue is when I run my code, it nearly always comes out with a segfault at some point while printing the contents of population and fitness.
At least these problems:
Attempting to print a non-string with "%s"
Code uses "%s" and passes population[i] as if it points to a string. population[i] does not point to a string as it does not certainly have a null character. Result undefined behavior (UB). Perhaps attempting to access beyond allocated memory.
// Undefined behavior: population[i] not a string
printf("[%2d] %s %lf\n", i, population[i], fitnesses[i]);
Set the null character.
generate(length, population[i]);
population[i][length] = '\0'; // Add this here or equivalent in `generate()`.
Many compiler warnings (20+)
Enable all compiler warnings and fix those.
I found the solution. It was all the places where I tried to copy a string by making a string pointer and assigning it the same address as the pointer I wanted to copy. For example, in 'select', when I tried to move the two strings to the top, I did
char * temp = population[0];
population[0] = population[s1];
population[s1] = temp;
temp = population[1];
population[1] = population[s2];
population[s2] = temp;
I changed this to using strcpy(). I made the same mistake in 'evolve' where I tried to duplicate the chromosomes by just copying their address into variables, rather than the strings themselves:
char * s1, * s2;
// begin reproduction process; duplicate the chromosomes
s1 = population[0];
s2 = population[1];
I changed it to this:
char s1[length+1], s2[length+1];
strcpy(s1, population[0]);
strcpy(s2, population[1]);
After I made this change the segfault went away. Thanks for all your answers.

Intel Intrinsics code optimization

So i'm trying to multiply a constant with short int a[101] with intel intrinsics. I have done it with addition but i can't seem to figure why it wont work with multiplication. Also before we used ints of 32 bits and now we use 16 bit short so we can have double as many values in the intrinsics to fill the 128 bit as far as i understand?
naive example of what im trying to do:
int main(int argc, char **argv){
short int a[101];
int len = sizeof(a)/sizeof(short);
/*Populating array a with values 1 to 101*/
mult(len, a);
return 0;
}
int mult(int len, short int *a){
int result = 0;
for(int i=0; i<len; i++){
result += a[i]*20;
}
return result;
}
And my code trying to do the same in intrinsics
/*Same main as before with a short int a[101] containing values 1 to 101*/
int SIMD(int len, short int *a){
int res;
int val[4];
/*Setting constant value to mulitply with*/
__m128i sum = _mm_set1_epi16(20);
__m128i s = _mm_setzero_si128( );
for(int i=0; i<len/4*4; i += 4){
__m128i vec = _mm_loadu_si128((__m128i *)(a+i));
s += _mm_mul_epu32(vec,sum);
}
_mm_storeu_si128((__m128i*) val, s);
res += val[0] + val[1] + val[2] + val[3];
/*Haldeling tail*/
for(int i=len/4*4; i<len; i++){
res += a[i];
}
return res;
}
So i do get a number out as result, but the number does not match the naive method, i have tried other intrinsics and changing numbers to see if it makes any noticable difference but nothing comes close to the output i expect. The computation time is almost the same as the naive at the moment aswell.
There are 8 short in one __m128i. So:
for(int i=0; i<len/4*4; i += 4)
should be
for(int i=0; i<len/8*8; i += 8)`
and:
res += val[0] + val[1] + val[2] + val[3];
should be:
res += val[0] + val[1] + val[2] + val[3] + val[4] + val[5] + val[6] + val[7];
and:
for(int i=len/4*4; i<len; i++)
should be:
for(int i=len/8*8; i<len; i++)
In:
s += _mm_mul_epu32(vec,sum);
_mm_mul_epu32 operates on 32-bit elements. It should be:
s += _mm_mullo_epi16(vec, sum);
The object res is not initialized; it should be:
int res = 0;
Here is working code:
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
// Number of elements in an array.
#define NumberOf(x) (sizeof (x) / sizeof *(x))
// Compute the result with scalar arithmetic.
static int mult(int len, short int *a)
{
int result = 0;
for (size_t i=0; i<len; i++)
{
result += a[i]*20;
}
return result;
}
// Compute the result with SIMD arithmetic.
static int SIMD(int len, short int *a)
{
// Initialize the multiplier and the sum.
__m128i multiplier = _mm_set1_epi16(20);
__m128i s = _mm_setzero_si128( );
// Process blocks of 8 short.
for (int i=0; i<len/8*8; i += 8)
{
__m128i vec = _mm_loadu_si128((__m128i *)(a+i));
// Multtiply by multiplier and add to sum.
s = _mm_add_epi16(s, _mm_mullo_epi16(vec, multiplier));
}
// Store the sum so far so its individual elements can be manipulated.
short val[8];
_mm_storeu_si128((__m128i*) val, s);
// Add the individual elements.
int res = 0;
for (size_t i = 0; i < 8; ++i)
res += val[i];
// Add the elements in the tail.
for (size_t i = len/8*8; i < len; ++i)
{
res += a[i];
}
return res;
}
int main(int argc, char **argv)
{
short int a[96];
int len = NumberOf(a);
// Initiailize a.
for (size_t i = 0; i < len; ++i)
a[i] = i+1;
printf("sum by scalar arithmetic is %d.\n", mult(len, a));
printf("sum by SIMD arithmetic is %d.\n", SIMD(len, a));
return 0;
}

Algorithm for multiplying arbitrary length polynomials?

Let's say we represent polynomials as an array of floats, where the degree of each item in the polynomial corresponds to the index in the array (eg. 4.2x^5+x^2-1.4 would be represented as {-1.4, 0, 1, 0, 0, 4.2}.
My assignment question is to write a method in C that multiples two arbitrary length polynomials and prints out the result (rather than returning it).
Normally, when I ask questions on SO I include what I've attempted so far, but I'm genuinely completely clueless with this one. This is all I have:
void multpoly(float *a, int len_a, float *b, int len_b)
{
for(i = 0; i < len_result; i++)
{
printf(" %.5f, ", product[i]);
}
}
Any help would be greatly appreciated!
I believe this does what you want:
// constraints: 'result' must have space for at least len1 + len2 - 1 elements.
void multpoly(const float *poly1, int len1, const float *poly2, int len2, float *result)
{
int i, p1i, p2i;
int len_result = len1 + len2 - 1;
for (i = 0; i < len_result; i++) result[i] = 0.0;
for (p1i = 0; p1i < len1; ++p1i)
for (p2i = 0; p2i < len2; ++p2i)
result[p1i + p2i] += poly1[p1i] * poly2[p2i];
}
Ideone example of this function
#include <stdio.h>
#include <string.h>
main()
{
float a[5]={1, 2, 3};
float b[5]={2, 0, 1};
printPol(a, b, 3, 3);
}
void printPol(float*a, float*b, int len1, int len2)
{
int i, j;
// order of resulting poly is o1+o2
// o1 = len1 -1
// o2 = len2 -1
// length is order + 1 (+1 is the constant number)
int len = (len1-1)+(len2-1)+1;
float res[len];
//initialize
for(i=0;i<len; i++) res[i] = 0;
for(i=0; i<len1; i++)
for(j=0; j<len2; j++)
{
// mutually multiply all elements
res[i+j] += a[i]*b[j];
}
printf("%f ", res[0]);
for(i=1;i<len; i++) printf("+%f*x^%d ", res[i], i);
}

Search an ordered array in a CUDA kernel

I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?
Like Robert, I was thinking that a binary search has got to be faster that a naïve loop -- the upper bound of operation count for a binary search is O(log(n)), compared to O(N) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>
__device__ __host__
int midpoint(int a, int b)
{
return a + (b-a)/2;
}
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
int low = (A[i] <= val);
int high = (A[i+1] > val);
if (low && high) {
return 0;
} else if (low) {
return -1;
} else {
return 1;
}
}
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
while (imax >= imin) {
int imid = midpoint(imin, imax);
int e = eval(A, imid, val, imin, imax);
if(e == 0) {
return imid;
} else if (e < 0) {
imin = imid;
} else {
imax = imid;
}
}
return -1;
}
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
int res = -1;
for(int i=imin; i<(imax-1); i++) {
if (A[i+1] > val) {
res = i;
break;
}
}
return res;
}
template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
extern __shared__ int buff[];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int val = INT_MAX;
if (tid < Nin) val = source[threadIdx.x];
buff[threadIdx.x] = val;
__syncthreads();
int res;
switch(version) {
case 0:
res = binary_search(buff, threadIdx.x, 0, blockDim.x);
break;
case 1:
res = linear_search(buff, threadIdx.x, 0, blockDim.x);
break;
}
if (tid < Nout) result[tid] = res;
}
int main(void)
{
const int inputLength = 128000;
const int isize = inputLength * sizeof(int);
const int outputLength = 256;
const int osize = outputLength * sizeof(int);
int * hostInput = new int[inputLength];
int * hostOutput = new int[outputLength];
int * deviceInput;
int * deviceOutput;
for(int i=0; i<inputLength; i++) {
hostInput[i] = -200 + 5*i;
}
cudaMalloc((void**)&deviceInput, isize);
cudaMalloc((void**)&deviceOutput, osize);
cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
dim3 DimBlock(256, 1, 1);
dim3 DimGrid(1, 1, 1);
DimGrid.x = (outputLength / DimBlock.x) +
((outputLength % DimBlock.x > 0) ? 1 : 0);
size_t shmsz = DimBlock.x * sizeof(int);
for(int i=0; i<5; i++) {
search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
for(int i=0; i<5; i++) {
search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
for(int i=0; i<outputLength; i++) {
int idx = hostOutput[i];
int tidx = i % DimBlock.x;
assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
}
cudaDeviceReset();
return 0;
}
gave about a five times speed up compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someoneclever could do a lot better than that. But perhaps this gives you at least a few ideas.
can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
unsigned sp_a;
int val = idx+1;
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}
This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1) You can change val to be anything you like.
You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
I imagine there is a more clever approach that may use something akin to prefix sums.
This is the best algorithm so far. It's called: LPW Indexed Search
__global__ void find_position_lpw(int *a, int n)
{
int idx = threadIdx.x;
__shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
aux[idx] = 0;
if (idx < n)
atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
__syncthreads();
int tmp;
for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
{
if( idx >= j ) tmp = aux[idx - j];
__syncthreads();
if( idx >= j ) aux[idx] += tmp;
__syncthreads();
}
// result in "i"
int i = aux[idx] - 1;
// use "i" here...
// ...
}

compare buffers as fast as possible

I need to compare two buffers chunk-wise for equality. I don't need information about the relation of the two buffers, just if each two chunks are equal or not. My intel machine supports up to SSE4.2
The naive approach is:
const size_t CHUNK_SIZE = 16; //128bit for SSE2 integer registers
const int ARRAY_SIZE = 200000000;
char* array_1 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
char* array_2 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
for (size_t i = 0; i < ARRAY_SIZE; )
{
volatile bool result = memcmp(array_1+i, array_2+i, CHUNK_SIZE);
i += CHUNK_SIZE;
}
Compared to my first try using SSE ever:
union U
{
__m128i m;
volatile int i[4];
} res;
for (size_t i = 0; i < ARRAY_SIZE; )
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
res.m = _mm_cmpeq_epi32(*pa1, *pa2);
volatile bool result = ( (res.i[0]==0) || (res.i[1]==0) || (res.i[2]==0) || (res.i[3]==0) );
i += CHUNK_SIZE;
}
The gain in speed is about 33%. Could I do any better?
You really shouldn't be using scalar code and unions to test all the individual vector elements - do something like this instead:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
const __m128i a1 = _mm_load_si128(array_1 + i);
const __m128i a2 = _mm_load_si128(array_2 + i);
const __m128i vcmp = _mm_cmpeq_epi32(a1, a2);
const int vmask = _mm_movemask_epi8(vcmp);
const bool result = (vmask == 0xffff);
// you probably want to break here if you get a mismatch ???
}
Since you can use SSE 4.1, there is another alternative that might be faster:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE;)
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
__m128i temp = _mm_xor_si128(*pa1, *pa2);
bool result = (bool)_mm_testz_si128(temp, temp);
}
_mm_testz_si128(a, b) returns 0 if a & b != 0 and it returns 1 if a & b == 0. The advantage is that you can use this version with the new AVX instructions as well, where the chunk size is 32 bytes.

Resources