I have a problem and hope you can help me.
I have a function written in C that returns hash a value. My
headache is when I execute the program from another tool it takes a lot of time to run, probably because inside my function I run a command that hashes my value in SHA256, so I would like to know if there is another way to do it, maybe a function or something like that.
Here is what I have:
const char *EncryptSHA256 (char *Arg1) {
char command[128];
char result[512];
//I want to replace from here
snprintf(command, sizeof command, "echo -n %s | sha256sum | cut -c1-64",Arg1);
FILE *fpipe;
if (0 == (fpipe = (FILE*)popen(command, "r"))) {
perror("popen() failed.");
exit(1);
}
fread(result, 1, 512, fpipe);
pclose(fpipe);
const char *sha256 = &result[0];
//to here
return sha256;
}
Your code has undefined behavior because you return a pointer to result, a local array with automatic storage. Reading from this array by the caller has undefined behavior.
You should at least make result static so its contents remain readable after EncryptSHA256 returns to its caller.
Regarding the inefficiency of the method, here is a public domain implementation of SHA256 that you can use directly inside your program:
/* public domain sha256 implementation based on fips180-3 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Public API */
struct sha256 {
uint64_t len; /* processed message length */
uint32_t h[8]; /* hash state */
uint8_t buf[64]; /* message block buffer */
};
/* reset state */
void sha256_init(struct sha256 *s);
/* process message */
void sha256_update(struct sha256 *s, const void *m, size_t len);
/* get message digest */
/* state is ruined after sum, keep a copy if multiple sum is needed */
/* part of the message might be left in s, zero it if secrecy is needed */
void sha256_sum(struct sha256 *s, uint8_t md[32]);
/* Implementation */
static uint32_t ror(uint32_t n, int k) {
return (n >> k) | (n << (32 - k));
}
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & y) | (z & (x | y)))
#define S0(x) (ror(x,2) ^ ror(x,13) ^ ror(x,22))
#define S1(x) (ror(x,6) ^ ror(x,11) ^ ror(x,25))
#define R0(x) (ror(x,7) ^ ror(x,18) ^ (x>>3))
#define R1(x) (ror(x,17) ^ ror(x,19) ^ (x>>10))
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void processblock(struct sha256 *s, const uint8_t *buf) {
uint32_t W[64], t1, t2, a, b, c, d, e, f, g, h;
int i;
for (i = 0; i < 16; i++) {
W[i] = (uint32_t)buf[4 * i + 0] << 24;
W[i] |= (uint32_t)buf[4 * i + 1] << 16;
W[i] |= (uint32_t)buf[4 * i + 2] << 8;
W[i] |= buf[4 * i + 3];
}
for (; i < 64; i++)
W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
a = s->h[0];
b = s->h[1];
c = s->h[2];
d = s->h[3];
e = s->h[4];
f = s->h[5];
g = s->h[6];
h = s->h[7];
#define ROUND(a,b,c,d,e,f,g,h,i) \
t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; \
t2 = S0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2;
for (i = 0; i < 64; ) {
ROUND(a, b, c, d, e, f, g, h, i); i++;
ROUND(h, a, b, c, d, e, f, g, i); i++;
ROUND(g, h, a, b, c, d, e, f, i); i++;
ROUND(f, g, h, a, b, c, d, e, i); i++;
ROUND(e, f, g, h, a, b, c, d, i); i++;
ROUND(d, e, f, g, h, a, b, c, i); i++;
ROUND(c, d, e, f, g, h, a, b, i); i++;
ROUND(b, c, d, e, f, g, h, a, i); i++;
}
#undef ROUND
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
static void pad(struct sha256 *s) {
unsigned r = s->len % 64;
s->buf[r++] = 0x80;
if (r > 56) {
memset(s->buf + r, 0, 64 - r);
r = 0;
processblock(s, s->buf);
}
memset(s->buf + r, 0, 56 - r);
s->len *= 8;
s->buf[56] = s->len >> 56;
s->buf[57] = s->len >> 48;
s->buf[58] = s->len >> 40;
s->buf[59] = s->len >> 32;
s->buf[60] = s->len >> 24;
s->buf[61] = s->len >> 16;
s->buf[62] = s->len >> 8;
s->buf[63] = s->len;
processblock(s, s->buf);
}
void sha256_init(struct sha256 *s) {
s->len = 0;
s->h[0] = 0x6a09e667;
s->h[1] = 0xbb67ae85;
s->h[2] = 0x3c6ef372;
s->h[3] = 0xa54ff53a;
s->h[4] = 0x510e527f;
s->h[5] = 0x9b05688c;
s->h[6] = 0x1f83d9ab;
s->h[7] = 0x5be0cd19;
}
void sha256_sum(struct sha256 *s, uint8_t md[20]) {
int i;
pad(s);
for (i = 0; i < 8; i++) {
md[4 * i + 0] = s->h[i] >> 24;
md[4 * i + 1] = s->h[i] >> 16;
md[4 * i + 2] = s->h[i] >> 8;
md[4 * i + 3] = s->h[i];
}
}
void sha256_update(struct sha256 *s, const void *m, unsigned long len) {
const uint8_t *p = m;
unsigned r = s->len % 64;
s->len += len;
if (r) {
if (len < 64 - r) {
memcpy(s->buf + r, p, len);
return;
}
memcpy(s->buf + r, p, 64 - r);
len -= 64 - r;
p += 64 - r;
processblock(s, s->buf);
}
for (; len >= 64; len -= 64, p += 64)
processblock(s, p);
memcpy(s->buf, p, len);
}
You would change your function to this:
const char *EncryptSHA256(char *Arg1) {
struct sha256 s;
unsigned char md[32];
static char result[65];
sha256_init(&s);
sha256_update(&s, Arg1, strlen(Arg1));
sha256_sum(&s, md);
for (int i = 0; i < 32; i++) {
sprintf(result + i * 2, "%02x", md[i]);
}
return result;
}
You could also change the API to pass an array of 32 unsigned characters to get the binary form if it is more convenient.
I want to create a polynomial with only one term: 6/5 and display it.
It should print this: 6/5X^0
The structure of polynomial is :
typedef struct __poly_struct_t *poly_t;
struct __poly_struct_t{
unsigned int deg;
ratio_t *coeffs;
};
Where ratio_t is an array of rational numbers, it's structure is:
typedef struct __ratio_struct_t{
int64_t num;
int64_t den;
}ratio_t;
I used two functions to construct this polynomial. polyFromRatioArray works: it prints 6/5X^0
poly_t polyFromRatioArray(ratio_t *c, unsigned int degree){
poly_t p = (struct __poly_struct_t*)malloc(sizeof(struct __poly_struct_t));
p->deg = degree;
p->coeffs = c;
return p;
}
The other one made the denominator overflowed: polyFromRatio prints 6/140218959144480X^0
poly_t polyFromRatio(ratio_t c){
return polyFromRatioArray(&c, 0);
}
Main function:
int main(){
ratio_t ra = createRatio((int64_t)6,(int64_t)5);
poly_t p1 = polyFromRatioArray(&ra, 0); // one that works
polyPrint(p1);
poly_t p2 = polyFromRatio(ra); // this doesn't
polyPrint(p2);
free(p1);
free(p2);
return 0;
}
Other fonctions involved:
ratio_t createRatio(int64_t a, int64_t b){
if(b == 0){
printf("Error : a divise by 0 \n");
exit(1);
}
ratio_t r;
int64_t pgcd = gcd(a, b); // gcd(int64_t a, int64_t b) is a function that finds pgcd using Euclid.
r.num = a/pgcd;
r.den = b/pgcd;
return r;
}
int64_t gcd(int64_t a, int64_t b){
int64_t u, v, g;
ext_eucl_div(&u, &v, &g, llabs(a), llabs(b));
return g;
}
void ext_eucl_div(int64_t *u, int64_t *v, int64_t *g, int64_t a, int64_t b){ // this function stocks pgcd of a and b in g
int64_t u1, u2, u3 , v1, v2, v3, q, t1, t2, t3;
int tour = 0;
do{
if(tour == 0){
u1 = 1; u2 = 0; u3 = a; v1 = 0; v2 = 1; v3 = b;
}
else{
u1 = v1; u2 = v2; u3 = v3; v1 = t1; v2 = t2; v3 = t3;
}
q = u3/v3;
t1 = u1 - q*v1;
t2 = u2 - q*v2;
t3 = u3%v3;
tour++;
} while(t3>=1);
*u = v1;
*v = v2;
*g = v3;
}
void polyPrint(poly_t p){
unsigned int i;
for(i=0; i<= p->deg; i++){
if(p->coeffs[i].num != 0){
printRatio(p->coeffs[i]);
if(i != p->deg) printf("X^%u + ", i);
else printf("X^%u\n", i);
}else printf("0\n");
}
}
void printRatio(ratio_t a){
printf("%" PRId64, a.num);
printf("/%" PRId64, a.den);
}
This is very strange, polyFromRatioArray and polyFromRatio seem like doing the same thing but nope.
I have found following SSE2 code written to multiply 2x2 matrix. Can anybody explain me how this code is executing. When I go through the code I feel it just add values into two positions of C(2x2) matrix (C[0],C[3]). lda is the size of the large matrix and A,B and C are 2x2 matrix.
static void simd_2x2(int lda, double* A, double* B, double* C)
{
__m128d a,b1,b2,c1,c2;
c1 = _mm_loadu_pd( C+0*lda ); //load unaligned block in C
c2 = _mm_loadu_pd( C+1*lda );
for( int i = 0; i < 2; ++i )
{
a = _mm_load_pd( A+i*lda );//load aligned i-th column of A
b1 = _mm_load1_pd( B+i+0*lda ); //load i-th row of B
b2 = _mm_load1_pd( B+i+1*lda );
c1=_mm_add_pd( c1, _mm_mul_pd( a, b2 ) ); //rank-1 update
c2=_mm_add_pd( c2, _mm_mul_pd( a, b2 ) );
}
_mm_storeu_pd( C+0*lda, c1 ); //store unaligned block in C
_mm_storeu_pd( C+1*lda, c2 );
}
I'm guessing the source of your confusion is that the double precision intrinsics (_mm_load_pd et al) each process a vector of two double precision values. lda appears to be the stride. So for example:
c1 = _mm_loadu_pd( C+0*lda );
c2 = _mm_loadu_pd( C+1*lda );
loads a 2x2 block of doubles from C, C+1, C+lda, C+lda+1.
you could check the input of the function to make sure if the matrices is initialized correct, I use similar code and got the right output:
#include <stdlib.h>
#include <stdio.h>
#include <emmintrin.h>
#include <xmmintrin.h>
int main(void)
{
double *a, *b, *c;
int a_r = 2, a_c = 2, b_c = 2, b_r = 2;
int i, j, k;
/* allocate memory for matrix one */
a = (double *)malloc(sizeof(double) * a_r * a_r);
for (i = 0; i < a_c * a_c; i++)
{
*(a + i) = 2;
}
/* allocate memory for matrix two */
b = (double *)malloc(sizeof(double *) * b_r * b_r);
for (i = 0; i < b_c * b_c; i++)
{
*(b + i) = 2;
}
/* allocate memory for sum matrix */
c = (double *)malloc(sizeof(double *) * a_r * a_r);
for (i = 0; i < b_c * b_c; i++)
{
*(c + i) = 0;
}
printf("Initializing matrices...\n");
int lda = 2;
__m128d veca, vecb1, vecb2, c1, c2;
c1 = _mm_loadu_pd(c + 0 * lda);
c2 = _mm_loadu_pd(c + 1 * lda);
for (i = 0; i < 2; i++)
{
veca = _mm_load_pd(a);
vecb1 = _mm_load1_pd(b + i + 0 * lda); //load i-th row of B
vecb2 = _mm_load1_pd(b + i + 1 * lda);
//printf("vb10 %f vb11 %f vb20 %f vb21 %f\n", vecb1[0], vecb1[1], vecb2[0], vecb2[1]);
c1 = _mm_add_pd(c1, _mm_mul_pd(veca, vecb1)); //rank-1 update
c2 = _mm_add_pd(c2, _mm_mul_pd(veca, vecb2));
//printf("c10 %f c11 %f c20 %f c21 %f\n", c1[0], c1[1], c2[0], c2[1]);
}
_mm_storeu_pd(c + 0 * lda, c1); //store unaligned block in C
_mm_storeu_pd(c + 1 * lda, c2);
for (i = 0; i < 4; i++)
{
printf("c%d :(%f)\n", i, *(c + i));
}
}
I have n (8 bit) character strings all of them of the same length (say m), and another string s of the same length. I need to compute Hamming distances from s to each of the others strings. In plain C, something like:
unsigned char strings[n][m];
unsigned char s[m];
int distances[n];
for(i=0; i<n; i++) {
int distances[i] = 0;
for(j=0; j<m; j++) {
if(strings[i][j] != s[j])
distances[i]++;
}
}
I would like to use SIMD instructions with gcc to perform such computations more efficiently. I have read that PcmpIstrI in SSE 4.2 can be useful and my target computer supports that instruction set, so I would prefer a solution using SSE 4.2.
EDIT:
I wrote following function to compute Hamming distance between two strings:
static inline int popcnt128(__m128i n) {
const __m128i n_hi = _mm_unpackhi_epi64(n, n);
return _mm_popcnt_u64(_mm_cvtsi128_si64(n)) + _mm_popcnt_u64(_mm_cvtsi128_si64(n_hi));
}
int HammingDist(const unsigned char *p1, unsigned const char *p2, const int len) {
#define MODE (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH | _SIDD_BIT_MASK | _SIDD_NEGATIVE_POLARITY)
__m128i smm1 = _mm_loadu_si128 ((__m128i*) p1);
__m128i smm2 = _mm_loadu_si128 ((__m128i*) p2);
__m128i ResultMask;
int iters = len / 16;
int diffs = 0;
int i;
for(i=0; i<iters; i++) {
ResultMask = _mm_cmpestrm (smm1,16,smm2,16,MODE);
diffs += popcnt128(ResultMask);
p1 = p1+16;
p2 = p2+16;
smm1 = _mm_loadu_si128 ((__m128i*)p1);
smm2 =_mm_loadu_si128 ((__m128i*)p2);
}
int mod = len % 16;
if(mod>0) {
ResultMask = _mm_cmpestrm (smm1,mod,smm2,mod,MODE);
diffs += popcnt128(ResultMask);
}
return diffs;
}
So I can solve my problem by means of:
for(i=0; i<n; i++) {
int distances[i] = HammingDist(s, strings[i], m);
}
Is this the best I can do or can I use the fact that one of the strings compared is always the same? In addition, should I do some alignment on my arrays to improve performance?
ANOTHER ATTEMPT
Following Harold's recomendation, I have written following code:
void _SSE_hammingDistances(const ByteP str, const ByteP strings, int *ds, const int n, const int m) {
int iters = m / 16;
__m128i *smm1, *smm2, diffs;
for(int j=0; j<n; j++) {
smm1 = (__m128i*) str;
smm2 = (__m128i*) &strings[j*(m+1)]; // m+1, as strings are '\0' terminated
diffs = _mm_setzero_si128();
for (int i = 0; i < iters; i++) {
diffs = _mm_add_epi8(diffs, _mm_cmpeq_epi8(*smm1, *smm2));
smm1 += 1;
smm2 += 1;
}
int s = m;
signed char *ptr = (signed char *) &diffs;
for(int p=0; p<16; p++) {
s += *ptr;
ptr++;
}
*ds = s;
ds++;
}
}
but I am not able to do the final addition of bytes in __m128i by using psadbw. Can anyone please help me with that?
Here's an improved version of your latest routine, which uses PSADBW (_mm_sad_epu8) to eliminate the scalar code:
void hammingDistances_SSE(const uint8_t * str, const uint8_t * strings, int * const ds, const int n, const int m)
{
const int iters = m / 16;
const __m128i smm1 = _mm_loadu_si128((__m128i*)str);
assert((m & 15) == 0); // m must be a multiple of 16
for (int j = 0; j < n; j++)
{
__m128i smm2 = _mm_loadu_si128((__m128i*)&strings[j*(m+1)]); // m+1, as strings are '\0' terminated
__m128i diffs = _mm_setzero_si128();
for (int i = 0; i < iters; i++)
{
diffs = _mm_sub_epi8(diffs, _mm_cmpeq_epi8(smm1, smm2));
}
diffs = _mm_sad_epu8(diffs, _mm_setzero_si128());
ds[j] = m - (_mm_extract_epi16(diffs, 0) + _mm_extract_epi16(diffs, 4));
}
}
I need to compare two buffers chunk-wise for equality. I don't need information about the relation of the two buffers, just if each two chunks are equal or not. My intel machine supports up to SSE4.2
The naive approach is:
const size_t CHUNK_SIZE = 16; //128bit for SSE2 integer registers
const int ARRAY_SIZE = 200000000;
char* array_1 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
char* array_2 = (char*)_aligned_malloc(ARRAY_SIZE, 16);
for (size_t i = 0; i < ARRAY_SIZE; )
{
volatile bool result = memcmp(array_1+i, array_2+i, CHUNK_SIZE);
i += CHUNK_SIZE;
}
Compared to my first try using SSE ever:
union U
{
__m128i m;
volatile int i[4];
} res;
for (size_t i = 0; i < ARRAY_SIZE; )
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
res.m = _mm_cmpeq_epi32(*pa1, *pa2);
volatile bool result = ( (res.i[0]==0) || (res.i[1]==0) || (res.i[2]==0) || (res.i[3]==0) );
i += CHUNK_SIZE;
}
The gain in speed is about 33%. Could I do any better?
You really shouldn't be using scalar code and unions to test all the individual vector elements - do something like this instead:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE)
{
const __m128i a1 = _mm_load_si128(array_1 + i);
const __m128i a2 = _mm_load_si128(array_2 + i);
const __m128i vcmp = _mm_cmpeq_epi32(a1, a2);
const int vmask = _mm_movemask_epi8(vcmp);
const bool result = (vmask == 0xffff);
// you probably want to break here if you get a mismatch ???
}
Since you can use SSE 4.1, there is another alternative that might be faster:
for (size_t i = 0; i < ARRAY_SIZE; i += CHUNK_SIZE;)
{
__m128i* pa1 = (__m128i*)(array_1+i);
__m128i* pa2 = (__m128i*)(array_2+i);
__m128i temp = _mm_xor_si128(*pa1, *pa2);
bool result = (bool)_mm_testz_si128(temp, temp);
}
_mm_testz_si128(a, b) returns 0 if a & b != 0 and it returns 1 if a & b == 0. The advantage is that you can use this version with the new AVX instructions as well, where the chunk size is 32 bytes.