I have a problem and hope you can help me.
I have a function written in C that returns a hash value. My
headache is when I execute the program from another tool it takes a lot of time to run, probably because inside my function I run a command that hashes my value in SHA256, so I would like to know if there is another way to do it, maybe a function or something like that.
Here is what I have:
/*
 * Builds the shell pipeline "echo -n <Arg1> | sha256sum | cut -c1-64", runs it
 * through popen() and reads the command's output into a local buffer.
 * Calls exit(1) if popen() fails.
 *
 * NOTE(review): the returned pointer refers to `result`, an array with
 * automatic storage that ceases to exist when this function returns.
 * NOTE(review): Arg1 is inserted into the shell command unquoted, and the
 * fread() return value is not used to terminate the buffer.
 */
const char *EncryptSHA256 (char *Arg1) {
char command[128];
char result[512];
//I want to replace from here
// Format the pipeline; snprintf truncates the command to 127 characters.
snprintf(command, sizeof command, "echo -n %s | sha256sum | cut -c1-64",Arg1);
FILE *fpipe;
if (0 == (fpipe = (FILE*)popen(command, "r"))) {
perror("popen() failed.");
exit(1);
}
// Read at most 512 bytes of the pipeline's output.
fread(result, 1, 512, fpipe);
pclose(fpipe);
const char *sha256 = &result[0];
//to here
return sha256;
}
Your code has undefined behavior because you return a pointer to result, a local array with automatic storage. Reading from this array by the caller has undefined behavior.
You should at least make result static so its contents remain readable after EncryptSHA256 returns to its caller.
Regarding the inefficiency of the method, here is a public domain implementation of SHA256 that you can use directly inside your program:
/* public domain sha256 implementation based on fips180-3 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Public API */
/* Streaming SHA-256 context shared by sha256_init/update/sum. */
struct sha256 {
uint64_t len; /* processed message length (counted in bytes by sha256_update) */
uint32_t h[8]; /* hash state: eight 32-bit working words */
uint8_t buf[64]; /* message block buffer holding a partial 64-byte block */
};
/* reset state: zero the length and load the initial hash words */
void sha256_init(struct sha256 *s);
/* process message: absorb len bytes starting at m; may be called repeatedly */
void sha256_update(struct sha256 *s, const void *m, size_t len);
/* get message digest: pad the message and write the 32-byte digest to md */
/* state is ruined after sum, keep a copy if multiple sum is needed */
/* part of the message might be left in s, zero it if secrecy is needed */
void sha256_sum(struct sha256 *s, uint8_t md[32]);
/* Implementation */
/*
 * Rotate the 32-bit value n right by k bit positions.
 *
 * The count is reduced modulo 32 on both shifts, so k == 0 (or any multiple
 * of 32) returns n unchanged instead of evaluating a 32-bit shift by 32,
 * which is undefined behavior in C.
 */
static uint32_t ror(uint32_t n, int k) {
    unsigned shift = (unsigned)k & 31u;

    return (n >> shift) | (n << ((32u - shift) & 31u));
}
/*
 * SHA-256 logical functions on 32-bit words.
 * Ch  : for each bit, choose y where x is 1 and z where x is 0.
 * Maj : for each bit, the majority value of x, y and z.
 * S0/S1 : the "big sigma" functions used in the compression rounds.
 * R0/R1 : the "small sigma" functions used in the message schedule.
 * Every argument is parenthesized so expression arguments expand correctly;
 * arguments are evaluated more than once, so pass side-effect-free values.
 */
#define Ch(x,y,z) ((z) ^ ((x) & ((y) ^ (z))))
#define Maj(x,y,z) (((x) & (y)) | ((z) & ((x) | (y))))
#define S0(x) (ror((x),2) ^ ror((x),13) ^ ror((x),22))
#define S1(x) (ror((x),6) ^ ror((x),11) ^ ror((x),25))
#define R0(x) (ror((x),7) ^ ror((x),18) ^ ((x)>>3))
#define R1(x) (ror((x),17) ^ ror((x),19) ^ ((x)>>10))
/* The 64 SHA-256 round constants; entry i is added in round i of processblock(). */
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
/*
 * Compress one 64-byte block into the hash state.
 * s   : context whose h[0..7] words are read and updated in place.
 * buf : the 64 bytes to process.
 */
static void processblock(struct sha256 *s, const uint8_t *buf) {
uint32_t W[64], t1, t2, a, b, c, d, e, f, g, h;
int i;
/* Load the block as sixteen big-endian 32-bit words. */
for (i = 0; i < 16; i++) {
W[i] = (uint32_t)buf[4 * i + 0] << 24;
W[i] |= (uint32_t)buf[4 * i + 1] << 16;
W[i] |= (uint32_t)buf[4 * i + 2] << 8;
W[i] |= buf[4 * i + 3];
}
/* Expand the message schedule to 64 words. */
for (; i < 64; i++)
W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
/* Start the working variables from the current hash state. */
a = s->h[0];
b = s->h[1];
c = s->h[2];
d = s->h[3];
e = s->h[4];
f = s->h[5];
g = s->h[6];
h = s->h[7];
/* One round: only d and h are written; the other variables are "shifted"
   by rotating the argument list at each call site below. */
#define ROUND(a,b,c,d,e,f,g,h,i) \
t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; \
t2 = S0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2;
/* 64 rounds, unrolled eight at a time; i is advanced after each round. */
for (i = 0; i < 64; ) {
ROUND(a, b, c, d, e, f, g, h, i); i++;
ROUND(h, a, b, c, d, e, f, g, i); i++;
ROUND(g, h, a, b, c, d, e, f, i); i++;
ROUND(f, g, h, a, b, c, d, e, i); i++;
ROUND(e, f, g, h, a, b, c, d, i); i++;
ROUND(d, e, f, g, h, a, b, c, i); i++;
ROUND(c, d, e, f, g, h, a, b, i); i++;
ROUND(b, c, d, e, f, g, h, a, i); i++;
}
#undef ROUND
/* Add the compressed block into the running hash state. */
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
/*
 * Append the final padding to the buffered message and compress it:
 * a 0x80 byte, zero bytes up to offset 56, then the message length in bits
 * as a 64-bit big-endian integer. If the 0x80 byte leaves fewer than eight
 * bytes free, the current block is zero-filled and compressed first.
 * s->len is converted from bytes to bits as a side effect.
 */
static void pad(struct sha256 *s) {
    unsigned used = s->len % 64;
    unsigned pos;
    int shift;

    s->buf[used] = 0x80;
    used += 1;

    if (used > 56) {
        /* No room for the length field: flush this block and start a new one. */
        memset(s->buf + used, 0, 64 - used);
        processblock(s, s->buf);
        used = 0;
    }
    memset(s->buf + used, 0, 56 - used);

    s->len *= 8;
    /* Store the bit length most-significant byte first in buf[56..63]. */
    for (pos = 56, shift = 56; pos < 64; pos++, shift -= 8) {
        s->buf[pos] = s->len >> shift;
    }
    processblock(s, s->buf);
}
/*
 * Reset a context for a new message: clear the byte count and load the
 * eight initial hash words. The block buffer is left untouched.
 */
void sha256_init(struct sha256 *s) {
    static const uint32_t initial_state[8] = {
        0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
        0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
    };
    int word;

    s->len = 0;
    for (word = 0; word < 8; word++) {
        s->h[word] = initial_state[word];
    }
}
/*
 * Finish the hash: pad the buffered message, then write the eight state
 * words to md as 32 bytes, each word most-significant byte first.
 *
 * s  : context; its state is consumed by the padding step, so copy it first
 *      if the digest is needed more than once.
 * md : output buffer of at least 32 bytes (the declared size now matches the
 *      prototype and the 32 bytes the loop actually writes).
 */
void sha256_sum(struct sha256 *s, uint8_t md[32]) {
    int i;

    pad(s);
    for (i = 0; i < 8; i++) {
        md[4 * i + 0] = s->h[i] >> 24;
        md[4 * i + 1] = s->h[i] >> 16;
        md[4 * i + 2] = s->h[i] >> 8;
        md[4 * i + 3] = s->h[i];
    }
}
/*
 * Absorb len bytes starting at m into the hash.
 *
 * s   : context previously set up with sha256_init().
 * m   : input bytes.
 * len : number of input bytes; typed size_t so the definition matches the
 *       prototype on platforms where size_t is not unsigned long.
 *
 * Complete 64-byte blocks are compressed immediately; a trailing partial
 * block stays in s->buf until more data arrives or sha256_sum() is called.
 */
void sha256_update(struct sha256 *s, const void *m, size_t len) {
    const uint8_t *p = m;
    unsigned r = s->len % 64; /* bytes already waiting in s->buf */

    s->len += len;
    if (r) {
        if (len < 64 - r) {
            /* Still not a full block: just append and wait for more. */
            memcpy(s->buf + r, p, len);
            return;
        }
        /* Complete the buffered block and compress it. */
        memcpy(s->buf + r, p, 64 - r);
        len -= 64 - r;
        p += 64 - r;
        processblock(s, s->buf);
    }
    /* Compress whole blocks straight from the caller's memory. */
    for (; len >= 64; len -= 64, p += 64) {
        processblock(s, p);
    }
    /* Keep the tail (fewer than 64 bytes) for the next call. */
    memcpy(s->buf, p, len);
}
You would change your function to this:
const char *EncryptSHA256(char *Arg1) {
struct sha256 s;
unsigned char md[32];
static char result[65];
sha256_init(&s);
sha256_update(&s, Arg1, strlen(Arg1));
sha256_sum(&s, md);
for (int i = 0; i < 32; i++) {
sprintf(result + i * 2, "%02x", md[i]);
}
return result;
}
You could also change the API to pass an array of 32 unsigned characters to get the binary form if it is more convenient.
I am trying to solve the elliptic curve discrete logarithm problem using Pollard's rho (find k where Q = kP). I searched for an implementation in C and found one, but after adding my problem-specific data in the main function I get a segmentation fault (core dumped).
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <gmp.h>
#include <limits.h>
#include <sys/time.h>
#include <openssl/ec.h>
#include <openssl/bn.h>
#include <openssl/obj_mac.h> // for NID_secp256k1
#define POLLARD_SET_COUNT 16
#if defined(WIN32) || defined(_WIN32)
#define EXPORT __declspec(dllexport)
#else
#define EXPORT
#endif
#define MAX_RESTART 100
/*
 * Map a curve point to one of POLLARD_SET_COUNT partitions.
 *
 * The point is serialized in uncompressed form and the last byte of that
 * encoding is reduced modulo POLLARD_SET_COUNT.
 *
 * Returns an index in [0, POLLARD_SET_COUNT). If the point cannot be
 * serialized (EC_POINT_point2oct reports length 0 or a short write), index 0
 * is returned so callers never index out of range.
 */
int ec_point_partition(const EC_GROUP *ecgrp, const EC_POINT *x) {
    size_t len = EC_POINT_point2oct(ecgrp, x, POINT_CONVERSION_UNCOMPRESSED, NULL, 0, NULL);

    /* A zero length would make the array below zero-sized and ret[len - 1] out of bounds. */
    if (len == 0) {
        return 0;
    }

    unsigned char ret[len];
    if (EC_POINT_point2oct(ecgrp, x, POINT_CONVERSION_UNCOMPRESSED, ret, len, NULL) != len) {
        return 0;
    }
    return ret[len - 1] % POLLARD_SET_COUNT;
}
// P generator
// Q result*P
// order of the curve
// result
//Reference: J. Sattler and C. P. Schnorr, "Generating random walks in groups"
int elliptic_pollard_rho_dlog(const EC_GROUP *group, const EC_POINT *P, const EC_POINT *Q, const BIGNUM *order, BIGNUM *res) {
printf("Pollard rho discrete log algorithm... \n");
BN_CTX* ctx;
ctx = BN_CTX_new();
int i, j;
int iterations = 0;
if ( !EC_POINT_is_on_curve(group, P, ctx ) || !EC_POINT_is_on_curve(group, Q, ctx ) ) return 1;
EC_POINT *X1 = EC_POINT_new(group);
EC_POINT *X2 = EC_POINT_new(group);
BIGNUM *c1 = BN_new();
BIGNUM *d1 = BN_new();
BIGNUM *c2 = BN_new();
BIGNUM *d2 = BN_new();
BIGNUM* a[POLLARD_SET_COUNT];
BIGNUM* b[POLLARD_SET_COUNT];
EC_POINT* R[POLLARD_SET_COUNT];
BN_zero(c1); BN_zero(d1);
BN_zero(c2); BN_zero(d2);
for (i = 0; i < POLLARD_SET_COUNT; i++) {
a[i] = BN_new();
b[i] = BN_new();
R[i] = EC_POINT_new(group);
BN_rand_range(a[i], order);
BN_rand_range(b[i], order);
// R = aP + bQ
EC_POINT_mul(group, R[i], a[i], Q, b[i], ctx);
//ep_norm(R[i], R[i]);
}
BN_rand_range(c1, order);
BN_rand_range(d1, order);
// X1 = c1*P + d1*Q
EC_POINT_mul(group, X1, c1, Q, d1, ctx);
//ep_norm(X1, X1);
BN_copy(c2, c1);
BN_copy(d2, d1);
EC_POINT_copy(X2, X1);
double work_time = (double) clock();
do {
j = ec_point_partition(group, X1);
EC_POINT_add(group, X1, X1, R[j], ctx);
BN_mod_add(c1, c1, a[j], order, ctx);
BN_mod_add(d1, d1, b[j], order, ctx);
for (i = 0; i < 2; i++) {
j = ec_point_partition(group, X2);
EC_POINT_add(group, X2, X2, R[j], ctx);
BN_mod_add(c2, c2, a[j], order, ctx);
BN_mod_add(d2, d2, b[j], order, ctx);
}
iterations++;
printf("Iteration %d \r",iterations );
} while ( EC_POINT_cmp(group, X1, X2, ctx) != 0 ) ;
printf("\n ");
work_time = ( (double) clock() - work_time ) / (double)CLOCKS_PER_SEC;
printf("Number of iterations %d %f\n",iterations, work_time );
BN_mod_sub(c1, c1, c2, order, ctx);
BN_mod_sub(d2, d2, d1, order, ctx);
if (BN_is_zero(d2) == 1) return 1;
//d1 = d2^-1 mod order
BN_mod_inverse(d1, d2, order, ctx);
BN_mod_mul(res, c1, d1, order, ctx);
for (int k = 0; k < POLLARD_SET_COUNT; ++k) {
BN_free(a[k]);
BN_free(b[k]);
EC_POINT_free(R[k]);
}
BN_free(c1); BN_free(d1);
BN_free(c2); BN_free(d2);
EC_POINT_free(X1); EC_POINT_free(X2);
BN_CTX_free(ctx);
return 0;
}
/*
 * Driver: builds a curve from hard-coded parameters, sets the points P and Q
 * from hard-coded coordinates, calls elliptic_pollard_rho_dlog() and prints
 * the result.
 *
 * NOTE(review): every *_str is decimal text, but it is handed to BN_bin2bn(),
 * and the length passed is sizeof(pointer), not the string length - confirm
 * against the OpenSSL BN_bin2bn / BN_dec2bn documentation.
 * NOTE(review): `res` and `str` are used below without ever being assigned.
 */
int main(int argc, char *argv[])
{
/* Curve parameters p, a, b. */
unsigned char *p_str="134747661567386867366256408824228742802669457";
unsigned char *a_str="-1";
unsigned char *b_str="0";
BIGNUM *p = BN_bin2bn(p_str, sizeof(p_str), NULL);
BIGNUM *a = BN_bin2bn(a_str, sizeof(a_str), NULL);
BIGNUM *b = BN_bin2bn(b_str, sizeof(b_str), NULL);
BN_CTX* ctx;
ctx = BN_CTX_new();
EC_GROUP* g = EC_GROUP_new(EC_GFp_simple_method());
EC_GROUP_set_curve_GFp(g,p,a,b,ctx);
/* Affine coordinates of P. */
unsigned char *XP_str="18185174461194872234733581786593019886770620";
unsigned char *YP_str="74952280828346465277451545812645059041440154";
BN_CTX* ctx1;
ctx1 = BN_CTX_new();
BIGNUM *XP = BN_bin2bn(XP_str, sizeof(XP_str), NULL);
BIGNUM *YP = BN_bin2bn(YP_str, sizeof(YP_str), NULL);
EC_POINT* P = EC_POINT_new(g);
EC_POINT_set_affine_coordinates_GFp(g,P,XP,YP,ctx1);
/* Affine coordinates of Q. */
unsigned char *XQ_str="76468233972358960368422190121977870066985660";
unsigned char *YQ_str="33884872380845276447083435959215308764231090";
BIGNUM* XQ = BN_bin2bn(XQ_str, sizeof(XQ_str), NULL);
BIGNUM* YQ = BN_bin2bn(YQ_str, sizeof(YQ_str), NULL);
EC_POINT *Q = EC_POINT_new(g);
BN_CTX* ctx2;
ctx2 = BN_CTX_new();
EC_POINT_set_affine_coordinates_GFp(g,Q,XQ,YQ,ctx2);
char * str;
/* Value passed as the `order` argument. */
unsigned char *N_str="2902021510595963727029";
BIGNUM *N = BN_bin2bn(N_str, sizeof(N_str), NULL);
/* NOTE(review): res is an uninitialized pointer when it is passed here. */
BIGNUM *res;
elliptic_pollard_rho_dlog (g,P,Q,N,res);
/* NOTE(review): str has not been pointed at any buffer before this call. */
BN_bn2mpi(res,str);
printf("%s\n", str);
return 0;
}
This is the statement that causes the segmentation fault:
BN_bn2mpi(res,str);
Part 1. Python version.
Update: See new Part 2 of my answer, there I present C++ version of same algorithm as this Python version.
Your task is very interesting!
Maybe you wanted your code to be fixed but instead I decided to implement from scratch pure Python (Part 1 of answer) and pure C++ (Part 2) solutions without using any external non-standard modules. This kind of solutions from scratch without dependencies I think are very useful for educational purposes.
Algorithm like this is quite complex, and Python is easy enough to make implementation of such algorithm possible in short time.
In code below I used help of Wikipedia to implement Pollard's Rho Discrete Logarithm and Elliptic Curve Point Multiplication.
Code doesn't depend on any external modules, it uses just few built-in Python modules. There is a possibility to use gmpy2 module if you install it through python -m pip install gmpy2 and uncomment line #import gmpy2 in code.
You may see that I generate random base point myself and compute its order. I don't use any external curve like BitCoin's secp256k1, or other standard curves.
In the beginning of main() function you can see that I set up bits = 24, this is number of bits for prime modulus of curve, order of curve (number of distinct points) will have about the same bit size. You may set it to bits = 32 to try solving task for bigger curve.
As is well known, the algorithm's complexity is O(Sqrt(Curve_Order)): it takes this many elliptic curve point additions. Point additions are not primitive operations and also take some time. So a run of the algorithm for a curve order bit size of bits = 32 takes about 10-15 seconds. bits = 64 would take far too long in Python, but the C++ version (that I'm going to implement later) will be fast enough to crack 64 bits within an hour or so.
Sometimes you may notice when running code that it shows that Pollard Rho failed few times, this happens if algorithm tries to find modular inverse of non-invertible number (non-coprime to modulus) both at last step of Pollard Rho and also when computing Infinite Point as a result of elliptic curve point addition. Same kind of failure also happens from time to time in regular Pollard Rho Integer Factorization when GCD is equal to N.
Try it online!
import random
#random.seed(10)
class ECPoint:
    """Affine point on the curve y^2 = x^3 + A*x + B over the integers mod N.

    The point at infinity is represented by x = y = None (see inf()).
    Point addition never returns infinity: whenever a result would require
    inverting a non-invertible value, InvError is raised instead, and the
    callers below rely on that exception as their "reached infinity" signal.

    Attributes: A, B, N (curve), x, y (coordinates), q (order of the base
    point, 0 if unknown) and, after calc_q, q_ps (prime factors of q).
    """

    gmpy2 = None
    #import gmpy2
    import random

    class InvError(Exception):
        """Raised when a modular inverse does not exist; value = (gcd, a, n)."""

        def __init__(self, *args):
            self.value = args

    @classmethod
    def Int(cls, x):
        """Convert x to int, or to gmpy2.mpz when gmpy2 is enabled."""
        return int(x) if cls.gmpy2 is None else cls.gmpy2.mpz(x)

    @classmethod
    def fermat_prp(cls, n, trials = 32):
        """Fermat probable-prime test with `trials` random bases."""
        # https://en.wikipedia.org/wiki/Fermat_primality_test
        if n <= 16:
            return n in (2, 3, 5, 7, 11, 13)
        for i in range(trials):
            if pow(cls.random.randint(2, n - 2), n - 1, n) != 1:
                return False
        return True

    @classmethod
    def rand_prime(cls, bits):
        """Return a random odd probable prime with exactly `bits` bits."""
        while True:
            p = cls.random.randrange(1 << (bits - 1), 1 << bits) | 1
            if cls.fermat_prp(p):
                return p

    @classmethod
    def base_gen(cls, bits = 128, *, min_order_pfactor = 0):
        """Generate a random curve and base point whose order could be computed.

        Retries until the modulus N is a `bits`-bit prime with N % 4 == 3, the
        random point passes the square-root check, find_order() succeeds and
        every prime factor of the order is >= min_order_pfactor.
        """
        while True:
            while True:
                N = cls.rand_prime(bits)
                if N % 4 != 3:
                    continue
                x0, y0, A = [cls.random.randrange(1, N) for i in range(3)]
                # Choose B so that (x0, y0) lies on the curve.
                B = (y0 ** 2 - x0 ** 3 - A * x0) % N
                y0_calc = pow(x0 ** 3 + A * x0 + B, (N + 1) // 4, N)
                if y0 == y0_calc:
                    break
            bp = ECPoint(A, B, N, x0, y0, calc_q = True)
            if bp.q is not None and min(bp.q_ps) >= min_order_pfactor:
                break
        assert bp.q > 1 and (bp.q + 1) * bp == bp
        return bp

    def __init__(self, A, B, N, x, y, *, q = 0, prepare = True, calc_q = False):
        """Create a point.

        prepare: reduce all values mod N and assert that the curve is
            non-singular, that N % 4 == 3 and that (x, y) is on the curve.
        calc_q: compute the point's order and its prime factors.
        """
        if prepare:
            N = self.Int(N)
            assert (x is None) == (y is None), (x, y)
            A, B, x, y, q = [(self.Int(e) % N if e is not None else None) for e in [A, B, x, y, q]]
            assert (4 * A ** 3 + 27 * B ** 2) % N != 0
            assert N % 4 == 3
            if x is not None:
                assert (y ** 2 - x ** 3 - A * x - B) % N == 0, (hex(N), hex((y ** 2 - x ** 3 - A * x) % N))
                assert y == pow(x ** 3 + A * x + B, (N + 1) // 4, N)
        self.A, self.B, self.N, self.x, self.y, self.q = A, B, N, x, y, q
        if calc_q:
            self.q, self.q_ps = self.find_order()

    def copy(self):
        """Return a new point with the same curve, coordinates and order."""
        return ECPoint(self.A, self.B, self.N, self.x, self.y, q = self.q, prepare = False)

    def inf(self):
        """Return the point at infinity (x = y = None) on the same curve."""
        return ECPoint(self.A, self.B, self.N, None, None, q = self.q, prepare = False)

    def find_order(self, *, _m = 1, _ps = []):
        """Return (order, prime_factors) of this point, or (None, []) on failure.

        _m and _ps carry the partial product and its factors through the
        recursion. A multiplication that raises InvError is taken to mean the
        multiple has reached the point at infinity.
        """
        if 1:
            try:
                r = _m * self
            except self.InvError:
                return _m, _ps
            B = 2 * self.N
            for p in self.gen_primes():
                if p * p > B * 2:
                    return None, []
                assert _m % p != 0, (_m, p)
                assert p <= B, (p, B)
                hi = 1
                try:
                    # Multiply r by p until the prime power would exceed B.
                    for cnt in range(1, 1 << 60):
                        hi *= p
                        if hi > B:
                            cnt -= 1
                            break
                        r = p * r
                except self.InvError:
                    return self.find_order(_m = hi * _m, _ps = [p] * cnt + _ps)
        else:
            # Alternative slower way
            r = self
            for i in range(1 << 60):
                try:
                    r = r + self
                except self.InvError:
                    return i + 2, []

    @classmethod
    def gen_primes(cls, *, ps = [2, 3]):
        """Yield primes in increasing order by trial division.

        The default list `ps` is shared between calls on purpose: it caches
        the primes found so far.
        """
        yield from ps
        for p in range(ps[-1] + 2, 1 << 60, 2):
            is_prime = True
            for e in ps:
                if e * e > p:
                    break
                if p % e == 0:
                    is_prime = False
                    break
            if is_prime:
                ps.append(p)
                yield ps[-1]

    def __add__(self, other):
        """Affine point addition; raises InvError when the slope is undefined."""
        if self.x is None:
            return other.copy()
        if other.x is None:
            return self.copy()
        A, B, N, q = self.A, self.B, self.N, self.q
        Px, Py, Qx, Qy = self.x, self.y, other.x, other.y
        if Px == Qx and Py == Qy:
            # Doubling: slope of the tangent.
            s = ((Px * Px * 3 + A) * self.inv(Py * 2, N)) % N
        else:
            # Addition: slope of the chord.
            s = ((Py - Qy) * self.inv(Px - Qx, N)) % N
        x = (s * s - Px - Qx) % N
        y = (s * (Px - x) - Py) % N
        return ECPoint(A, B, N, x, y, q = q, prepare = False)

    def __rmul__(self, other):
        """Scalar multiplication `other * point` for other > 0 (double-and-add)."""
        assert other > 0, other
        if other == 1:
            return self.copy()
        # Start from r = P and add (other - 1) * P bit by bit.
        other = self.Int(other - 1)
        r = self
        while True:
            if other & 1:
                r = r + self
                if other == 1:
                    return r
            other >>= 1
            self = self + self

    @classmethod
    def inv(cls, a, n):
        """Return the inverse of a modulo n; raise InvError(gcd, a, n) if none."""
        a %= n
        if cls.gmpy2 is None:
            try:
                return pow(a, -1, n)
            except ValueError:
                import math
                raise cls.InvError(math.gcd(a, n), a, n)
        else:
            g, s, t = cls.gmpy2.gcdext(a, n)
            if g != 1:
                raise cls.InvError(g, a, n)
            return s % n

    def __repr__(self):
        return str(dict(x = self.x, y = self.y, A = self.A, B = self.B, N = self.N, q = self.q))

    def __eq__(self, other):
        """Points are equal when coordinates, curve parameters and q all match."""
        for i, (a, b) in enumerate([(self.x, other.x), (self.y, other.y), (self.A, other.A), (self.B, other.B), (self.N, other.N), (self.q, other.q)]):
            if a != b:
                return False
        return True
def pollard_rho_ec_log(a, b, bp):
    """Find x with b == x * a using Pollard's rho for logarithms.

    a, b: points on the curve of `bp`; bp supplies N, q, inf(), inv(),
    rand_prime() and InvError.
    The walk keeps counters (ai, bi) for how often a and b were added.
    When an inversion fails (InvError) the attempt is reported and the search
    restarts with a new random partition prime.
    """
    # https://en.wikipedia.org/wiki/Pollard%27s_rho_algorithm_for_logarithms#Algorithm
    import math
    for itry in range(1 << 60):
        try:
            i = -1
            part_p = bp.rand_prime(max(3, int(math.log2(bp.N) / 2)))

            def f(x):
                # Next point of the walk, chosen by the partition of x.
                mod3 = ((x.x or 0) % part_p) % 3
                if mod3 == 0:
                    return b + x
                elif mod3 == 1:
                    return x + x
                elif mod3 == 2:
                    return a + x
                else:
                    assert False

            def g(x, n):
                # Updated coefficient of `a` matching the step f(x).
                mod3 = ((x.x or 0) % part_p) % 3
                if mod3 == 0:
                    return n
                elif mod3 == 1:
                    return (2 * n) % bp.q
                elif mod3 == 2:
                    return (n + 1) % bp.q
                else:
                    assert False

            def h(x, n):
                # Updated coefficient of `b` matching the step f(x).
                mod3 = ((x.x or 0) % part_p) % 3
                if mod3 == 0:
                    return (n + 1) % bp.q
                elif mod3 == 1:
                    return (2 * n) % bp.q
                elif mod3 == 2:
                    return n
                else:
                    assert False

            a0, b0, x0 = 0, 0, bp.inf()
            aim1, bim1, xim1 = a0, b0, x0
            a2im2, b2im2, x2im2 = a0, b0, x0
            for i in range(1, 1 << 60):
                # Slow walker: one step.
                xi = f(xim1)
                ai = g(xim1, aim1)
                bi = h(xim1, bim1)
                # Fast walker: two steps.
                x2i = f(f(x2im2))
                a2i = g(f(x2im2), g(x2im2, a2im2))
                b2i = h(f(x2im2), h(x2im2, b2im2))
                if xi == x2i:
                    return (bp.inv(bi - b2i, bp.q) * (a2i - ai)) % bp.q
                xim1, aim1, bim1 = xi, ai, bi
                x2im2, a2im2, b2im2 = x2i, a2i, b2i
        except bp.InvError as ex:
            print(f'Try {itry:>4}, Pollard-Rho failed, invert err at iter {i:>7},', ex.value)
def main():
    """Demo: generate a random curve, pick a secret x and recover it."""
    import random, math
    bits = 24  # bit size of the prime modulus of the generated curve
    print('Generating base point, wait...')
    bp = ECPoint.base_gen(bits, min_order_pfactor = 10)
    print('order', bp.q, '=', ' * '.join([str(e) for e in bp.q_ps]))
    k0, k1 = [random.randrange(1, bp.q) for i in range(2)]
    a = k0 * bp
    x = k1
    b = x * a
    x_calc = pollard_rho_ec_log(a, b, bp)
    print('our x', x, 'found x', x_calc)
    print('equal points:', x * a == x_calc * a)

if __name__ == '__main__':
    main()
Output:
Generating base point, wait...
order 5805013 = 19 * 109 * 2803
Try 0, Pollard-Rho failed, invert err at iter 1120, (109, 1411441, 5805013)
Try 1, Pollard-Rho failed, invert err at iter 3992, (19, 5231802, 5805013)
our x 990731 found x 990731
equal points: True
Part 2. C++ version.
Almost identical code as above, but rewritten in C++.
This C++ version is much faster than the Python one: the C++ code spends around 1 minute on a 1 GHz CPU to crack a 48-bit curve. The same amount of time is spent by Python on a 32-bit curve.
To remind, complexity is O(Sqrt(Curve_Order)) it means that if C++ spends same time for 48 bits (sqrt is 2^24) as Python for 32 bits (sqrt is 2^16) then C++ is around 2^24/2^16 = 2^8 = 256 times faster than Python's version.
Following version is compilable only in CLang, because it uses 128 and 192 bit integers. In GCC there also exists __int128 but no 192/256 ints. 192-bit int is only used in BarrettMod() function, so if you replace this function's body with return x % n; then you don't need 256-bit int and then you can compile in GCC.
I implemented Barrett Reduction algorithm, to replace operation of taking modulus (% N) that is based on slow division with special Barrett formula based on just multiply/shift/sub. This boosts modulus operations several times.
Try it online!
#include <cstdint>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <iomanip>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
using u64 = uint64_t;
using u128 = unsigned __int128;
using u192 = unsigned _ExtInt(192);
using Word = u64;
using DWord = u128;
using SWord = std::make_signed_t<Word>;
using TWord = u192;
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#define LN { g_log << " LN " << __LINE__ << " " << std::flush; }
#define DUMP(x) { g_log << " " << (#x) << " = " << (x) << " " << std::flush; }
static auto & g_log = std::cout;
// Affine point on the curve y^2 = x^3 + A*x + B over integers modulo N,
// with 64-bit words and 128-bit intermediates. The pair (0, 0) stands for the
// point at infinity. Operations that would need the inverse of a
// non-invertible value throw InvError.
class ECPoint {
public:
// Thrown when a modular inverse does not exist; what() reports gcd, x, mod.
class InvError : public std::runtime_error {
public:
InvError(Word const & gcd, Word const & x, Word const & mod)
: std::runtime_error("(gcd " + std::to_string(gcd) + ", x " + std::to_string(x) +
", mod " + std::to_string(mod) + ")") {}
};
// Returns a^b mod c by square-and-multiply.
static Word pow_mod(Word a, Word b, Word const & c) {
// https://en.wikipedia.org/wiki/Modular_exponentiation
Word r = 1;
while (b != 0) {
if (b & 1)
r = (DWord(r) * a) % c;
a = (DWord(a) * a) % c;
b >>= 1;
}
return r;
}
// Returns a uniformly distributed value in [begin, end) from a thread-local
// 64-bit Mersenne Twister.
static Word rand_range(Word const & begin, Word const & end) {
u64 const seed = (u64(std::random_device{}()) << 32) + std::random_device{}();
thread_local std::mt19937_64 rng{seed};
ASSERT(begin < end);
return std::uniform_int_distribution<Word>(begin, end - 1)(rng);
}
// Fermat probable-prime test with `trials` random bases; n <= 16 is decided
// by an explicit list.
static bool fermat_prp(Word const & n, size_t trials = 32) {
// https://en.wikipedia.org/wiki/Fermat_primality_test
if (n <= 16)
return n == 2 || n == 3 || n == 5 || n == 7 || n == 11 || n == 13;
for (size_t i = 0; i < trials; ++i)
if (pow_mod(rand_range(2, n - 2), n - 1, n) != 1)
return false;
return true;
}
// Draws odd candidates derived from [begin, end) until one passes fermat_prp.
static Word rand_prime_range(Word begin, Word end) {
while (true) {
Word const p = rand_range(begin, end) | 1;
if (fermat_prp(p))
return p;
}
}
// Random probable prime with exactly `bits` bits.
static Word rand_prime(size_t bits) {
return rand_prime_range(Word(1) << (bits - 1), Word((DWord(1) << bits) - 1));
}
// Computes the Barrett constants (r, s) for modulus n: with k chosen so that
// 2^(k - extra) > n and 2k >= 64, r = floor(2^(2k) / n) and s = 2k - 64.
std::tuple<Word, size_t> BarrettRS(Word n) {
size_t constexpr extra = 3;
for (size_t k = 0; k < sizeof(DWord) * 8; ++k) {
if (2 * (k + extra) < sizeof(Word) * 8)
continue;
if ((DWord(1) << k) <= DWord(n))
continue;
k += extra;
ASSERT_MSG(2 * k < sizeof(DWord) * 8, "k " + std::to_string(k));
DWord r = (DWord(1) << (2 * k)) / n;
ASSERT_MSG(DWord(r) < (DWord(1) << (sizeof(Word) * 8)),
"k " + std::to_string(k) + " n " + std::to_string(n));
ASSERT(2 * k >= sizeof(Word) * 8);
return std::make_tuple(Word(r), size_t(2 * k - sizeof(Word) * 8));
}
ASSERT(false);
}
// Barrett reduction of x modulo n using the constants from BarrettRS.
// q approximates x / n via a multiply and shifts; t = x - q * n.
// With Adjust == true, n is subtracted once more when t >= n (done with a
// branch-free mask built from the sign of t - n); with Adjust == false the
// result may still be >= n.
template <bool Adjust>
static Word BarrettMod(DWord const & x, Word const & n, Word const & r, size_t s) {
//return x % n;
DWord const q = DWord(((TWord(x) * r) >> (sizeof(Word) * 8)) >> s);
Word t = Word(DWord(x) - q * n);
if constexpr(Adjust) {
Word const mask = ~Word(SWord(t - n) >> (sizeof(Word) * 8 - 1));
t -= mask & n;
}
return t;
}
// Single conditional subtraction: maps a value in [0, 2n) to [0, n).
static Word Adjust(Word const & a, Word const & n) {
return a >= n ? a - n : a;
}
// Reductions modulo N_ (curve modulus) and q_ (point order):
// ...n = without the final adjustment, ...a = with it.
Word modNn(DWord const & a) const { return BarrettMod<false>(a, N_, N_br_, N_bs_); }
Word modNa(DWord const & a) const { return BarrettMod<true>(a, N_, N_br_, N_bs_); }
Word modQn(DWord const & a) const { return BarrettMod<false>(a, q_, Q_br_, Q_bs_); }
Word modQa(DWord const & a) const { return BarrettMod<true>(a, q_, Q_br_, Q_bs_); }
// Plain remainder, used where no Barrett constants are available.
static Word mod(DWord const & a, Word const & n) { return a % n; }
// Generates a random curve and base point: N is a `bits`-bit prime with
// N % 4 == 3, B is chosen so that (x0, y0) is on the curve, and the loop
// retries until the order was found and all its prime factors are
// >= min_order_pfactor.
static ECPoint base_gen(size_t bits = 128, Word min_order_pfactor = 0) {
while (true) {
Word const N = rand_prime(bits);
if (mod(N, 4) != 3)
continue;
Word const
x0 = rand_range(1, N), y0 = rand_range(1, N), A = rand_range(1, N),
B = mod(mod(DWord(y0) * y0, N) + N * 2 - mod(DWord(mod(DWord(x0) * x0, N)) * x0, N) - mod(DWord(A) * x0, N), N),
y0_calc = pow_mod(mod(DWord(y0) * y0, N), (N + 1) >> 2, N);
if (y0 != y0_calc)
continue;
auto const bp = ECPoint(A, B, N, x0, y0, 0, true, true);
auto BpCheckOrder = [&]{
for (auto e: bp.q_ps())
if (e < min_order_pfactor)
return false;
return true;
};
if (!(bp.q() != 0 && !bp.q_ps().empty() && BpCheckOrder()))
continue;
ASSERT(bp.q() > 1 && bp * (bp.q() + 1) == bp);
return bp;
}
ASSERT(false);
}
// prepare: reduce the inputs mod N, assert the curve is non-singular, that
// N % 4 == 3 and that (x, y) is on the curve (unless it is (0, 0)), and
// compute the Barrett constants.
// calc_q: compute the order of the point and its prime factors.
ECPoint(Word A, Word B, Word N, Word x, Word y, Word q = 0, bool prepare = true, bool calc_q = false) {
if (prepare) {
A = mod(A, N); B = mod(B, N); x = mod(x, N); y = mod(y, N); q = mod(q, N);
ASSERT(mod(4 * mod(DWord(mod(DWord(A) * A, N)) * A, N) + 27 * mod(DWord(B) * B, N), N) != 0);
ASSERT(mod(N, 4) == 3);
if (!(x == 0 && y == 0)) {
ASSERT(mod(mod(DWord(y) * y, N) + 3 * N - mod(DWord(mod(DWord(x) * x, N)) * x, N) - mod(DWord(A) * x, N) - B, N) == 0);
ASSERT(y == pow_mod(mod(DWord(mod(DWord(x) * x, N)) * x, N) + mod(DWord(A) * x, N) + B, (N + 1) >> 2, N));
}
std::tie(N_br_, N_bs_) = BarrettRS(N);
if (q != 0)
std::tie(Q_br_, Q_bs_) = BarrettRS(q);
}
std::tie(A_, B_, N_, x_, y_, q_) = std::tie(A, B, N, x, y, q);
if (calc_q) {
std::tie(q_, q_ps_) = find_order();
if (q_ != 0)
std::tie(Q_br_, Q_bs_) = BarrettRS(q_);
}
}
// NOTE(review): built with prepare == false, so the Barrett constants of the
// new object keep their default value 0 - confirm callers do not rely on them.
auto copy() const {
return ECPoint(A_, B_, N_, x_, y_, q_, false);
}
// Point at infinity (0, 0) on the same curve; also built with prepare == false.
auto inf() const {
return ECPoint(A_, B_, N_, 0, 0, q_, false);
}
// Extends a thread-local prime list by trial division up to B and returns a
// reference to the whole list (it can hold primes above B from earlier calls).
static auto const & gen_primes(Word const B) {
thread_local std::vector<Word> ps = {2, 3};
for (Word p = ps.back() + 2; p <= B; p += 2) {
bool is_prime = true;
for (auto const e: ps) {
if (e * e > p)
break;
if (p % e == 0) {
is_prime = false;
break;
}
}
if (is_prime)
ps.push_back(p);
}
return ps;
}
// Returns (order, prime factors) of this point, or (0, {}) when the search
// gives up. _m and _ps carry the partial product and its factors through the
// recursion; an InvError during multiplication is taken to mean that the
// multiple has reached the point at infinity.
std::tuple<Word, std::vector<Word>> find_order(Word _m = 1, std::vector<Word> _ps = {}) const {
ASSERT(_m <= 2 * N_);
if constexpr(1) {
auto r = *this;
try {
r *= _m;
} catch (InvError const &) {
return std::make_tuple(_m, _ps);
}
Word const B = 2 * N_;
for (Word const p: gen_primes(std::llround(std::cbrt(B) + 1))) {
if (p * p * p > B)
break;
ASSERT(p <= B);
size_t cnt = 0;
Word hi = 1;
try {
// Multiply r by p until the prime power would exceed B.
for (cnt = 1;; ++cnt) {
if (hi * p > B) {
cnt -= 1;
break;
}
hi *= p;
r *= p;
}
} catch (InvError const & ex) {
_ps.insert(_ps.begin(), cnt, p);
return find_order(hi * _m, _ps);
}
}
} else {
// Alternative slower way
auto r = *this;
for (Word i = 0;; ++i)
try {
r += *this;
} catch (InvError const &) {
_ps.clear();
return std::make_tuple(i + 2, _ps);
}
}
_ps.clear();
return std::make_tuple(Word(0), _ps);
}
// Extended Euclid: returns (gcd, s, t) where s and t are the coefficients
// computed for a and b respectively.
static std::tuple<Word, SWord, SWord> EGCD(Word const & a, Word const & b) {
Word ro = 0, r = 0, qu = 0, re = 0;
SWord so = 0, s = 0;
std::tie(ro, r, so, s) = std::make_tuple(a, b, 1, 0);
while (r != 0) {
std::tie(qu, re) = std::make_tuple(ro / r, ro % r);
std::tie(ro, r) = std::make_tuple(r, re);
std::tie(so, s) = std::make_tuple(s, so - s * SWord(qu));
}
SWord const to = (SWord(ro) - SWord(a) * so) / SWord(b);
return std::make_tuple(ro, so, to);
}
// Modular inverse of a modulo n; throws InvError when gcd(a, n) != 1.
// any_n_q selects the reduction: 0 = plain %, 1 = Barrett mod N_, 2 = Barrett mod q_.
Word inv(Word a, Word const & n, size_t any_n_q = 0) const {
ASSERT(n > 0);
a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
auto [gcd, s, t] = EGCD(a, n);
if (gcd != 1)
throw InvError(gcd, a, n);
// s may be negative; adding n before reducing keeps the value non-negative.
a = Word(SWord(n) + s);
a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
return a;
}
Word invN(Word a) const { return inv(a, N_, 1); }
Word invQ(Word a) const { return inv(a, q_, 2); }
// Affine point addition in place. (0, 0) acts as the identity. The doubling
// formula is used when both points are equal, the chord formula otherwise.
// Multiples of N_ are added before subtractions so the unsigned
// differences do not wrap (coordinates may be unadjusted residues).
ECPoint & operator += (ECPoint const & o) {
if (x_ == 0 && y_ == 0) {
*this = o;
return *this;
}
if (o.x_ == 0 && o.y_ == 0)
return *this;
Word const Px = x_, Py = y_, Qx = o.x_, Qy = o.y_;
Word s = 0;
if ((Adjust(Px, N_) == Adjust(Qx, o.N_)) && (Adjust(Py, N_) == Adjust(Qy, o.N_)))
s = modNn(DWord(modNn(DWord(Px) * Px * 3) + A_) * invN(Py * 2));
else
s = modNn(DWord(Py + 2 * N_ - Qy) * invN(Px + 2 * N_ - Qx));
x_ = modNn(DWord(s) * s + 4 * N_ - Px - Qx);
y_ = modNn(DWord(s) * (Px + 2 * N_ - x_) + 2 * N_ - Py);
return *this;
}
ECPoint operator + (ECPoint const & o) const {
ECPoint c = *this;
c += o;
return c;
}
// Scalar multiplication in place for k > 0 (double-and-add): the result is
// built as P + (k - 1) * P.
ECPoint & operator *= (Word k) {
auto const ok = k;
ASSERT(k > 0);
if (k == 1)
return *this;
k -= 1;
auto r = *this, s = *this;
while (true) {
if (k & 1) {
r += s;
if (k == 1)
break;
}
k >>= 1;
s += s;
}
// Disabled self-check that compares against repeated addition.
if constexpr(0) {
auto r2 = *this;
for (u64 i = 1; i < ok; ++i)
r2 += *this;
ASSERT(r == r2);
}
*this = r;
return *this;
}
ECPoint operator * (Word k) const {
ECPoint r = *this;
r *= k;
return r;
}
// Equal when curve parameters and order match and the coordinates agree
// after one conditional subtraction of N.
bool operator == (ECPoint const & o) const {
return A_ == o.A_ && B_ == o.B_ && N_ == o.N_ && q_ == o.q_ &&
Adjust(x_, N_) == Adjust(o.x_, o.N_) && Adjust(y_, N_) == Adjust(o.y_, o.N_);
}
Word const & q() const { return q_; }
std::vector<Word> const & q_ps() const { return q_ps_; }
Word const & x() const { return x_; }
private:
// Curve parameters, order, coordinates, and Barrett multipliers for N_ and q_.
Word A_ = 0, B_ = 0, N_ = 0, q_ = 0, x_ = 0, y_ = 0, N_br_ = 0, Q_br_ = 0;
// Barrett shift amounts for N_ and q_.
size_t N_bs_ = 0, Q_bs_ = 0;
// Prime factors of q_, filled in by find_order().
std::vector<Word> q_ps_;
};
// Finds x with b == a * x using Pollard's rho for logarithms.
// a, b : points on the curve of bp; bp supplies q(), inf(), invQ(), modQa().
// Each attempt walks with a fresh random partition prime; when an inversion
// fails (InvError) the attempt is logged and the search restarts.
Word pollard_rho_ec_log(ECPoint const & a, ECPoint const & b, ECPoint const & bp) {
// https://en.wikipedia.org/wiki/Pollard%27s_rho_algorithm_for_logarithms#Algorithm
for (u64 itry = 0;; ++itry) {
u64 i = 0;
try {
Word const part_p = bp.rand_prime_range(8, bp.q() >> 4);
// Reduces a value known to be below 2 * q into [0, q).
auto ModQ = [&](Word n) {
return n >= bp.q() ? n - bp.q() : n;
};
// Next point of the walk, chosen by the partition of x.
auto f = [&](auto const & x) -> ECPoint {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return b + x;
else if (mod3 == 1)
return x + x;
else if (mod3 == 2)
return a + x;
else
ASSERT(false);
};
// Updated coefficient of `a` matching the step f(x).
auto const g = [&](auto const & x, Word n) -> Word {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return n;
else if (mod3 == 1)
return ModQ(2 * n);
else if (mod3 == 2)
return ModQ(n + 1);
else
ASSERT(false);
};
// Updated coefficient of `b` matching the step f(x).
auto const h = [&](auto const & x, Word n) -> Word {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return ModQ(n + 1);
else if (mod3 == 1)
return ModQ(2 * n);
else if (mod3 == 2)
return n;
else
ASSERT(false);
};
Word aim1 = 0, bim1 = 0, a2im2 = 0, b2im2 = 0, ai = 0, bi = 0, a2i = 0, b2i = 0;
ECPoint xim1 = bp.inf(), x2im2 = bp.inf(), xi = bp.inf(), x2i = bp.inf();
for (i = 1;; ++i) {
// Slow walker: one step.
xi = f(xim1);
ai = g(xim1, aim1);
bi = h(xim1, bim1);
// Fast walker: two steps.
x2i = f(f(x2im2));
a2i = g(f(x2im2), g(x2im2, a2im2));
b2i = h(f(x2im2), h(x2im2, b2im2));
// Collision: x = (a2i - ai) / (bi - b2i) mod q; q is added first so the
// unsigned differences stay non-negative.
if (xi == x2i)
return bp.modQa(DWord(bp.invQ(bp.q() + bi - b2i)) * (bp.q() + a2i - ai));
std::tie(xim1, aim1, bim1) = std::tie(xi, ai, bi);
std::tie(x2im2, a2im2, b2im2) = std::tie(x2i, a2i, b2i);
}
} catch (ECPoint::InvError const & ex) {
g_log << "Try " << std::setfill(' ') << std::setw(4) << itry << ", Pollard-Rho failed, invert err at iter "
<< std::setw(7) << i << ", " << ex.what() << std::endl;
}
}
}
// Demo run: generate a 36-bit curve with a base point whose order has no
// prime factor below 50, pick a random secret x, recover it with Pollard's
// rho and log the timings and results.
void test() {
    using Clock = std::chrono::high_resolution_clock;
    auto const start = Clock::now();
    // Seconds since `start`, with millisecond resolution.
    auto Time = [&]() -> double {
        auto const elapsed = Clock::now() - start;
        return std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count() / 1'000.0;
    };
    size_t constexpr bits = 36;

    g_log << "Generating base point, wait... " << std::flush;
    double tb = Time();
    auto const bp = ECPoint::base_gen(bits, 50);
    g_log << "Time " << Time() - tb << " sec" << std::endl;

    g_log << "order " << bp.q() << " = ";
    for (auto factor: bp.q_ps()) {
        g_log << factor << " * " << std::flush;
    }
    g_log << std::endl;

    Word const k0 = ECPoint::rand_range(1, bp.q());
    Word const x = ECPoint::rand_range(1, bp.q());
    auto a = bp * k0;
    auto b = a * x;

    g_log << "Searching discrete logarithm... " << std::endl;
    tb = Time();
    Word const x_calc = pollard_rho_ec_log(a, b, bp);
    g_log << "Time " << Time() - tb << " sec" << std::endl;

    g_log << "our x " << x << ", found x " << x_calc << std::endl;
    g_log << "equal points: " << std::boolalpha << (a * x == a * x_calc) << std::endl;
}
// Runs the demo and logs any std::exception instead of terminating.
int main() {
    try {
        test();
    }
    catch (std::exception const & err) {
        g_log << "Exception: " << err.what() << std::endl;
    }
    return 0;
}
Output:
Generating base point, wait... Time 38.932 sec
order 195944962603297 = 401 * 4679 * 9433 * 11071 *
Searching discrete logarithm...
Time 69.791 sec
our x 15520105103514, found x 15520105103514
equal points: true
I've implemented sha2 256 based on the rfc spec found at: https://www.rfc-editor.org/rfc/rfc4634.
My implementation:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
/* The 64 SHA-256 round constants; K[t] is added in round t of process(). */
uint32_t K[] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2,
};
/* "Choose": each result bit comes from y where x has a 1 and from z where x has a 0. */
uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
    uint32_t from_y = x & y;
    uint32_t from_z = ~x & z;

    return from_y ^ from_z;
}
/* "Majority": each result bit is the value held by at least two of x, y and z. */
uint32_t MAJ(uint32_t x, uint32_t y, uint32_t z) {
    uint32_t xy = x & y;
    uint32_t xz = x & z;
    uint32_t yz = y & z;

    return xy ^ xz ^ yz;
}
/*
 * Rotate x left by n bit positions.
 * The count is reduced modulo 32 on both shifts, so n == 0 returns x
 * unchanged instead of shifting a 32-bit value by 32 (undefined behavior).
 */
uint32_t ROTL(uint32_t x, short n) {
    unsigned shift = (unsigned)n & 31u;

    return (x << shift) | (x >> ((32u - shift) & 31u));
}
/*
 * Rotate x right by n bit positions.
 * The count is reduced modulo 32 on both shifts, so n == 0 returns x
 * unchanged instead of shifting a 32-bit value by 32 (undefined behavior).
 */
uint32_t ROTR(uint32_t x, short n) {
    unsigned shift = (unsigned)n & 31u;

    return (x >> shift) | (x << ((32u - shift) & 31u));
}
/* Big sigma 0: XOR of x rotated right by 2, 13 and 22 bits. */
uint32_t BIGS0(uint32_t x) {
    uint32_t r2 = ROTR(x, 2);
    uint32_t r13 = ROTR(x, 13);
    uint32_t r22 = ROTR(x, 22);

    return r2 ^ r13 ^ r22;
}
/* Big sigma 1: XOR of x rotated right by 6, 11 and 25 bits. */
uint32_t BIGS1(uint32_t x) {
    uint32_t r6 = ROTR(x, 6);
    uint32_t r11 = ROTR(x, 11);
    uint32_t r25 = ROTR(x, 25);

    return r6 ^ r11 ^ r25;
}
/* Small sigma 0: x rotated right by 7 and by 18, XORed with x shifted right by 3. */
uint32_t SSIG0(uint32_t x) {
    uint32_t r7 = ROTR(x, 7);
    uint32_t r18 = ROTR(x, 18);
    uint32_t s3 = x >> 3;

    return r7 ^ r18 ^ s3;
}
/* Small sigma 1: x rotated right by 17 and by 19, XORed with x shifted right by 10. */
uint32_t SSIG1(uint32_t x) {
    uint32_t r17 = ROTR(x, 17);
    uint32_t r19 = ROTR(x, 19);
    uint32_t s10 = x >> 10;

    return r17 ^ r19 ^ s10;
}
/*
 * Assemble four bytes into a 32-bit word, with t[0] as the least significant
 * byte and t[3] as the most significant.
 * Each byte is widened to uint32_t before shifting, so a top byte >= 0x80 is
 * no longer shifted into the sign bit of a promoted int.
 * NOTE(review): SHA-256 reads message words most-significant byte first;
 * confirm this byte order is intended before using it for block loading.
 */
uint32_t toInt(uint8_t *t) {
    return ((uint32_t)t[3] << 24)
         | ((uint32_t)t[2] << 16)
         | ((uint32_t)t[1] << 8)
         |  (uint32_t)t[0];
}
/*
 * Runs the SHA-256 compression over one 16-word block, starting from the
 * initial hash values, and prints the eight resulting words as hex.
 * block : sixteen 32-bit words of an already padded message block.
 *
 * NOTE(review): the message schedule below applies SSIG0 to the index
 * expression t-15 rather than to the word W[t-15].
 * NOTE(review): the allocation size uses sizeof(uint32_t*), and the malloc
 * result is not checked.
 */
void process(uint32_t *block) {
/* Initial hash values H0..H7. */
uint32_t H[] = {
0x6a09e667,
0xbb67ae85,
0x3c6ef372,
0xa54ff53a,
0x510e527f,
0x9b05688c,
0x1f83d9ab,
0x5be0cd19
};
uint32_t *W = (uint32_t *) malloc (sizeof(uint32_t*) * 64);
/* The first 16 schedule words are the block itself. */
for (int t = 0; t < 16; t++) {
W[t] = block[t];
}
/* The remaining 48 words are derived from earlier schedule entries. */
for (int t = 16; t < 64; t++) {
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(t-15) + W[t-16];
}
/* Working variables start from the current hash values. */
uint32_t a = H[0];
uint32_t b = H[1];
uint32_t c = H[2];
uint32_t d = H[3];
uint32_t e = H[4];
uint32_t f = H[5];
uint32_t g = H[6];
uint32_t h = H[7];
uint32_t T1 = 0;
uint32_t T2 = 0;
/* 64 compression rounds. */
for(int t =0; t < 64 ; t++) {
T1 = h + BIGS1(e) + CH(e,f,g) + K[t] + W[t];
T2 = BIGS0(a) + MAJ(a,b,c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
/* Add the working variables back into the hash values. */
H[0] = a + H[0];
H[1] = b + H[1];
H[2] = c + H[2];
H[3] = d + H[3];
H[4] = e + H[4];
H[5] = f + H[5];
H[6] = g + H[6];
H[7] = h + H[7];
/* Print the digest as 64 hex characters (no trailing newline). */
for (int j = 0; j < 8; j++ ) {
printf("%08x", H[j] );
}
free(W);
}
/*
 * Hashes the single pre-padded 512-bit block for the message "abc":
 * word 0 holds 'a' 'b' 'c' followed by the 0x80 padding byte, words 1..14
 * are zero, and word 15 holds the message length in bits (0x18 = 24).
 */
int main(void)
{
    uint32_t block[16] = {0};

    block[0] = 0x61626380;
    block[15] = 0x00000018;
    process(block);
    return 0;
}
For now I'm only doing one 512 message block and my testing message is 'abc'. Doing all the preprocessing as required results in a padded message block as follows:
0x61626380 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000018
After feeding it through my implementation I get the hash as: 4b9cc43100a30340dbc8f2328e2c80a91fdbd7b8cd20962d1b64e31283c4b99d
Whereas the correct hash is:
ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
Tracing my output and comparing it to the example provided at http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf page 34. My implementation seems to go wrong at iteration 16 (thus t=16) when the variable e needs to be calculated.
Any help would be appreciated.
This line:
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(t-15) + W[t-16];
Should be
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(W[t-15]) + W[t-16];
The problem kicks in on iteration 16 because that's when you first refer to a value buffered using that assignment (earlier values were just copies of the initial block).