Complement Multiply With Carry (CMWC) RNG C winmain#16 compile issue

Complement Multiply With Carry (CMWC) RNG C winmain#16 compile issue - c

http://en.wikipedia.org/wiki/Multiply-with-carry#Complementary-multiply-with-carry_RNGs
has some code in C
I'm trying to compile it, but when I do I get this error
c:\program files (x86)\codeblocks\mingw\bin..\lib\gcc\mingw32\4.4.1......\libmingw32.a(main.o):main.c|| undefined reference to `WinMain#16'|
Here's the code
I hope to figure out how to implement it and use it instead of booster's mersenne twister.
As someone pointed out, I was missing my main function, but now with it, I'm still having an issue getting it to run.
#include <stdint.h>
#define PHI 0x9e3779b9
static uint32_t Q[4096], c = 362436;
int main
{
init_rand(6);
int c = rand_cmwc();
};
void init_rand(uint32_t x)
{
int i;
Q[0] = x;
Q[1] = x + PHI;
Q[2] = x + PHI + PHI;
for (i = 3; i < 4096; i++)
Q[i] = Q[i - 3] ^ Q[i - 2] ^ PHI ^ i;
}
uint32_t rand_cmwc(void)
{
uint64_t t, a = 18782LL;
static uint32_t i = 4095;
uint32_t x, r = 0xfffffffe;
i = (i + 1) & 4095;
t = a * Q[i] + c;
c = (t >> 32);
x = t + c;
if (x < c) {
x++;
c++;
}
return (Q[i] = r - x);
}

Okay, I figured it out.
I was missing my main function.
I then implemented the code as a class, in hopes to make it thread safe (allowing for multiple seeds)
#include <iostream>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#define PHI 0x9e3779b9
using namespace std;
static uint32_t Q[4096], c = 362436;
class cmwc
{
public:
void init_rand(uint32_t x)
{
int i;
Q[0] = x;
Q[1] = x + PHI;
Q[2] = x + PHI + PHI;
for (i = 3; i < 4096; i++)
Q[i] = Q[i - 3] ^ Q[i - 2] ^ PHI ^ i;
}
uint32_t rand_cmwc(void)
{
uint64_t t, a = 18782LL;
static uint32_t i = 4095;
uint32_t x, r = 0xfffffffe;
i = (i + 1) & 4095;
t = a * Q[i] + c;
c = (t >> 32);
x = t + c;
if (x < c) {
x++;
c++;
}
return (Q[i] = r - x);
}
uint32_t random_num;
};
int main()
{
cmwc rng1;
rng1.init_rand(time(NULL));
rng1.random_num = 1 + rng1.rand_cmwc()%6;
//int r = rng1.rand_cmwc();
cout << "die: " << rng1.random_num << endl;
return 0;
}

Related

Hash value from C function

I have a problem and hope you can help me.
I have a function written in C that returns hash a value. My
headache is when I execute the program from another tool it takes a lot of time to run, probably because inside my function I run a command that hashes my value in SHA256, so I would like to know if there is another way to do it, maybe a function or something like that.
Here is what I have:
const char *EncryptSHA256 (char *Arg1) {
char command[128];
char result[512];
//I want to replace from here
snprintf(command, sizeof command, "echo -n %s | sha256sum | cut -c1-64",Arg1);
FILE *fpipe;
if (0 == (fpipe = (FILE*)popen(command, "r"))) {
perror("popen() failed.");
exit(1);
}
fread(result, 1, 512, fpipe);
pclose(fpipe);
const char *sha256 = &result[0];
//to here
return sha256;
}

Your code has undefined behavior because you return a pointer to result, a local array with automatic storage. Reading from this array by the caller has undefined behavior.
You should at least make result static so its contents remain readable after EncryptSHA256 returns to its caller.
Regarding the inefficiency of the method, here is a public domain implementation of SHA256 that you can use directly inside your program:
/* public domain sha256 implementation based on fips180-3 */
#include <stddef.h>
#include <stdint.h>
#include <string.h>
/* Public API */
struct sha256 {
uint64_t len; /* processed message length */
uint32_t h[8]; /* hash state */
uint8_t buf[64]; /* message block buffer */
};
/* reset state */
void sha256_init(struct sha256 *s);
/* process message */
void sha256_update(struct sha256 *s, const void *m, size_t len);
/* get message digest */
/* state is ruined after sum, keep a copy if multiple sum is needed */
/* part of the message might be left in s, zero it if secrecy is needed */
void sha256_sum(struct sha256 *s, uint8_t md[32]);
/* Implementation */
static uint32_t ror(uint32_t n, int k) {
return (n >> k) | (n << (32 - k));
}
#define Ch(x,y,z) (z ^ (x & (y ^ z)))
#define Maj(x,y,z) ((x & y) | (z & (x | y)))
#define S0(x) (ror(x,2) ^ ror(x,13) ^ ror(x,22))
#define S1(x) (ror(x,6) ^ ror(x,11) ^ ror(x,25))
#define R0(x) (ror(x,7) ^ ror(x,18) ^ (x>>3))
#define R1(x) (ror(x,17) ^ ror(x,19) ^ (x>>10))
static const uint32_t K[64] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
static void processblock(struct sha256 *s, const uint8_t *buf) {
uint32_t W[64], t1, t2, a, b, c, d, e, f, g, h;
int i;
for (i = 0; i < 16; i++) {
W[i] = (uint32_t)buf[4 * i + 0] << 24;
W[i] |= (uint32_t)buf[4 * i + 1] << 16;
W[i] |= (uint32_t)buf[4 * i + 2] << 8;
W[i] |= buf[4 * i + 3];
}
for (; i < 64; i++)
W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
a = s->h[0];
b = s->h[1];
c = s->h[2];
d = s->h[3];
e = s->h[4];
f = s->h[5];
g = s->h[6];
h = s->h[7];
#define ROUND(a,b,c,d,e,f,g,h,i) \
t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i]; \
t2 = S0(a) + Maj(a,b,c); \
d += t1; \
h = t1 + t2;
for (i = 0; i < 64; ) {
ROUND(a, b, c, d, e, f, g, h, i); i++;
ROUND(h, a, b, c, d, e, f, g, i); i++;
ROUND(g, h, a, b, c, d, e, f, i); i++;
ROUND(f, g, h, a, b, c, d, e, i); i++;
ROUND(e, f, g, h, a, b, c, d, i); i++;
ROUND(d, e, f, g, h, a, b, c, i); i++;
ROUND(c, d, e, f, g, h, a, b, i); i++;
ROUND(b, c, d, e, f, g, h, a, i); i++;
}
#undef ROUND
s->h[0] += a;
s->h[1] += b;
s->h[2] += c;
s->h[3] += d;
s->h[4] += e;
s->h[5] += f;
s->h[6] += g;
s->h[7] += h;
}
static void pad(struct sha256 *s) {
unsigned r = s->len % 64;
s->buf[r++] = 0x80;
if (r > 56) {
memset(s->buf + r, 0, 64 - r);
r = 0;
processblock(s, s->buf);
}
memset(s->buf + r, 0, 56 - r);
s->len *= 8;
s->buf[56] = s->len >> 56;
s->buf[57] = s->len >> 48;
s->buf[58] = s->len >> 40;
s->buf[59] = s->len >> 32;
s->buf[60] = s->len >> 24;
s->buf[61] = s->len >> 16;
s->buf[62] = s->len >> 8;
s->buf[63] = s->len;
processblock(s, s->buf);
}
void sha256_init(struct sha256 *s) {
s->len = 0;
s->h[0] = 0x6a09e667;
s->h[1] = 0xbb67ae85;
s->h[2] = 0x3c6ef372;
s->h[3] = 0xa54ff53a;
s->h[4] = 0x510e527f;
s->h[5] = 0x9b05688c;
s->h[6] = 0x1f83d9ab;
s->h[7] = 0x5be0cd19;
}
void sha256_sum(struct sha256 *s, uint8_t md[20]) {
int i;
pad(s);
for (i = 0; i < 8; i++) {
md[4 * i + 0] = s->h[i] >> 24;
md[4 * i + 1] = s->h[i] >> 16;
md[4 * i + 2] = s->h[i] >> 8;
md[4 * i + 3] = s->h[i];
}
}
void sha256_update(struct sha256 *s, const void *m, unsigned long len) {
const uint8_t *p = m;
unsigned r = s->len % 64;
s->len += len;
if (r) {
if (len < 64 - r) {
memcpy(s->buf + r, p, len);
return;
}
memcpy(s->buf + r, p, 64 - r);
len -= 64 - r;
p += 64 - r;
processblock(s, s->buf);
}
for (; len >= 64; len -= 64, p += 64)
processblock(s, p);
memcpy(s->buf, p, len);
}
You would change your function to this:
const char *EncryptSHA256(char *Arg1) {
struct sha256 s;
unsigned char md[32];
static char result[65];
sha256_init(&s);
sha256_update(&s, Arg1, strlen(Arg1));
sha256_sum(&s, md);
for (int i = 0; i < 32; i++) {
sprintf(result + i * 2, "%02x", md[i]);
}
return result;
}
You could also change the API to pass an array of 32 unsigned characters to get the binary form if it is more convenient.

SHA-1 in C on little-endian environment

Most implementations of SHA-1 (even on Wikipedia) I came across are coded for big-endian runtimes. So I'm trying to implement my own version for my machine (little-endian).
I've followed the pseudo-code form Wikipedia and have the following code. I found a function that converts the byte ordering but still not getting the correct output.
#include<stdio.h>
#include<string.h>
#include<math.h>
#include<stdlib.h>
#define rotateleft(x,n) ((x<<n) | (x>>(32-n)))
unsigned int endian_reverse(unsigned int n)
{
unsigned int m = 0;
m |= n << 24;
m |= ((n >> 8) << 24) >> 8;
m |= ((n << 8) >> 24) << 8;
m |= n >> 24;
return m;
}
void SHA1(unsigned char * str1)
{
unsigned long int h0,h1,h2,h3,h4,a,b,c,d,e,f,k,temp;
h0 = 0x67452301;
h1 = 0xEFCDAB89;
h2 = 0x98BADCFE;
h3 = 0x10325476;
h4 = 0xC3D2E1F0;
unsigned char * str;
str = (unsigned char *)malloc(strlen((const char *)str1)+100);
strcpy((char *)str,(const char *)str1);
int current_length = strlen((const char *)str);
int original_length = current_length;
str[current_length] = 0x80;
str[current_length + 1] = '\0';
char ic = str[current_length];
current_length++;
int ib = current_length % 64;
int i, j;
if(ib<56)
ib = 56-ib;
else
ib = 120 - ib;
for(i=0;i < ib;i++)
{
str[current_length]=0x00;
current_length++;
}
str[current_length + 1]='\0';
for(i=0;i<6;i++)
{
str[current_length]=0x0;
current_length++;
}
str[current_length] = (original_length * 8) / 0x100 ;
current_length++;
str[current_length] = (original_length * 8) % 0x100;
current_length++;
str[current_length+i]='\0';
int number_of_chunks = current_length/64;
unsigned long int word[80];
for(i=0;i<number_of_chunks;i++)
{
for(j=0;j<16;j++)
{
word[j] =
str[i*64 + j*4 + 0] * 0x1000000 + str[i*64 + j*4 + 1] * 0x10000 +
str[i*64 + j*4 + 2] * 0x100 + str[i*64 + j*4 + 3];
}
for(j=16;j<80;j++)
{
word[j] = rotateleft((word[j-3] ^ word[j-8] ^ word[j-14] ^ word[j-16]),1);
}
a = h0;
b = h1;
c = h2;
d = h3;
e = h4;
for(int m=0;m<80;m++)
{
if(m<=19)
{
f = (b & c) | ((~b) & d);
k = 0x5A827999;
}
else if(m<=39)
{
f = b ^ c ^ d;
k = 0x6ED9EBA1;
}
else if(m<=59)
{
f = (b & c) | (b & d) | (c & d);
k = 0x8F1BBCDC;
}
else
{
f = b ^ c ^ d;
k = 0xCA62C1D6;
}
temp = (rotateleft(a,5) + f + e + k + word[m]) & 0xFFFFFFFF;
e = d;
d = c;
c = rotateleft(b,30);
b = a;
a = temp;
}
h0 = h0 + a;
h1 = h1 + b;
h2 = h2 + c;
h3 = h3 + d;
h4 = h4 + e;
}
h0 = endian_reverse(h0);
h1 = endian_reverse(h1);
h2 = endian_reverse(h2);
h3 = endian_reverse(h3);
h4 = endian_reverse(h4);
printf("\n\n");
printf("Hash: %x %x %x %x %x",h0, h1, h2, h3, h4);
printf("\n\n");
}
int main()
{
SHA1((unsigned char *)"abc");
return 0;
}
SHA-1 ("abc"):
Resulting
Hash: f7370736 e388302f 15815610 1ccacd49 e7649bb6
Correct (actual)
Hash: a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d
Am I doing my endianness conversion correctly or in the correct spot?

Am I doing my endianness conversion correctly or in the correct spot?
Endianness conversion endian_reverse(() is incorrect when unsigned is not 32-bit.
Endianness conversion not used in correct spot. Endian conversion not needed.
Other issues exists.
Code is assuming unsigned long int is 32-bit. When unsigned long int is 64-bit, I can get the same answer as OP.
Save yourself time: There is no reason to use loose types here. Use uint32_t, uint8_t. For array sizing and string lengths use size_t. Avoid signed types and constants.
By changing unsigned long --> uint32_t and dropping h0 = endian_reverse(h0); ... h7 = endian_reverse(h7); I came up with
Hash: a9993e36 4706816a ba3e2571 7850c26c 9cd0d89d
Other suggestions.
Multiple of 64
size in str = malloc(size) should be a multiple of 64
Stay within multiples of 64
str[current_length+i]='\0'; can write outside allocation.
Alternate size storing
Works for all size_t values up to 264-3 - 1.
size_t current_length = ...
// append length in bits
uint64_t current_length_bits = current_length;
current_length_bits *= 8;
for (i = 8; i > 0; ) {
i--;
str[current_length + i] = (unsigned char) current_length_bits;
current_length_bits >>= 8;
}
current_length += 8;

Call must have pointer - problem with CCS

I am newbie here but have a problem without answer. I'm working with a TM4C1294 from Texas and in code there are countless errors. But at first, I have this code. How you can see, x3 have the read from PIN ADC0, but the error give to me in the lane is (obs. ignore asterick before # and the code is not complete because the rest of the body in the CCS is too large). Thanks for the attention!!
%declaration of pointers to PIN read
#define F_SAMPLE 2000
#define x0 *ADC3_read
#define x1 *ADC2_read
#define x2 *ADC1_read
#define x3 *ADC0_read
#define x4 *ADC4_read
%declaration of variables
int PI = 3.14159;
int teste = 0;
float y_0, ya1, yb1, y_1;
float y0_aux, ya1_aux, yb1_aux;
int alfa, i, j;
%declaration of PIN read
uint32_t ADC_read[5];
uint32_t *ADC0_read=&ADC_read[0];
uint32_t *ADC1_read=&ADC_read[1];
uint32_t *ADC2_read=&ADC_read[2];
uint32_t *ADC3_read=&ADC_read[3];
uint32_t *ADC4_read=&ADC_read[4];
%control code
for( i=0; i <= 12000; i++)
{
alfa = alfa + (2*PI/200);
if (alfa >= 2*PI)
alfa = alfa - 2*PI;
j++;
y0_aux = y0_aux + x3[i]; %error=identifier "x3" is undefined
ya1_aux = ya1_aux + x3[i]*sin(alfa);
yb1_aux = yb1_aux + x3[i]*cos(alfa);
if(j==200){
y_0 = y0_aux/200;
ya1 = ya1_aux/200;
yb1 = yb1_aux/200;
y_1 = sqrt((ya1 * ya1) + (yb1 * yb1));
y0_aux = 0;
ya1_aux = 0;
yb1_aux = 0;
j = 0;
}
}
}

Caveat: This may be incomplete as I'm not sure about your final result, but we can get the compilation errors out. I had to take a slight liberty to add a main function so I could get something compilable.
Here is a reindented version of your latest program [which has errors]:
#include <stdint.h>
#include <math.h>
// declaration of pointers to PIN read
#define F_SAMPLE 2000
#define x0 *ADC3_read
#define x1 *ADC2_read
#define x2 *ADC1_read
#define x3 *ADC0_read
#define x4 *ADC4_read
int
main(void)
{
// declaration of variables
int PI = 3.14159;
int teste = 0;
float y_0,
ya1,
yb1,
y_1;
float y0_aux,
ya1_aux,
yb1_aux;
int alfa,
i,
j;
// declaration of PIN read
uint32_t ADC_read[5];
uint32_t *ADC0_read = &ADC_read[0];
uint32_t *ADC1_read = &ADC_read[1];
uint32_t *ADC2_read = &ADC_read[2];
uint32_t *ADC3_read = &ADC_read[3];
uint32_t *ADC4_read = &ADC_read[4];
// control code
for (i = 0; i <= 12000; i++) {
alfa = alfa + (2 * PI / 200);
if (alfa >= 2 * PI)
alfa = alfa - 2 * PI;
j++;
// error = identifier "x3" is undefined
y0_aux = y0_aux + x3[i];
ya1_aux = ya1_aux + x3[i] * sin(alfa);
yb1_aux = yb1_aux + x3[i] * cos(alfa);
if (j == 200) {
y_0 = y0_aux / 200;
ya1 = ya1_aux / 200;
yb1 = yb1_aux / 200;
y_1 = sqrt((ya1 * ya1) + (yb1 * yb1));
y0_aux = 0;
ya1_aux = 0;
yb1_aux = 0;
j = 0;
}
}
}
This is the gcc error output:
fix1.c: In function ‘main’:
fix1.c:9:17: error: invalid type argument of unary ‘*’ (have ‘uint32_t {aka unsigned int}’)
#define x3 *ADC0_read
^
fix1.c:46:21: note: in expansion of macro ‘x3’
y0_aux = y0_aux + x3[i];
^~
fix1.c:9:17: error: invalid type argument of unary ‘*’ (have ‘uint32_t {aka unsigned int}’)
#define x3 *ADC0_read
^
fix1.c:47:23: note: in expansion of macro ‘x3’
ya1_aux = ya1_aux + x3[i] * sin(alfa);
^~
fix1.c:9:17: error: invalid type argument of unary ‘*’ (have ‘uint32_t {aka unsigned int}’)
#define x3 *ADC0_read
^
fix1.c:48:23: note: in expansion of macro ‘x3’
yb1_aux = yb1_aux + x3[i] * cos(alfa);
^~
Here is one way to change things [this compiles cleanly]:
#include <stdint.h>
#include <math.h>
// declaration of pointers to PIN read
#define F_SAMPLE 2000
#define x0 ADC3_read
#define x1 ADC2_read
#define x2 ADC1_read
#define x3 ADC0_read
#define x4 ADC4_read
int
main(void)
{
// declaration of variables
int PI = 3.14159;
int teste = 0;
float y_0,
ya1,
yb1,
y_1;
float y0_aux,
ya1_aux,
yb1_aux;
int alfa,
i,
j;
// declaration of PIN read
uint32_t ADC_read[5];
uint32_t *ADC0_read = &ADC_read[0];
uint32_t *ADC1_read = &ADC_read[1];
uint32_t *ADC2_read = &ADC_read[2];
uint32_t *ADC3_read = &ADC_read[3];
uint32_t *ADC4_read = &ADC_read[4];
// control code
for (i = 0; i <= 12000; i++) {
alfa = alfa + (2 * PI / 200);
if (alfa >= 2 * PI)
alfa = alfa - 2 * PI;
j++;
// error = identifier "x3" is undefined
y0_aux = y0_aux + x3[i];
ya1_aux = ya1_aux + x3[i] * sin(alfa);
yb1_aux = yb1_aux + x3[i] * cos(alfa);
if (j == 200) {
y_0 = y0_aux / 200;
ya1 = ya1_aux / 200;
yb1 = yb1_aux / 200;
y_1 = sqrt((ya1 * ya1) + (yb1 * yb1));
y0_aux = 0;
ya1_aux = 0;
yb1_aux = 0;
j = 0;
}
}
}
Here is an alternate way to code it [this also compiles cleanly]:
#include <stdint.h>
#include <math.h>
// declaration of pointers to PIN read
#define F_SAMPLE 2000
#define x0(o) ADC3_read[o]
#define x1(o) ADC2_read[o]
#define x2(o) ADC1_read[o]
#define x3(o) ADC0_read[o]
#define x4(o) ADC4_read[o]
int
main(void)
{
// declaration of variables
int PI = 3.14159;
int teste = 0;
float y_0,
ya1,
yb1,
y_1;
float y0_aux,
ya1_aux,
yb1_aux;
int alfa,
i,
j;
// declaration of PIN read
uint32_t ADC_read[5];
uint32_t *ADC0_read = &ADC_read[0];
uint32_t *ADC1_read = &ADC_read[1];
uint32_t *ADC2_read = &ADC_read[2];
uint32_t *ADC3_read = &ADC_read[3];
uint32_t *ADC4_read = &ADC_read[4];
// control code
for (i = 0; i <= 12000; i++) {
alfa = alfa + (2 * PI / 200);
if (alfa >= 2 * PI)
alfa = alfa - 2 * PI;
j++;
// error = identifier "x3" is undefined
y0_aux = y0_aux + x3(i);
ya1_aux = ya1_aux + x3(i) * sin(alfa);
yb1_aux = yb1_aux + x3(i) * cos(alfa);
if (j == 200) {
y_0 = y0_aux / 200;
ya1 = ya1_aux / 200;
yb1 = yb1_aux / 200;
y_1 = sqrt((ya1 * ya1) + (yb1 * yb1));
y0_aux = 0;
ya1_aux = 0;
yb1_aux = 0;
j = 0;
}
}
}
But, indexing off from ADC0_read doesn't make sense to me. Your for loop goes to 12000, but ADC_read has only 5 elements, so either variant above will probably segfault because you're going way past the end of the array.
More likely, you may want something like:
uint32_t ADC_read[12000];
#define x3(o) ADC_read[(o) + 0]
#define x2(o) ADC_read[(o) + 1]
...
Honestly, I don't see how the individual pointers (ADC3_read, etc.) fit in [well]

elliptic curve discrete logarithm

I am trying to Solve elliptic curve discrete logarithm using Pollard rho (find k where G=kp), So i searched for implementation in c and i found one after adding problem specific data in the main function i got segmentation fault (core dumped)
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <gmp.h>
#include <limits.h>
#include <sys/time.h>
#include <openssl/ec.h>
#include <openssl/bn.h>
#include <openssl/obj_mac.h> // for NID_secp256k1
#define POLLARD_SET_COUNT 16
#if defined(WIN32) || defined(_WIN32)
#define EXPORT __declspec(dllexport)
#else
#define EXPORT
#endif
#define MAX_RESTART 100
int ec_point_partition(const EC_GROUP *ecgrp, const EC_POINT *x) {
size_t len = EC_POINT_point2oct( ecgrp, x, POINT_CONVERSION_UNCOMPRESSED, NULL, 0, NULL );
unsigned char ret[len];
EC_POINT_point2oct( ecgrp, x, POINT_CONVERSION_UNCOMPRESSED, ret, len, NULL );
int id = ( ret[len - 1] & 0xFF ) % POLLARD_SET_COUNT;
return id;
}
// P generator
// Q result*P
// order of the curve
// result
//Reference: J. Sattler and C. P. Schnorr, "Generating random walks in groups"
int elliptic_pollard_rho_dlog(const EC_GROUP *group, const EC_POINT *P, const EC_POINT *Q, const BIGNUM *order, BIGNUM *res) {
printf("Pollard rho discrete log algorithm... \n");
BN_CTX* ctx;
ctx = BN_CTX_new();
int i, j;
int iterations = 0;
if ( !EC_POINT_is_on_curve(group, P, ctx ) || !EC_POINT_is_on_curve(group, Q, ctx ) ) return 1;
EC_POINT *X1 = EC_POINT_new(group);
EC_POINT *X2 = EC_POINT_new(group);
BIGNUM *c1 = BN_new();
BIGNUM *d1 = BN_new();
BIGNUM *c2 = BN_new();
BIGNUM *d2 = BN_new();
BIGNUM* a[POLLARD_SET_COUNT];
BIGNUM* b[POLLARD_SET_COUNT];
EC_POINT* R[POLLARD_SET_COUNT];
BN_zero(c1); BN_zero(d1);
BN_zero(c2); BN_zero(d2);
for (i = 0; i < POLLARD_SET_COUNT; i++) {
a[i] = BN_new();
b[i] = BN_new();
R[i] = EC_POINT_new(group);
BN_rand_range(a[i], order);
BN_rand_range(b[i], order);
// R = aP + bQ
EC_POINT_mul(group, R[i], a[i], Q, b[i], ctx);
//ep_norm(R[i], R[i]);
}
BN_rand_range(c1, order);
BN_rand_range(d1, order);
// X1 = c1*P + d1*Q
EC_POINT_mul(group, X1, c1, Q, d1, ctx);
//ep_norm(X1, X1);
BN_copy(c2, c1);
BN_copy(d2, d1);
EC_POINT_copy(X2, X1);
double work_time = (double) clock();
do {
j = ec_point_partition(group, X1);
EC_POINT_add(group, X1, X1, R[j], ctx);
BN_mod_add(c1, c1, a[j], order, ctx);
BN_mod_add(d1, d1, b[j], order, ctx);
for (i = 0; i < 2; i++) {
j = ec_point_partition(group, X2);
EC_POINT_add(group, X2, X2, R[j], ctx);
BN_mod_add(c2, c2, a[j], order, ctx);
BN_mod_add(d2, d2, b[j], order, ctx);
}
iterations++;
printf("Iteration %d \r",iterations );
} while ( EC_POINT_cmp(group, X1, X2, ctx) != 0 ) ;
printf("\n ");
work_time = ( (double) clock() - work_time ) / (double)CLOCKS_PER_SEC;
printf("Number of iterations %d %f\n",iterations, work_time );
BN_mod_sub(c1, c1, c2, order, ctx);
BN_mod_sub(d2, d2, d1, order, ctx);
if (BN_is_zero(d2) == 1) return 1;
//d1 = d2^-1 mod order
BN_mod_inverse(d1, d2, order, ctx);
BN_mod_mul(res, c1, d1, order, ctx);
for (int k = 0; k < POLLARD_SET_COUNT; ++k) {
BN_free(a[k]);
BN_free(b[k]);
EC_POINT_free(R[k]);
}
BN_free(c1); BN_free(d1);
BN_free(c2); BN_free(d2);
EC_POINT_free(X1); EC_POINT_free(X2);
BN_CTX_free(ctx);
return 0;
}
int main(int argc, char *argv[])
{
unsigned char *p_str="134747661567386867366256408824228742802669457";
unsigned char *a_str="-1";
unsigned char *b_str="0";
BIGNUM *p = BN_bin2bn(p_str, sizeof(p_str), NULL);
BIGNUM *a = BN_bin2bn(a_str, sizeof(a_str), NULL);
BIGNUM *b = BN_bin2bn(b_str, sizeof(b_str), NULL);
BN_CTX* ctx;
ctx = BN_CTX_new();
EC_GROUP* g = EC_GROUP_new(EC_GFp_simple_method());
EC_GROUP_set_curve_GFp(g,p,a,b,ctx);
unsigned char *XP_str="18185174461194872234733581786593019886770620";
unsigned char *YP_str="74952280828346465277451545812645059041440154";
BN_CTX* ctx1;
ctx1 = BN_CTX_new();
BIGNUM *XP = BN_bin2bn(XP_str, sizeof(XP_str), NULL);
BIGNUM *YP = BN_bin2bn(YP_str, sizeof(YP_str), NULL);
EC_POINT* P = EC_POINT_new(g);
EC_POINT_set_affine_coordinates_GFp(g,P,XP,YP,ctx1);
unsigned char *XQ_str="76468233972358960368422190121977870066985660";
unsigned char *YQ_str="33884872380845276447083435959215308764231090";
BIGNUM* XQ = BN_bin2bn(XQ_str, sizeof(XQ_str), NULL);
BIGNUM* YQ = BN_bin2bn(YQ_str, sizeof(YQ_str), NULL);
EC_POINT *Q = EC_POINT_new(g);
BN_CTX* ctx2;
ctx2 = BN_CTX_new();
EC_POINT_set_affine_coordinates_GFp(g,Q,XQ,YQ,ctx2);
char * str;
unsigned char *N_str="2902021510595963727029";
BIGNUM *N = BN_bin2bn(N_str, sizeof(N_str), NULL);
BIGNUM *res;
elliptic_pollard_rho_dlog (g,P,Q,N,res);
BN_bn2mpi(res,str);
printf("%s\n", str);
return 0;
}
This is the statement that cause segmentation fault
BN_bn2mpi(res,str);

Part 1. Python version.
Update: See new Part 2 of my answer, there I present C++ version of same algorithm as this Python version.
Your task is very interesting!
Maybe you wanted your code to be fixed but instead I decided to implement from scratch pure Python (Part 1 of answer) and pure C++ (Part 2) solutions without using any external non-standard modules. This kind of solutions from scratch without dependencies I think are very useful for educational purposes.
Algorithm like this is quite complex, and Python is easy enough to make implementation of such algorithm possible in short time.
In code below I used help of Wikipedia to implement Pollard's Rho Discrete Logarithm and Elliptic Curve Point Multiplication.
Code doesn't depend on any external modules, it uses just few built-in Python modules. There is a possibility to use gmpy2 module if you install it through python -m pip install gmpy2 and uncomment line #import gmpy2 in code.
You may see that I generate random base point myself and compute its order. I don't use any external curve like BitCoin's secp256k1, or other standard curves.
In the beginning of main() function you can see that I set up bits = 24, this is number of bits for prime modulus of curve, order of curve (number of distinct points) will have about the same bit size. You may set it to bits = 32 to try solving task for bigger curve.
As known, algorithm's complexity is O(Sqrt(Curve_Order)), it takes this many elliptic curve points additions. Points additions are not primitive operations and also take some time. So algorithm run for curve order bit size of bits = 32 takes about 10-15 seconds. While bits = 64 will take a way too long time for Python, but C++ version (that I'm going to implement later) will be fast enough to crack 64 bits within an hour or so.
Sometimes you may notice when running code that it shows that Pollard Rho failed few times, this happens if algorithm tries to find modular inverse of non-invertible number (non-coprime to modulus) both at last step of Pollard Rho and also when computing Infinite Point as a result of elliptic curve point addition. Same kind of failure also happens from time to time in regular Pollard Rho Integer Factorization when GCD is equal to N.
Try it online!
import random
#random.seed(10)
class ECPoint:
gmpy2 = None
#import gmpy2
import random
class InvError(Exception):
def __init__(self, *args):
self.value = args
#classmethod
def Int(cls, x):
return int(x) if cls.gmpy2 is None else cls.gmpy2.mpz(x)
#classmethod
def fermat_prp(cls, n, trials = 32):
# https://en.wikipedia.org/wiki/Fermat_primality_test
if n <= 16:
return n in (2, 3, 5, 7, 11, 13)
for i in range(trials):
if pow(cls.random.randint(2, n - 2), n - 1, n) != 1:
return False
return True
#classmethod
def rand_prime(cls, bits):
while True:
p = cls.random.randrange(1 << (bits - 1), 1 << bits) | 1
if cls.fermat_prp(p):
return p
#classmethod
def base_gen(cls, bits = 128, *, min_order_pfactor = 0):
while True:
while True:
N = cls.rand_prime(bits)
if N % 4 != 3:
continue
x0, y0, A = [cls.random.randrange(1, N) for i in range(3)]
B = (y0 ** 2 - x0 ** 3 - A * x0) % N
y0_calc = pow(x0 ** 3 + A * x0 + B, (N + 1) // 4, N)
if y0 == y0_calc:
break
bp = ECPoint(A, B, N, x0, y0, calc_q = True)
if bp.q is not None and min(bp.q_ps) >= min_order_pfactor:
break
assert bp.q > 1 and (bp.q + 1) * bp == bp
return bp
def __init__(self, A, B, N, x, y, *, q = 0, prepare = True, calc_q = False):
if prepare:
N = self.Int(N)
assert (x is None) == (y is None), (x, y)
A, B, x, y, q = [(self.Int(e) % N if e is not None else None) for e in [A, B, x, y, q]]
assert (4 * A ** 3 + 27 * B ** 2) % N != 0
assert N % 4 == 3
if x is not None:
assert (y ** 2 - x ** 3 - A * x - B) % N == 0, (hex(N), hex((y ** 2 - x ** 3 - A * x) % N))
assert y == pow(x ** 3 + A * x + B, (N + 1) // 4, N)
self.A, self.B, self.N, self.x, self.y, self.q = A, B, N, x, y, q
if calc_q:
self.q, self.q_ps = self.find_order()
def copy(self):
return ECPoint(self.A, self.B, self.N, self.x, self.y, q = self.q, prepare = False)
def inf(self):
return ECPoint(self.A, self.B, self.N, None, None, q = self.q, prepare = False)
def find_order(self, *, _m = 1, _ps = []):
if 1:
try:
r = _m * self
except self.InvError:
return _m, _ps
B = 2 * self.N
for p in self.gen_primes():
if p * p > B * 2:
return None, []
assert _m % p != 0, (_m, p)
assert p <= B, (p, B)
hi = 1
try:
for cnt in range(1, 1 << 60):
hi *= p
if hi > B:
cnt -= 1
break
r = p * r
except self.InvError:
return self.find_order(_m = hi * _m, _ps = [p] * cnt + _ps)
else:
# Alternative slower way
r = self
for i in range(1 << 60):
try:
r = r + self
except self.InvError:
return i + 2, []
#classmethod
def gen_primes(cls, *, ps = [2, 3]):
yield from ps
for p in range(ps[-1] + 2, 1 << 60, 2):
is_prime = True
for e in ps:
if e * e > p:
break
if p % e == 0:
is_prime = False
break
if is_prime:
ps.append(p)
yield ps[-1]
def __add__(self, other):
if self.x is None:
return other.copy()
if other.x is None:
return self.copy()
A, B, N, q = self.A, self.B, self.N, self.q
Px, Py, Qx, Qy = self.x, self.y, other.x, other.y
if Px == Qx and Py == Qy:
s = ((Px * Px * 3 + A) * self.inv(Py * 2, N)) % N
else:
s = ((Py - Qy) * self.inv(Px - Qx, N)) % N
x = (s * s - Px - Qx) % N
y = (s * (Px - x) - Py) % N
return ECPoint(A, B, N, x, y, q = q, prepare = False)
def __rmul__(self, other):
assert other > 0, other
if other == 1:
return self.copy()
other = self.Int(other - 1)
r = self
while True:
if other & 1:
r = r + self
if other == 1:
return r
other >>= 1
self = self + self
#classmethod
def inv(cls, a, n):
a %= n
if cls.gmpy2 is None:
try:
return pow(a, -1, n)
except ValueError:
import math
raise cls.InvError(math.gcd(a, n), a, n)
else:
g, s, t = cls.gmpy2.gcdext(a, n)
if g != 1:
raise cls.InvError(g, a, n)
return s % n
def __repr__(self):
return str(dict(x = self.x, y = self.y, A = self.A, B = self.B, N = self.N, q = self.q))
def __eq__(self, other):
for i, (a, b) in enumerate([(self.x, other.x), (self.y, other.y), (self.A, other.A), (self.B, other.B), (self.N, other.N), (self.q, other.q)]):
if a != b:
return False
return True
def pollard_rho_ec_log(a, b, bp):
# https://en.wikipedia.org/wiki/Pollard%27s_rho_algorithm_for_logarithms#Algorithm
import math
for itry in range(1 << 60):
try:
i = -1
part_p = bp.rand_prime(max(3, int(math.log2(bp.N) / 2)))
def f(x):
mod3 = ((x.x or 0) % part_p) % 3
if mod3 == 0:
return b + x
elif mod3 == 1:
return x + x
elif mod3 == 2:
return a + x
else:
assert False
def g(x, n):
mod3 = ((x.x or 0) % part_p) % 3
if mod3 == 0:
return n
elif mod3 == 1:
return (2 * n) % bp.q
elif mod3 == 2:
return (n + 1) % bp.q
else:
assert False
def h(x, n):
mod3 = ((x.x or 0) % part_p) % 3
if mod3 == 0:
return (n + 1) % bp.q
elif mod3 == 1:
return (2 * n) % bp.q
elif mod3 == 2:
return n
else:
assert False
a0, b0, x0 = 0, 0, bp.inf()
aim1, bim1, xim1 = a0, b0, x0
a2im2, b2im2, x2im2 = a0, b0, x0
for i in range(1, 1 << 60):
xi = f(xim1)
ai = g(xim1, aim1)
bi = h(xim1, bim1)
x2i = f(f(x2im2))
a2i = g(f(x2im2), g(x2im2, a2im2))
b2i = h(f(x2im2), h(x2im2, b2im2))
if xi == x2i:
return (bp.inv(bi - b2i, bp.q) * (a2i - ai)) % bp.q
xim1, aim1, bim1 = xi, ai, bi
x2im2, a2im2, b2im2 = x2i, a2i, b2i
except bp.InvError as ex:
print(f'Try {itry:>4}, Pollard-Rho failed, invert err at iter {i:>7},', ex.value)
def main():
import random, math
bits = 24
print('Generating base point, wait...')
bp = ECPoint.base_gen(bits, min_order_pfactor = 10)
print('order', bp.q, '=', ' * '.join([str(e) for e in bp.q_ps]))
k0, k1 = [random.randrange(1, bp.q) for i in range(2)]
a = k0 * bp
x = k1
b = x * a
x_calc = pollard_rho_ec_log(a, b, bp)
print('our x', x, 'found x', x_calc)
print('equal points:', x * a == x_calc * a)
if __name__ == '__main__':
main()
Output:
Generating base point, wait...
order 5805013 = 19 * 109 * 2803
Try 0, Pollard-Rho failed, invert err at iter 1120, (109, 1411441, 5805013)
Try 1, Pollard-Rho failed, invert err at iter 3992, (19, 5231802, 5805013)
our x 990731 found x 990731
equal points: True
Part 2. C++ version.
Almost identical code as above, but rewritten in C++.
This C++ version is much faster then Python, C++ code spends around 1 minute on 1 Ghz CPU to crack 48-bit curve. Same amount of time is spent by Python on 32-bit curve.
To remind, complexity is O(Sqrt(Curve_Order)) it means that if C++ spends same time for 48 bits (sqrt is 2^24) as Python for 32 bits (sqrt is 2^16) then C++ is around 2^24/2^16 = 2^8 = 256 times faster than Python's version.
Following version is compilable only in CLang, because it uses 128 and 192 bit integers. In GCC there also exists __int128 but no 192/256 ints. 192-bit int is only used in BarrettMod() function, so if you replace this function's body with return x % n; then you don't need 256-bit int and then you can compile in GCC.
I implemented Barrett Reduction algorithm, to replace operation of taking modulus (% N) that is based on slow division with special Barrett formula based on just multiply/shift/sub. This boosts modulus operations several times.
Try it online!
#include <cstdint>
#include <random>
#include <stdexcept>
#include <type_traits>
#include <iomanip>
#include <iostream>
#include <string>
#include <chrono>
#include <cmath>
using u64 = uint64_t;
using u128 = unsigned __int128;
using u192 = unsigned _ExtInt(192);
using Word = u64;
using DWord = u128;
using SWord = std::make_signed_t<Word>;
using TWord = u192;
#define ASSERT_MSG(cond, msg) { if (!(cond)) throw std::runtime_error("Assertion (" #cond ") failed at line " + std::to_string(__LINE__) + "! Msg '" + std::string(msg) + "'."); }
#define ASSERT(cond) ASSERT_MSG(cond, "")
#define LN { g_log << " LN " << __LINE__ << " " << std::flush; }
#define DUMP(x) { g_log << " " << (#x) << " = " << (x) << " " << std::flush; }
static auto & g_log = std::cout;
class ECPoint {
public:
class InvError : public std::runtime_error {
public:
InvError(Word const & gcd, Word const & x, Word const & mod)
: std::runtime_error("(gcd " + std::to_string(gcd) + ", x " + std::to_string(x) +
", mod " + std::to_string(mod) + ")") {}
};
static Word pow_mod(Word a, Word b, Word const & c) {
// https://en.wikipedia.org/wiki/Modular_exponentiation
Word r = 1;
while (b != 0) {
if (b & 1)
r = (DWord(r) * a) % c;
a = (DWord(a) * a) % c;
b >>= 1;
}
return r;
}
static Word rand_range(Word const & begin, Word const & end) {
u64 const seed = (u64(std::random_device{}()) << 32) + std::random_device{}();
thread_local std::mt19937_64 rng{seed};
ASSERT(begin < end);
return std::uniform_int_distribution<Word>(begin, end - 1)(rng);
}
static bool fermat_prp(Word const & n, size_t trials = 32) {
// https://en.wikipedia.org/wiki/Fermat_primality_test
if (n <= 16)
return n == 2 || n == 3 || n == 5 || n == 7 || n == 11 || n == 13;
for (size_t i = 0; i < trials; ++i)
if (pow_mod(rand_range(2, n - 2), n - 1, n) != 1)
return false;
return true;
}
static Word rand_prime_range(Word begin, Word end) {
while (true) {
Word const p = rand_range(begin, end) | 1;
if (fermat_prp(p))
return p;
}
}
static Word rand_prime(size_t bits) {
return rand_prime_range(Word(1) << (bits - 1), Word((DWord(1) << bits) - 1));
}
std::tuple<Word, size_t> BarrettRS(Word n) {
size_t constexpr extra = 3;
for (size_t k = 0; k < sizeof(DWord) * 8; ++k) {
if (2 * (k + extra) < sizeof(Word) * 8)
continue;
if ((DWord(1) << k) <= DWord(n))
continue;
k += extra;
ASSERT_MSG(2 * k < sizeof(DWord) * 8, "k " + std::to_string(k));
DWord r = (DWord(1) << (2 * k)) / n;
ASSERT_MSG(DWord(r) < (DWord(1) << (sizeof(Word) * 8)),
"k " + std::to_string(k) + " n " + std::to_string(n));
ASSERT(2 * k >= sizeof(Word) * 8);
return std::make_tuple(Word(r), size_t(2 * k - sizeof(Word) * 8));
}
ASSERT(false);
}
template <bool Adjust>
static Word BarrettMod(DWord const & x, Word const & n, Word const & r, size_t s) {
//return x % n;
DWord const q = DWord(((TWord(x) * r) >> (sizeof(Word) * 8)) >> s);
Word t = Word(DWord(x) - q * n);
if constexpr(Adjust) {
Word const mask = ~Word(SWord(t - n) >> (sizeof(Word) * 8 - 1));
t -= mask & n;
}
return t;
}
static Word Adjust(Word const & a, Word const & n) {
return a >= n ? a - n : a;
}
Word modNn(DWord const & a) const { return BarrettMod<false>(a, N_, N_br_, N_bs_); }
Word modNa(DWord const & a) const { return BarrettMod<true>(a, N_, N_br_, N_bs_); }
Word modQn(DWord const & a) const { return BarrettMod<false>(a, q_, Q_br_, Q_bs_); }
Word modQa(DWord const & a) const { return BarrettMod<true>(a, q_, Q_br_, Q_bs_); }
static Word mod(DWord const & a, Word const & n) { return a % n; }
static ECPoint base_gen(size_t bits = 128, Word min_order_pfactor = 0) {
while (true) {
Word const N = rand_prime(bits);
if (mod(N, 4) != 3)
continue;
Word const
x0 = rand_range(1, N), y0 = rand_range(1, N), A = rand_range(1, N),
B = mod(mod(DWord(y0) * y0, N) + N * 2 - mod(DWord(mod(DWord(x0) * x0, N)) * x0, N) - mod(DWord(A) * x0, N), N),
y0_calc = pow_mod(mod(DWord(y0) * y0, N), (N + 1) >> 2, N);
if (y0 != y0_calc)
continue;
auto const bp = ECPoint(A, B, N, x0, y0, 0, true, true);
auto BpCheckOrder = [&]{
for (auto e: bp.q_ps())
if (e < min_order_pfactor)
return false;
return true;
};
if (!(bp.q() != 0 && !bp.q_ps().empty() && BpCheckOrder()))
continue;
ASSERT(bp.q() > 1 && bp * (bp.q() + 1) == bp);
return bp;
}
ASSERT(false);
}
ECPoint(Word A, Word B, Word N, Word x, Word y, Word q = 0, bool prepare = true, bool calc_q = false) {
if (prepare) {
A = mod(A, N); B = mod(B, N); x = mod(x, N); y = mod(y, N); q = mod(q, N);
ASSERT(mod(4 * mod(DWord(mod(DWord(A) * A, N)) * A, N) + 27 * mod(DWord(B) * B, N), N) != 0);
ASSERT(mod(N, 4) == 3);
if (!(x == 0 && y == 0)) {
ASSERT(mod(mod(DWord(y) * y, N) + 3 * N - mod(DWord(mod(DWord(x) * x, N)) * x, N) - mod(DWord(A) * x, N) - B, N) == 0);
ASSERT(y == pow_mod(mod(DWord(mod(DWord(x) * x, N)) * x, N) + mod(DWord(A) * x, N) + B, (N + 1) >> 2, N));
}
std::tie(N_br_, N_bs_) = BarrettRS(N);
if (q != 0)
std::tie(Q_br_, Q_bs_) = BarrettRS(q);
}
std::tie(A_, B_, N_, x_, y_, q_) = std::tie(A, B, N, x, y, q);
if (calc_q) {
std::tie(q_, q_ps_) = find_order();
if (q_ != 0)
std::tie(Q_br_, Q_bs_) = BarrettRS(q_);
}
}
auto copy() const {
return ECPoint(A_, B_, N_, x_, y_, q_, false);
}
auto inf() const {
return ECPoint(A_, B_, N_, 0, 0, q_, false);
}
static auto const & gen_primes(Word const B) {
thread_local std::vector<Word> ps = {2, 3};
for (Word p = ps.back() + 2; p <= B; p += 2) {
bool is_prime = true;
for (auto const e: ps) {
if (e * e > p)
break;
if (p % e == 0) {
is_prime = false;
break;
}
}
if (is_prime)
ps.push_back(p);
}
return ps;
}
std::tuple<Word, std::vector<Word>> find_order(Word _m = 1, std::vector<Word> _ps = {}) const {
ASSERT(_m <= 2 * N_);
if constexpr(1) {
auto r = *this;
try {
r *= _m;
} catch (InvError const &) {
return std::make_tuple(_m, _ps);
}
Word const B = 2 * N_;
for (Word const p: gen_primes(std::llround(std::cbrt(B) + 1))) {
if (p * p * p > B)
break;
ASSERT(p <= B);
size_t cnt = 0;
Word hi = 1;
try {
for (cnt = 1;; ++cnt) {
if (hi * p > B) {
cnt -= 1;
break;
}
hi *= p;
r *= p;
}
} catch (InvError const & ex) {
_ps.insert(_ps.begin(), cnt, p);
return find_order(hi * _m, _ps);
}
}
} else {
// Alternative slower way
auto r = *this;
for (Word i = 0;; ++i)
try {
r += *this;
} catch (InvError const &) {
_ps.clear();
return std::make_tuple(i + 2, _ps);
}
}
_ps.clear();
return std::make_tuple(Word(0), _ps);
}
static std::tuple<Word, SWord, SWord> EGCD(Word const & a, Word const & b) {
Word ro = 0, r = 0, qu = 0, re = 0;
SWord so = 0, s = 0;
std::tie(ro, r, so, s) = std::make_tuple(a, b, 1, 0);
while (r != 0) {
std::tie(qu, re) = std::make_tuple(ro / r, ro % r);
std::tie(ro, r) = std::make_tuple(r, re);
std::tie(so, s) = std::make_tuple(s, so - s * SWord(qu));
}
SWord const to = (SWord(ro) - SWord(a) * so) / SWord(b);
return std::make_tuple(ro, so, to);
}
Word inv(Word a, Word const & n, size_t any_n_q = 0) const {
ASSERT(n > 0);
a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
auto [gcd, s, t] = EGCD(a, n);
if (gcd != 1)
throw InvError(gcd, a, n);
a = Word(SWord(n) + s);
a = any_n_q == 0 ? mod(a, n) : any_n_q == 1 ? modNa(a) : any_n_q == 2 ? modQa(a) : 0;
return a;
}
Word invN(Word a) const { return inv(a, N_, 1); }
Word invQ(Word a) const { return inv(a, q_, 2); }
ECPoint & operator += (ECPoint const & o) {
if (x_ == 0 && y_ == 0) {
*this = o;
return *this;
}
if (o.x_ == 0 && o.y_ == 0)
return *this;
Word const Px = x_, Py = y_, Qx = o.x_, Qy = o.y_;
Word s = 0;
if ((Adjust(Px, N_) == Adjust(Qx, o.N_)) && (Adjust(Py, N_) == Adjust(Qy, o.N_)))
s = modNn(DWord(modNn(DWord(Px) * Px * 3) + A_) * invN(Py * 2));
else
s = modNn(DWord(Py + 2 * N_ - Qy) * invN(Px + 2 * N_ - Qx));
x_ = modNn(DWord(s) * s + 4 * N_ - Px - Qx);
y_ = modNn(DWord(s) * (Px + 2 * N_ - x_) + 2 * N_ - Py);
return *this;
}
ECPoint operator + (ECPoint const & o) const {
ECPoint c = *this;
c += o;
return c;
}
ECPoint & operator *= (Word k) {
auto const ok = k;
ASSERT(k > 0);
if (k == 1)
return *this;
k -= 1;
auto r = *this, s = *this;
while (true) {
if (k & 1) {
r += s;
if (k == 1)
break;
}
k >>= 1;
s += s;
}
if constexpr(0) {
auto r2 = *this;
for (u64 i = 1; i < ok; ++i)
r2 += *this;
ASSERT(r == r2);
}
*this = r;
return *this;
}
ECPoint operator * (Word k) const {
ECPoint r = *this;
r *= k;
return r;
}
bool operator == (ECPoint const & o) const {
return A_ == o.A_ && B_ == o.B_ && N_ == o.N_ && q_ == o.q_ &&
Adjust(x_, N_) == Adjust(o.x_, o.N_) && Adjust(y_, N_) == Adjust(o.y_, o.N_);
}
Word const & q() const { return q_; }
std::vector<Word> const & q_ps() const { return q_ps_; }
Word const & x() const { return x_; }
private:
Word A_ = 0, B_ = 0, N_ = 0, q_ = 0, x_ = 0, y_ = 0, N_br_ = 0, Q_br_ = 0;
size_t N_bs_ = 0, Q_bs_ = 0;
std::vector<Word> q_ps_;
};
Word pollard_rho_ec_log(ECPoint const & a, ECPoint const & b, ECPoint const & bp) {
// https://en.wikipedia.org/wiki/Pollard%27s_rho_algorithm_for_logarithms#Algorithm
for (u64 itry = 0;; ++itry) {
u64 i = 0;
try {
Word const part_p = bp.rand_prime_range(8, bp.q() >> 4);
auto ModQ = [&](Word n) {
return n >= bp.q() ? n - bp.q() : n;
};
auto f = [&](auto const & x) -> ECPoint {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return b + x;
else if (mod3 == 1)
return x + x;
else if (mod3 == 2)
return a + x;
else
ASSERT(false);
};
auto const g = [&](auto const & x, Word n) -> Word {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return n;
else if (mod3 == 1)
return ModQ(2 * n);
else if (mod3 == 2)
return ModQ(n + 1);
else
ASSERT(false);
};
auto const h = [&](auto const & x, Word n) -> Word {
Word const mod3 = (x.x() % part_p) % 3;
if (mod3 == 0)
return ModQ(n + 1);
else if (mod3 == 1)
return ModQ(2 * n);
else if (mod3 == 2)
return n;
else
ASSERT(false);
};
Word aim1 = 0, bim1 = 0, a2im2 = 0, b2im2 = 0, ai = 0, bi = 0, a2i = 0, b2i = 0;
ECPoint xim1 = bp.inf(), x2im2 = bp.inf(), xi = bp.inf(), x2i = bp.inf();
for (i = 1;; ++i) {
xi = f(xim1);
ai = g(xim1, aim1);
bi = h(xim1, bim1);
x2i = f(f(x2im2));
a2i = g(f(x2im2), g(x2im2, a2im2));
b2i = h(f(x2im2), h(x2im2, b2im2));
if (xi == x2i)
return bp.modQa(DWord(bp.invQ(bp.q() + bi - b2i)) * (bp.q() + a2i - ai));
std::tie(xim1, aim1, bim1) = std::tie(xi, ai, bi);
std::tie(x2im2, a2im2, b2im2) = std::tie(x2i, a2i, b2i);
}
} catch (ECPoint::InvError const & ex) {
g_log << "Try " << std::setfill(' ') << std::setw(4) << itry << ", Pollard-Rho failed, invert err at iter "
<< std::setw(7) << i << ", " << ex.what() << std::endl;
}
}
}
void test() {
auto const gtb = std::chrono::high_resolution_clock::now();
auto Time = [&]() -> double {
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::high_resolution_clock::now() - gtb).count() / 1'000.0;
};
double tb = 0;
size_t constexpr bits = 36;
g_log << "Generating base point, wait... " << std::flush;
tb = Time();
auto const bp = ECPoint::base_gen(bits, 50);
g_log << "Time " << Time() - tb << " sec" << std::endl;
g_log << "order " << bp.q() << " = ";
for (auto e: bp.q_ps())
g_log << e << " * " << std::flush;
g_log << std::endl;
Word const k0 = ECPoint::rand_range(1, bp.q()),
x = ECPoint::rand_range(1, bp.q());
auto a = bp * k0;
auto b = a * x;
g_log << "Searching discrete logarithm... " << std::endl;
tb = Time();
Word const x_calc = pollard_rho_ec_log(a, b, bp);
g_log << "Time " << Time() - tb << " sec" << std::endl;
g_log << "our x " << x << ", found x " << x_calc << std::endl;
g_log << "equal points: " << std::boolalpha << (a * x == a * x_calc) << std::endl;
}
int main() {
try {
test();
} catch (std::exception const & ex) {
g_log << "Exception: " << ex.what() << std::endl;
}
}
Output:
Generating base point, wait... Time 38.932 sec
order 195944962603297 = 401 * 4679 * 9433 * 11071 *
Searching discrete logarithm...
Time 69.791 sec
our x 15520105103514, found x 15520105103514
equal points: true

Wrong hash output in C implementation of sha2 256

I've implemented sha2 256 based on the rfc spec found at: https://www.rfc-editor.org/rfc/rfc4634.
My implementation:
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <errno.h>
uint32_t K[] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2,
};
uint32_t CH(uint32_t x, uint32_t y, uint32_t z) {
uint32_t answ = (x & y) ^ (~x & z);
return answ;
}
uint32_t MAJ(uint32_t x, uint32_t y, uint32_t z) {
uint32_t answ = (x & y) ^ (x & z) ^ (y & z);
return answ;
}
uint32_t ROTL(uint32_t x, short n) {
return (x << n) | (x>>(32 - n));
}
uint32_t ROTR(uint32_t x, short n) {
return (x >> n) | (x<<(32 - n));
}
uint32_t BIGS0(uint32_t x) {
return ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22);
}
uint32_t BIGS1(uint32_t x) {
return ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25);
}
uint32_t SSIG0(uint32_t x) {
return ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3);
}
uint32_t SSIG1(uint32_t x) {
return ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10);
}
uint32_t toInt(uint8_t *t) {
return (t[3] << 24) | (t[2] << 16) | (t[1] << 8) | t[0];
}
void process(uint32_t *block) {
uint32_t H[] = {
0x6a09e667,
0xbb67ae85,
0x3c6ef372,
0xa54ff53a,
0x510e527f,
0x9b05688c,
0x1f83d9ab,
0x5be0cd19
};
uint32_t *W = (uint32_t *) malloc (sizeof(uint32_t*) * 64);
for (int t = 0; t < 16; t++) {
W[t] = block[t];
}
for (int t = 16; t < 64; t++) {
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(t-15) + W[t-16];
}
uint32_t a = H[0];
uint32_t b = H[1];
uint32_t c = H[2];
uint32_t d = H[3];
uint32_t e = H[4];
uint32_t f = H[5];
uint32_t g = H[6];
uint32_t h = H[7];
uint32_t T1 = 0;
uint32_t T2 = 0;
for(int t =0; t < 64 ; t++) {
T1 = h + BIGS1(e) + CH(e,f,g) + K[t] + W[t];
T2 = BIGS0(a) + MAJ(a,b,c);
h = g;
g = f;
f = e;
e = d + T1;
d = c;
c = b;
b = a;
a = T1 + T2;
}
H[0] = a + H[0];
H[1] = b + H[1];
H[2] = c + H[2];
H[3] = d + H[3];
H[4] = e + H[4];
H[5] = f + H[5];
H[6] = g + H[6];
H[7] = h + H[7];
for (int j = 0; j < 8; j++ ) {
printf("%08x", H[j] );
}
free(W);
}
int main(void)
{
uint32_t block[] = {
0x61626380,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000000,
0x00000018,
};//abc in 512 bit padded block
process(block);
return 0;
}
For now I'm only doing one 512 message block and my testing message is 'abc'. Doing all the preprocessing as required results in a padded message block as follows:
0x61626380 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000000
0x00000000 0x00000000 0x00000000 0x00000018
After feeding it through my implementation I get the hash as: 4b9cc43100a30340dbc8f2328e2c80a91fdbd7b8cd20962d1b64e31283c4b99d
Where as the correct hash is:
ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad
Tracing my output and comparing it to the example provided at http://csrc.nist.gov/publications/fips/fips180-2/fips180-2withchangenotice.pdf page 34. My implementation seems to go wrong at iteration 16 (thus t=16) when the variable e needs to be calculated.
Any help would be appreciated.

This line:
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(t-15) + W[t-16];
Should be
W[t] = SSIG1(W[t-2]) + W[t-7] + SSIG0(W[t-15]) + W[t-16];
The problem kicks in on iteration 16 because that's when you first refer to a value buffered using that assignment (earlier values were just copies of the initial block).