my MD5 implementation is giving the wrong result - c

Ive been trying to solve this issue for hours and hours on end im not completely sure whats making the wrong output obviously its hard to find out because its a hashing algorithm i know my chunking code is incomplete its still a work in progress and ive been programming this for a week
/*
* MD5 Implementation in C
* Created by Caelan Ireland 2023
*/
//#include "MD5.h"
#include <stdint.h> // uint32_t
#include <stdlib.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>
#include <endian.h>
#include <stdint.h> // needed?
// make sure to use -lm tag when compiling with gcc
#include <math.h>
// To Do: Essential Items for Understanding MD5
// 5. Divide up
// 6. Finish ABCD funct
// 4. Padding (Incomplete) incl error checks
// 4 32 bit words called A, B, C and D
#define A 0x67452301
#define B 0xEFCDAB89
#define C 0x98BADCFE
#define D 0x10325476
// remember Let [abcd k s i] ------------------------------------------->>>>>>>
const uint32_t S[64] = {
7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21
};
//const uint32_t MD[4] = {0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476};
// unsigned int T[64];
// for(int i = 0; i < 64; i++) {
// T[i] = floor(pow(2, 32) * fabs(sin(i + 1))); // fabs fixed wrap or abs? floorf for floats
// printf("0x%08x\n", T[i]); // %u
// };
// precomputed hashes
const uint32_t K[64] = { // T
0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a,
0xa8304613, 0xfd469501, 0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821, 0xf61e2562, 0xc040b340,
0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8,
0x676f02d9, 0x8d2a4c8a, 0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70, 0x289b7ec6, 0xeaa127fa,
0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92,
0xffeff47d, 0x85845dd1, 0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391
};
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n))))
#define FF(a, b, c, d, k, s, i) \
{ (a) += F((b), (c), (d)) + (k) + (i); \
(a) = ROTATE_LEFT((a), (s)); \
(a) += (b); \
}
#define GG(a, b, c, d, k, s, i) \
{ (a) += G((b), (c), (d)) + (k) + (i); \
(a) = ROTATE_LEFT((a), (s)); \
(a) += (b); \
}
#define HH(a, b, c, d, k, s, i) \
{ (a) += H((b), (c), (d)) + (k) + (i); \
(a) = ROTATE_LEFT((a), (s)); \
(a) += (b); \
}
#define II(a, b, c, d, k, s, i) \
{ (a) += I((b), (c), (d)) + (k) + (i); \
(a) = ROTATE_LEFT((a), (s)); \
(a) += (b); \
}
// void ToLittleEndian(uint32_t* input) {
// uint32_t tmp = *input;
// *input = ((tmp & 0xff000000) >> 24) | ((tmp & 0x00ff0000) >> 8) |
// ((tmp & 0xff) << 24) | ((tmp & 0x0000ff00) << 8);
// }
uint8_t* ProcessChunk(uint8_t* input)
{
uint32_t AA = A;
uint32_t BB = B;
uint32_t CC = C;
uint32_t DD = D;
uint32_t M[16] = {0};
for(int i = 0; i < 16; i++) {
for(int j = 0; j < 4; j++) {
//word[i] |= (uint32_t)input[(i * 4) + j] << (24 - (j * 8));
M[i] |= (uint32_t)input[(i * 4) + j] << (32 - ((j + 1) * 8));
}
M[i] = htole32(M[i]); // is this necessary?
}
for(int i = 0; i < 16; i++) {
printf("\n word %d = 0x%08x\n", i, M[i]);
}
/* shifting rounds */
for(int i = 0; i < 64; i++) {
// error here
if (i < 16) {
FF(AA, BB, CC, DD, M[i], S[i], K[i]);
//printf("%d, %d, %u\n", M[i], S[i], K[i]);
} else if (i < 32) {
GG(AA, BB, CC, DD, M[(5 * i + 1) % 16], S[i], K[i]);
//printf("%d, %d, %u\n", M[(5 * i + 1) % 16], S[i], K[i]);
} else if (i < 48) {
HH(AA, BB, CC, DD, M[(3 * i + 5) % 16], S[i], K[i]);
//printf("%d, %d, %u\n", M[(3 * i + 5) % 16], S[i], K[i]);
} else {
II(AA, BB, CC, DD, M[(7 * i) % 16], S[i], K[i]);
//printf("%d, %d, %u\n", M[(7 * i) % 16], S[i], K[i]);
}
int Temp = DD;
DD = CC;
CC = BB;
BB = AA;
AA = Temp;
// printf("A = 0x%x\n", A);
// printf("B = 0x%x\n", B);
// printf("C = 0x%x\n", C);
// printf("D = 0x%x\n", D);
// printf("\n");
// printf("AA = 0x%x\n", AA);
// printf("BB = 0x%x\n", BB);
// printf("CC = 0x%x\n", CC);
// printf("DD = 0x%x\n", DD);
// printf("\n");
}
AA += A;
BB += B;
CC += C;
DD += D;
// uint64_t result = ((uint64_t)AA << 96) | ((uint64_t)BB << 64) |
// ((uint64_t)CC << 32) | (uint64_t)D;
uint8_t* digest = malloc(16);
for (int i = 0; i < 16; i++) {
if (i < 4) {
digest[i] = (AA >> (i*8)) & 0xff;
} else if (i < 8) {
digest[i] = (BB >> (i*8)) & 0xff;
} else if (i < 12) {
digest[i] = (CC >> (i*8)) & 0xff;
} else {
digest[i] = (DD >> (i*8)) & 0xff;
}
}
// for (int i = 0; i < 16; i++) {
// printf("%x ", result[i]);
// }
return (uint8_t*)digest;
// print out chunk
// printf("\n\n");
// for(int i = 0; i < 16; i++) {
// printf("0x%x ", M[i]);
// }
}
// abbrivation for print error
int printerr(char* errordesc)
{
printf("Error: %s\n", errordesc);
exit(1);
}
uint8_t* AddPadding(uint8_t* input, size_t input_len)
{
// compute padding length
size_t pad_len = (56 - (input_len) % 64) % 64;
// allocate required space to output buffer
uint8_t* padded = malloc(input_len + pad_len + 8);
if (padded == NULL) {
printerr("padding process cant allocate req memory");
}
// add cmd line string bytes into output
for(int i = 0; i < input_len; i++) { padded[i] = input[i]; }
// add padding bytes
for(int i = 0; i < (pad_len + 8); i++) {
if(i == 0) {
padded[input_len] = 0x80;
} else {
padded[input_len + 1 + i] = 0x00;
}
}
// changing size byte position to 0x80 when utilizing last byte
// add usigned 64 bit input length to end of the string
// if 2^64 only low order of bits are utilised
for(int i = 0; i < (64 / 8); i++) {
padded[input_len + pad_len + i] = (uint8_t)(input_len >> (64 - (i + 1) * 8) & 0xFF);
//padded[input_len + pad_len + i] = (uint8_t)(input_len >> (i * 8) & 0xFF);
}
return padded;
}
int main (int argc, char* argv[])
{
// initalise input var
uint8_t* inputstr = NULL;
if (argc !=3 || strcmp(argv[1], "--inputstr") != 0) {
printerr("usage - MD5 --inputstr (input_string)");
} else {
// point to argv[2] (3rd argument) for buffer
inputstr = (uint8_t*)argv[2];
}
// get str length
const size_t inputstr_len = strlen((char*)inputstr);
// improve later
if (inputstr_len == 0) {
printerr("inputstr < 1 char");
}
// add padding to last chunk
uint8_t* outputstr = AddPadding(inputstr, inputstr_len);
// print padding (probably will delete later)
for (int i = 0; i < inputstr_len + 1 + (64 - ((inputstr_len + 8 + 1) % 64 ) + 8); i++) {
printf("0x%02x ", outputstr[i]);
}
// break message into chunks + 1 compensation for padding
//size_t chunks = (inputstr_len / 64) + ((inputstr_len % 64 > 0) && (inputstr_len % 64 > 56));
uint8_t* chunk_result = NULL;
size_t chunks = (inputstr_len / 64) + 1 + (inputstr_len % 64 > 56);
for(int i = 0; i < chunks; i++) {
// only give req bytes
uint8_t* chunk_input = malloc(64 * sizeof(uint8_t));
for(int j = 0; j < 64; j++) {
chunk_input[j] = outputstr[i * 64 + j];
}
chunk_result = ProcessChunk(chunk_input);
// > 2 chunks??
}
// print chunk data
for (int i = 0; i < 16; i++) {
printf("%02x ", chunk_result[i]);
}
// make sure chunks cant be 0
printf("chunks: %ld\n", chunks);
// deallocate memory
free(outputstr);
free(chunk_result);
// executed sucessfully
exit(0);
}
so i was trying the MD5 hashing result for 'h' and it was
2510c39011c5be704182423e3a695e91
my result was
e0 64 f6 8c 9d de 68 df 1f 4b 11 d8 41 13 6b 77
i apologize i havent made a sorting method yet but by comparing the 0x25 you can tell its not equal
i have found a couple errors hopefully this helps on line 150 i defined them all as B in the macros #define statements i forgot the brackets around the letters

Related

256-bit integer to string [duplicate]

I'm trying to convert a 128-bit unsigned integer stored as an array of 4 unsigned ints to the decimal string representation in C:
unsigned int src[] = { 0x12345678, 0x90abcdef, 0xfedcba90, 0x8765421 };
printf("%s", some_func(src)); // gives "53072739890371098123344"
(The input and output examples above are completely fictional; I have no idea what that input would produce.)
If I was going to hex, binary or octal, this would be a simple matter of masks and bit shifts to peel of the least significant characters. However, it seems to me that I need to do base-10 division. Unfortunately, I can't remember how to do that across multiple ints, and the system I'm using doesn't support data types larger than 32-bits, so using a 128-bit type is not possible. Using a different language is also out, and I'd rather avoid a big number library just for this one operation.
Division is not necessary:
#include <string.h>
#include <stdio.h>
typedef unsigned long uint32;
/* N[0] - contains least significant bits, N[3] - most significant */
char* Bin128ToDec(const uint32 N[4])
{
// log10(x) = log2(x) / log2(10) ~= log2(x) / 3.322
static char s[128 / 3 + 1 + 1];
uint32 n[4];
char* p = s;
int i;
memset(s, '0', sizeof(s) - 1);
s[sizeof(s) - 1] = '\0';
memcpy(n, N, sizeof(n));
for (i = 0; i < 128; i++)
{
int j, carry;
carry = (n[3] >= 0x80000000);
// Shift n[] left, doubling it
n[3] = ((n[3] << 1) & 0xFFFFFFFF) + (n[2] >= 0x80000000);
n[2] = ((n[2] << 1) & 0xFFFFFFFF) + (n[1] >= 0x80000000);
n[1] = ((n[1] << 1) & 0xFFFFFFFF) + (n[0] >= 0x80000000);
n[0] = ((n[0] << 1) & 0xFFFFFFFF);
// Add s[] to itself in decimal, doubling it
for (j = sizeof(s) - 2; j >= 0; j--)
{
s[j] += s[j] - '0' + carry;
carry = (s[j] > '9');
if (carry)
{
s[j] -= 10;
}
}
}
while ((p[0] == '0') && (p < &s[sizeof(s) - 2]))
{
p++;
}
return p;
}
int main(void)
{
static const uint32 testData[][4] =
{
{ 0, 0, 0, 0 },
{ 1048576, 0, 0, 0 },
{ 0xFFFFFFFF, 0, 0, 0 },
{ 0, 1, 0, 0 },
{ 0x12345678, 0x90abcdef, 0xfedcba90, 0x8765421 }
};
printf("%s\n", Bin128ToDec(testData[0]));
printf("%s\n", Bin128ToDec(testData[1]));
printf("%s\n", Bin128ToDec(testData[2]));
printf("%s\n", Bin128ToDec(testData[3]));
printf("%s\n", Bin128ToDec(testData[4]));
return 0;
}
Output:
0
1048576
4294967295
4294967296
11248221411398543556294285637029484152
Straightforward division base 2^32, prints decimal digits in reverse order, uses 64-bit arithmetic, complexity O(n) where n is the number of decimal digits in the representation:
#include <stdio.h>
unsigned int a [] = { 0x12345678, 0x12345678, 0x12345678, 0x12345678 };
/* 24197857161011715162171839636988778104 */
int
main ()
{
unsigned long long d, r;
do
{
r = a [0];
d = r / 10;
r = ((r - d * 10) << 32) + a [1];
a [0] = d;
d = r / 10;
r = ((r - d * 10) << 32) + a [2];
a [1] = d;
d = r / 10;
r = ((r - d * 10) << 32) + a [3];
a [2] = d;
d = r / 10;
r = r - d * 10;
a [3] = d;
printf ("%d\n", (unsigned int) r);
}
while (a[0] || a[1] || a[2] || a[3]);
return 0;
}
EDIT: Corrected the loop so it displays a 0 if the array a contains only zeros.
Also, the array is read left to right, a[0] is most-significant, a[3] is least significant digits.
A slow but simple approach is to just printing digits from most significant to least significant using subtraction. Basically you need a function for checking if x >= y and another for computing x -= y when that is the case.
Then you can start counting how many times you can subtract 10^38 (and this will be most significant digit), then how many times you can subtract 10^37 ... down to how many times you can subtract 1.
The following is a full implementation of this approach:
#include <stdio.h>
typedef unsigned ui128[4];
int ge128(ui128 a, ui128 b)
{
int i = 3;
while (i >= 0 && a[i] == b[i])
--i;
return i < 0 ? 1 : a[i] >= b[i];
}
void sub128(ui128 a, ui128 b)
{
int i = 0;
int borrow = 0;
while (i < 4)
{
int next_borrow = (borrow && a[i] <= b[i]) || (!borrow && a[i] < b[i]);
a[i] -= b[i] + borrow;
borrow = next_borrow;
i += 1;
}
}
ui128 deci128[] = {{1u,0u,0u,0u},
{10u,0u,0u,0u},
{100u,0u,0u,0u},
{1000u,0u,0u,0u},
{10000u,0u,0u,0u},
{100000u,0u,0u,0u},
{1000000u,0u,0u,0u},
{10000000u,0u,0u,0u},
{100000000u,0u,0u,0u},
{1000000000u,0u,0u,0u},
{1410065408u,2u,0u,0u},
{1215752192u,23u,0u,0u},
{3567587328u,232u,0u,0u},
{1316134912u,2328u,0u,0u},
{276447232u,23283u,0u,0u},
{2764472320u,232830u,0u,0u},
{1874919424u,2328306u,0u,0u},
{1569325056u,23283064u,0u,0u},
{2808348672u,232830643u,0u,0u},
{2313682944u,2328306436u,0u,0u},
{1661992960u,1808227885u,5u,0u},
{3735027712u,902409669u,54u,0u},
{2990538752u,434162106u,542u,0u},
{4135583744u,46653770u,5421u,0u},
{2701131776u,466537709u,54210u,0u},
{1241513984u,370409800u,542101u,0u},
{3825205248u,3704098002u,5421010u,0u},
{3892314112u,2681241660u,54210108u,0u},
{268435456u,1042612833u,542101086u,0u},
{2684354560u,1836193738u,1126043566u,1u},
{1073741824u,1182068202u,2670501072u,12u},
{2147483648u,3230747430u,935206946u,126u},
{0u,2242703233u,762134875u,1262u},
{0u,952195850u,3326381459u,12621u},
{0u,932023908u,3199043520u,126217u},
{0u,730304488u,1925664130u,1262177u},
{0u,3008077584u,2076772117u,12621774u},
{0u,16004768u,3587851993u,126217744u},
{0u,160047680u,1518781562u,1262177448u}};
void print128(ui128 x)
{
int i = 38;
int z = 0;
while (i >= 0)
{
int c = 0;
while (ge128(x, deci128[i]))
{
c++; sub128(x, deci128[i]);
}
if (i==0 || z || c > 0)
{
z = 1; putchar('0' + c);
}
--i;
}
}
int main(int argc, const char *argv[])
{
ui128 test = { 0x12345678, 0x90abcdef, 0xfedcba90, 0x8765421 };
print128(test);
return 0;
}
That number in the problem text in decimal becomes
11248221411398543556294285637029484152
and Python agrees this is the correct value (this of course doesn't mean the code is correct!!! ;-) )
Same thing, but with 32-bit integer arithmetic:
#include <stdio.h>
unsigned short a [] = {
0x0876, 0x5421,
0xfedc, 0xba90,
0x90ab, 0xcdef,
0x1234, 0x5678
};
int
main ()
{
unsigned int d, r;
do
{
r = a [0];
d = r / 10;
r = ((r - d * 10) << 16) + a [1];
a [0] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [2];
a [1] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [3];
a [2] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [4];
a [3] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [5];
a [4] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [6];
a [5] = d;
d = r / 10;
r = ((r - d * 10) << 16) + a [7];
a [6] = d;
d = r / 10;
r = r - d * 10;
a [7] = d;
printf ("%d\n", r);
}
while (a[0] || a[1] || a[2] || a[3] || a [4] || a [5] || a[6] || a[7]);
return 0;
}
You actually don't need to implement long division. You need to implement multiplication by a power of two, and addition. You have four uint_32. First convert each of them to a string. Multiply them by (2^32)^3, (2^32)^2, (2^32)^1, and (2^32)^0 respectively, then add them together. You don't need to do the base conversion, you just need to handle putting the four pieces together. You'll obviously need to make sure the strings can handle a number up to UINT_32_MAX*(2^32)^3.
Supposing you have a fast 32-bit multiplication and division the result can be computed 4 digits at a time by implementing a bigint division/modulo 10000 and then using (s)printf for output of digit groups.
This approach is also trivial to extend to higher (or even variable) precision...
#include <stdio.h>
typedef unsigned long bigint[4];
void print_bigint(bigint src)
{
unsigned long int x[8]; // expanded version (16 bit per element)
int result[12]; // 4 digits per element
int done = 0; // did we finish?
int i = 0; // digit group counter
/* expand to 16-bit per element */
x[0] = src[0] & 65535;
x[1] = src[0] >> 16;
x[2] = src[1] & 65535;
x[3] = src[1] >> 16;
x[4] = src[2] & 65535;
x[5] = src[2] >> 16;
x[6] = src[3] & 65535;
x[7] = src[3] >> 16;
while (!done)
{
done = 1;
{
unsigned long carry = 0;
int j;
for (j=7; j>=0; j--)
{
unsigned long d = (carry << 16) + x[j];
x[j] = d / 10000;
carry = d - x[j] * 10000;
if (x[j]) done = 0;
}
result[i++] = carry;
}
}
printf ("%i", result[--i]);
while (i > 0)
{
printf("%04i", result[--i]);
}
}
int main(int argc, const char *argv[])
{
bigint tests[] = { { 0, 0, 0, 0 },
{ 0xFFFFFFFFUL, 0, 0, 0 },
{ 0, 1, 0, 0 },
{ 0x12345678UL, 0x90abcdefUL, 0xfedcba90UL, 0x8765421UL } };
{
int i;
for (i=0; i<4; i++)
{
print_bigint(tests[i]);
printf("\n");
}
}
return 0;
}
#Alexey Frunze's method is easy but it's very slow. You should use #chill's 32-bit integer method above. Another easy method without any multiplication or division is double dabble. This may work slower than chill's algorithm but much faster than Alexey's one. After running you'll have a packed BCD of the decimal number
On github is an open source project (c++) which provides a class for a datatype uint265_t and uint128_t.
https://github.com/calccrypto/uint256_t
No, I' not affiliated with that project, but I was using it for such a purpose, but I guess it could be usefull for others as well.

How to debug a Fatal Signal 6 crash on Android Studio for Native C Code?

I am working to find a crash that happens in Native C in our Android Studio.
We know where that the error originates from a library in Native C. Most probably from a Garbage handler.
 
We have tried to Release the variables in different ways with no success yet. The strange part the code works fine for Android 5.0 and up. 
I also googled for how to debug NDK in Android studio by adding
Enable app debugging in your AndroidManifest.xml file by including an <application> element that sets the android:debuggable attribute to true.
and adding a log:
__android_log_print(ANDROID_LOG_VERBOSE, APPNAME, "The value R %f G %f B %f , H %f S %f V %f ", rgbData[0], rgbData[1], rgbData[2], rgbData[3], rgbData[4], rgbData[5]);
However, the monitor still doesn't print anything. Below the piece of code and the crash log.
Any help would be highly appreciated.
#include <jni.h>
//#include <android/log.h>
#define APPNAME "handroid"
#define MAX(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a > _b ? _a : _b; })
#define MIN(a,b) \
({ __typeof__ (a) _a = (a); \
__typeof__ (b) _b = (b); \
_a < _b ? _a : _b; })
double* rgbData;
int rgbDataSize = 0;
JNIEXPORT
void
JNICALL
Java_handroid_classes_Camera_YUVtoRBGHSV(JNIEnv * env, jobject obj, jdoubleArray rgb_hsv, jbyteArray yuv420sp, jint width, jint height)
{
int sz;
int i;
int j;
int Y;
int Cr = 0;
int Cb = 0;
int pixPtr = 0;
int jDiv2 = 0;
int R = 0;
int G = 0;
int B = 0;
double tR = 0;
double tG = 0;
double tB = 0;
int cOff;
int w = width;
int h = height;
sz = w * h;
int pixel;
int uvp;
int y1192;
int y;
int v;
int u;
int yp;
//for hsv
double min, max, delta, hsv_h, hsv_s, hsv_v;
jboolean isCopy;
jbyte* yuv = (*env)->GetByteArrayElements(env, yuv420sp, &isCopy);
if(rgbDataSize < sz) {
double tmp[6];
rgbData = &tmp[0];
rgbDataSize = sz;
}
//Calculate pixel colors
for (j = 0, yp = 0; j < h; j++) {
uvp = sz + (j >> 1) * w, u = 0, v = 0;
for (i = 0; i < w; i++, yp++) {
y = (0xff & yuv[yp]) - 16;
if (y < 0) y = 0;
if ((i & 1) == 0) {
v = (0xff & yuv[uvp++]) - 128;
u = (0xff & yuv[uvp++]) - 128;
}
y1192 = 1192 * y;
R = (y1192 + 1634 * v);
G = (y1192 - 833 * v - 400 * u);
B = (y1192 + 2066 * u);
if (R < 0) R = 0;
else if (R > 262143) R = 262143;
if (G < 0) G = 0;
else if (G > 262143) G = 262143;
if (B < 0) B = 0;
else if (B > 262143) B = 262143;
pixel = 0xff000000 | ((R << 6) & 0xff0000) | ((G >> 2) & 0xff00) | ((B >> 10) & 0xff);
tR += (pixel >> 16) & 0xff;
tG += (pixel >> 8) & 0xff;
tB += (pixel >> 0) & 0xff;
}
}
//Create RGB sum (average pixel)
rgbData[0] = (double)(tR/255/sz);
rgbData[1] = (double)(tG/255/sz);
rgbData[2] = (double)(tB/255/sz);
//Calculate HSV
min = MIN(rgbData[0], MIN(rgbData[1], rgbData[2]));
max = MAX(rgbData[0], MAX(rgbData[1], rgbData[2]));
hsv_v = max;
delta = max - min;
if( max != 0 ){
hsv_s = delta / max;
if( rgbData[0] == max )
hsv_h = ( rgbData[1] - rgbData[2] ) / delta;
else if( rgbData[1] == max )
hsv_h=2+(rgbData[2]-rgbData[0])/delta;
else
hsv_h=4+(rgbData[0]-rgbData[1])/delta;
hsv_h *= 60;
if( hsv_h < 0 )
hsv_h += 360;
rgbData[3] = hsv_h;
rgbData[4] = hsv_s;
rgbData[5] = hsv_v;
}else {
// r = g = b = 0
hsv_s = 0;
hsv_h = -1;
rgbData[3] = hsv_h;
rgbData[4] = hsv_s;
rgbData[5] = hsv_v;
}
//Log the data in Android
//__android_log_print(ANDROID_LOG_VERBOSE, APPNAME, "The value R %f G %f B %f , H %f S %f V %f ", rgbData[0], rgbData[1], rgbData[2], rgbData[3], rgbData[4], rgbData[5]);
//Set RGB
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 0, 1, ( jdouble * ) &rgbData[0] );
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 1, 1, ( jdouble * ) &rgbData[1] );
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 2, 1, ( jdouble * ) &rgbData[2] );
//Set HSV
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 3, 1, ( jdouble * ) &rgbData[3] );
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 4, 1, ( jdouble * ) &rgbData[4] );
(*env)->SetDoubleArrayRegion(env, rgb_hsv, 5, 1, ( jdouble * ) &rgbData[5] );
//Release the array data
//(*env)->ReleaseByteArrayElements(env, yuv420sp, yuv, JNI_ABORT);
(*env)->ReleaseByteArrayElements(env, yuv420sp, yuv, 0);
//(*env)->ReleaseDoubleArrayElements(env, yuv420sp, yuv,rgbData,min,max,0);
}
Crash log
04-03 15:30:27.687 5547-5547/com.hlib E/dalvikvm: VM aborting
04-03 15:30:27.687 5547-5547/com.hlib A/libc: Fatal signal 6 (SIGABRT) at 0x000015ab (code=-6), thread 5547 (it.hlib)
I am not familiar with Native C under android, but I will try :)
The macros MIN & MAX look very peculiar, not sure about the syntax "({ instr })", would it be valuated to the last instruction ? why not use a more conventional definition "MAX(a, b) ((a) > (b) ? (a) : (b))" ? It would incur 3 evaluations instead of 2 ; but in any case, the MIN & MAX are used in the code in a nested manner, in this case it might be more efficient to use plain functions.
rgbData is initialized with a stack variable tmp[6] that will be destroyed after the end of the if block,
if(rgbDataSize < sz) {
double tmp[6];
rgbData = &tmp[0];
rgbDataSize = sz;
}
the result of
jbyte* yuv = (*env)->GetByteArrayElements(env, yuv420sp, &isCopy);
should be checked wether it is null, does it return an array of at least (w*h + w * (h/2) + w) bytes ? accessed through:
v = (0xff & yuv[uvp++]) - 128;
u = (0xff & yuv[uvp++]) - 128;
Are the "width" and "height" parameters correct with regards to the size of the bitmap "yuv" ?
the block
if( max != 0 ){
will divide by "delta", which might be 0.
does the next block need "{ }" to include both instructions ?
else
hsv_h=4+(rgbData[0]-rgbData[1])/delta;
hsv_h *= 60;
it seems that the 6 function calls could be replaced with only 1 call
<= (*env)->SetDoubleArrayRegion(env, rgb_hsv, 0, 1, ( jdouble * ) &rgbData[0] );
=> (*env)->SetDoubleArrayRegion(env, rgb_hsv, 0, 6, ( jdouble * ) &rgbData[0] );
That's what I could find out with a brief look.

How can I encode four 16 bit uints into a 64 bit uint, and decode them again?

I wrote this function with the help of this page on bit twiddling:
uint16_t *decode(uint64_t instr) {
// decode instr (this is new to me lol)
uint16_t icode = (instr >> 48) & ((1 << 16) - 1);
uint16_t p1 = (instr >> 32) & ((1 << 16) - 1);
uint16_t p2 = (instr >> 16) & ((1 << 16) - 1);
uint16_t p3 = (instr >> 00) & ((1 << 16) - 1);
return (uint16_t[]){icode, p1, p2, p3};
}
I have this to test it:
uint16_t *arr = decode(number);
for(int i = 0; i < 4; i++) {
printf("%d\n", arr[i]);
}
However, this prints 0 four times whatever number is. I also haven't solved the first part of the question, how to encode the four uint16_t's in the first place.
how to encode the four uint16_t's in the first place
This isn't hard. All you have to do is to load each uint16_t to a uint64_t one-by-one, and then return that uint64_t:
uint64_t encode(uint16_t uints[]) {
uint64_t master = 0;
for (uint8_t index = 0; index <= 3; ++index) {
master <<= 16; // Shift master left by 16 bits to create space for the next uint16
master |= uints[index]; // Load uints[index] to the lower 16 bits of master
} // Do this four times
return master;
}
To load the uint16_ts in reverse order, simply replace uint8_t index = 0; index <= 3; ++index with uint8_t index = 3; index >= 0; --index.
Your best bet is actually to use memcpy. Most modern compilers will optimize this into the necessary bit shifts and such for you.
uint64_t pack(const uint16_t arr[static 4]) {
uint64_t res;
memcpy(&res, arr, 8);
return res;
}
void unpack(uint64_t v, uint16_t arr[static 4]) {
memcpy(arr, &v, 8);
}
Note that the result is endian-dependent, appropriate for packing and unpacking on the same machine. Note too that I'm using the static array specifier to check that the caller passes at least 4 elements (when such checking is possible); if that gives your compiler grief, just remove the static specifier.
First, you can't pass an array back from a function the way you currently have it (which is why you're getting 0's), you'll need to pass it via pointer or static reference.
However, since you're dealing with 2 known bit-widths, you can use a mask and shift off that:
out[0] = val & 0x000000000000FFFF; // 1st word
out[1] = (val & 0x00000000FFFF0000) >> 16; // 2nd word
out[2] = (val & 0x0000FFFF00000000) >> 32; // 3rd word
out[3] = (val & 0xFFFF000000000000) >> 48; // 4th word
You could put this in a function or macro:
#define MACRO_DECODE(val, arr) arr[0]= val & 0x000000000000FFFF; \
arr[1] = (val & 0x00000000FFFF0000) >> 16; \
arr[2] = (val & 0x0000FFFF00000000) >> 32; \
arr[3] = (val & 0xFFFF000000000000) >> 48;
void decode(uint64_t val, uint16_t *out)
{
out[0] = val & 0x000000000000FFFF;
out[1] = (val & 0x00000000FFFF0000) >> 16;
out[2] = (val & 0x0000FFFF00000000) >> 32;
out[3] = (val & 0xFFFF000000000000) >> 48;
}
int main(int argc, char** argv)
{
int i;
uint16_t arr[] = { 0, 0, 0, 0} ;
for (i = 0; i < 4; ++i) {
printf("%#06x = %d\n", arr[i], arr[i]);
}
// as a function
decode(0xAAAABBBBCCCCDDDD, arr);
for (i = 0; i < 4; ++i) {
printf("%#06x = %d\n", arr[i], arr[i]);
}
// as a macro
MACRO_DECODE(0xDDDDCCCCBBBBAAAA, arr);
for (i = 0; i < 4; ++i) {
printf("%#06x = %d\n", arr[i], arr[i]);
}
return 0;
}
Additionally, you could use memcpy:
int main(int argc, char** argv)
{
int i;
uint16_t arr[] = { 0, 0, 0, 0} ;
uint64_t src = 0xAAAABBBBCCCCDDDD;
for (i = 0; i < 4; ++i) {
printf("%#06x = %d\n", arr[i], arr[i]);
}
// memcpy
memcpy(arr, &src, sizeof(arr));
for (i = 0; i < 4; ++i) {
printf("%#06x = %d\n", arr[i], arr[i]);
}
return 0;
}
Hope that can help.

parallelize prime sieve segmentation error

I changed the first version of the Code linked here to make the loop on line 360 (in the Code I shared below) run in parallel.
For this I replaced the fields\variables where the results are saved by fields of field\variables so that every thread can save them without deleting the results of other threads.
Additionally I replaced the calculation of values used in every loop pass in a way that they aren't depending on the value they had in the last loop pass (So that I can calculate them just depending on the value of the loop variable).
I am going to post the entire Code here be course the Minimal example would just be 10 lines shorter but miss any possibility to Check if the result is wrong or not. No special compiler features are used, just -fopenmp (under g++) as argument is necessary.
//
// prime_sieve.c
//
// Copyright (C) July 2002, Tomás Oliveira e Silva
//
// e-mail: tos#ua.pt
// www: http://www.ieeta.pt/~tos
//
// Comparison of two simple (but serious) implementations of the segmented sieve of
// Eratosthenes. The second implementation can generate primes reasonably fast near
// 1e18 (using around 400Mbytes of memory).
//
// _implementation_=0 gives a classical segmented sieve
// _implementation_=1 gives a cache-friendly segmented sieve
//
// See timing results for the two implementations at the end.
//
// Main idea: use one linked list for each interval of the segmented sieve, putting in it
// the primes than have an odd multiple in that interval (but not in a previous interval);
// this allows a better utilization of the processor data caches, giving significant time
// savings (up to a factor of 6) when working near 1e18. The amount of memory used is
// approximately 8*pi(sqrt(N)), where N is the last number of the interval, and pi(x) is
// the usual prime counting function.
//
// Assumptions: pointers have 4 bytes, gcc compiler
//
//
// Released under the GNU general public license (version 2 or any later version); see the
// gpl.txt file (or the page http://www.gnu.org/licenses/gpl.html) for details.
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
#include <math.h>
#include <time.h>
#include <stdio.h>
#include <stdlib.h>
#include <omp.h>
//
// configuration specification
//
// default parameters optimized for integers near 1e18; see tables at the end
//
#define nthreads 2
#ifndef _sieve_bits_log2_
# define _sieve_bits_log2_ 19
#endif
#ifndef _bucket_size_log2_
# define _bucket_size_log2_ 10
#endif
#if _bucket_size_log2_ > 16
# error "_bucket_size_log2_ is too large"
#endif
//
// basic type definitions
//
typedef unsigned char u08;
typedef unsigned int u32;
typedef unsigned long long u64;
//
// memory allocation
//
static void *get_memory(u32 size)
{
size_t m;
m = (size_t)malloc(size + 255); // this assumes that sizeof(void *) = sizeof(size_t)
if((void *)m == NULL)
exit(1);
m = (m + (size_t)255) & ~(size_t)255;
return (void *)m; // pointer aligned on a 256 byte boundary
}
//
// count the number of zeros
//
static u32 count_zero_bits(u08 *addr,u32 size)
{
static u32 data[256];
u32 i,j;
if(data[1] == 0)
for(i = 0;i < 256;i++)
for(j = i ^ 255;j;j >>= 1)
if(j & 1)
data[i]++;
j = 0;
for(i = 0;i < size;i++)
j += data[(u32)addr[i] & 255];
return j;
}
//
// generation of the (small) primes used by the main sieve
//
#define number_of_small_primes 6541// number of primes below 2 ^ 19
static u32 small_primes[number_of_small_primes];
static u32 small_sieve[nthreads][1024];//65 536 bits
static u32 small_base[nthreads];
static void update_small_sieve(u32 th_id)
{
u32 i,j;
for(j = 0;j < 1024;j++)
small_sieve[th_id][j] = 0;
for(i = 0;i < number_of_small_primes;i++)
{
j = small_primes[i] * small_primes[i];
if(j >= small_base[th_id] + 65536)
break;
if(j < small_base[th_id])
{
j = small_base[th_id] / small_primes[i];
j *= small_primes[i];
if(j < small_base[th_id])
j += small_primes[i];
if((j & 1) == 0)
j += small_primes[i];
}
for(j = (j - small_base[th_id]) >> 1;j < 32768;j += small_primes[i])
small_sieve[th_id][j >> 5] |= 1 << (j & 31);
}
}
//
// main sieve
//
// the following structure is used to record the
// information required to sieve an interval
//
// the value of _bucket_size_log2_ should
// be small (and a multiple of the L1 or L2 data cache line size)
//
#define primes_per_bucket ((1 << (_bucket_size_log2_ - 3)) - 1)
typedef struct bucket
{
struct bucket *next; // pointer to next bucket
u32 count; // count of the number of primes in this bucket
struct
{
u32 p; // prime
u32 o; // the bit number of the first odd multiple (>= main_base) of the prime
}
data[primes_per_bucket];
}
bucket;
static u32 main_sieve[nthreads][1 << (_sieve_bits_log2_ - 5)];
static u64 main_limit; // wird nicht parallel geaendert
static bucket **main_lists[nthreads],*available_buckets[nthreads];
static u32 list_size_log2;
void more_buckets(int th_id) { u32 i,j; i = 1 << (20 - _bucket_size_log2_);
available_buckets[th_id] = (bucket *)get_memory(i * sizeof(bucket)); for(j = 0;j < i;j++)
available_buckets[th_id][j].next = (j < i - 1) ? &available_buckets[th_id][j + 1] : NULL; }
void new_bucket(u64 k,int th_id) { bucket *b; if(available_buckets[th_id] == NULL) more_buckets(th_id);
b = available_buckets[th_id]; available_buckets[th_id] = available_buckets[th_id]->next;
b->next = main_lists[th_id][k]; main_lists[th_id][k] = b; b->count = 0; }
static void init_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
u64 t,end;
u32 i,j;
u32 k;
end = main_base + (u64)(2 << _sieve_bits_log2_);
if ( small_base[th_id] != (next_prime/65536) * 65536) {
small_base[th_id] = (next_prime/65536) * 65536;
update_small_sieve(th_id);
}
while((t = (u64)next_prime * (u64)next_prime) < end)
{
if(next_prime >= small_base[th_id] + 65536)
{
small_base[th_id] += 65536;
update_small_sieve(th_id);
}
// primes are (beside two) always odd so they have at least a distance of 2.
// you dont have to save information about even numbers, so divide distance by two.
i = (next_prime - small_base[th_id]) >> 1;
if((small_sieve[th_id][i >> 5] & (1 << (i & 31))) == 0)// is nextprime a prime?
{
if(t < main_base) // setze t auf das erste vielfache der Primzahl > main_base
{
t = main_base / (u64)next_prime;
t *= (u64)next_prime;
if(t < main_base)
t += (u64)next_prime;
if(((u32)t & 1) == 0)
t += (u64)next_prime;
}
i = (u32)((t - main_base) >> 1); // bit number
k = (current_list + (i >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
if(main_lists[th_id][k]->count == primes_per_bucket){
//#pragma omp critical
new_bucket(k, th_id);
}
j = main_lists[th_id][k]->count++;
main_lists[th_id][k]->data[j].p = next_prime;
main_lists[th_id][k]->data[j].o = i & ((1 << _sieve_bits_log2_) - 1);
}
// atomic add
next_prime += 2;
}
}
static void do_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
bucket *b;
bucket *c;
u32 j,k;
u32 i,p,o;
init_main_sieve(main_base, th_id, next_prime, current_list);
for(i = 0;i < (1 << (_sieve_bits_log2_ - 5));i++)
main_sieve[th_id][i] = 0;
b = main_lists[th_id][current_list];
while(b != NULL)
{
for(i = 0;i < b->count;i++)
{
p = b->data[i].p;
for(o = b->data[i].o;o < (1 << _sieve_bits_log2_);o += p)
//finde das entsprechende u32 feld mit allen bits von o auser den 5 letzten
// und finde mit den letzten 5 bits von 0 die Stelle in dem u32 wert die du auf 1
// dh. vielfaches einer Zahl, setzt
main_sieve[th_id][o >> 5] |= 1 << (o & 31);
k = (current_list + (o >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
if(main_lists[th_id][k]->count == primes_per_bucket) {
//#pragma omp critical
new_bucket(k, th_id);
}
j = main_lists[th_id][k]->count++;
main_lists[th_id][k]->data[j].p = p;
main_lists[th_id][k]->data[j].o = o & ((1 << _sieve_bits_log2_) - 1);
}
c = b;
b = b->next;
c->next = available_buckets[th_id];
available_buckets[th_id] = c;
}
main_lists[th_id][current_list] = NULL;
#pragma omp critical
new_bucket(current_list, th_id);
//current_list = (current_list + 1) & ((1 << list_size_log2) - 1);
}
void set_small_primes(void)
{
u32 i,j;
if(small_primes[0] == 0)
{ // initialize the small_primes array
for(j = 0;j < 1024;j++)
small_sieve[0][j] = 0;
for(i = 3;i < 256;i += 2)// 256 ^2 = 65 536
if((small_sieve[0][i >> 6] & (1 << ((i >> 1) & 31))) == 0)
for(j = (i * i) >> 1;j < 32768;j += i)
small_sieve[0][j >> 5] |= 1 << (j & 31);
j = 0;
for(i = 3;i < 65536;i += 2)
if((small_sieve[0][i >> 6] & (1 << ((i >> 1) & 31))) == 0)
small_primes[j++] = i;
if(j != number_of_small_primes)
exit(2); // this should never happen
}
}
//
// main program
//
int main(int argc,char **argv)
{
double t;
u32 i,j;
u64 pi, counter=0;
u64 main_base;
int ntasks = 1;
u32 next_prime = 3;
u32 current_list = 0;
omp_set_num_threads(nthreads);
if(argc == 1)
i = 15;
else
i = atoi(argv[1]);
if(i < 6)
i = 6;
if(i > 18)
i = 18;
printf("%2u %2u",_sieve_bits_log2_,_bucket_size_log2_);
main_base = 1ull;
for(j = 0;j < i;j++)
main_base *= 10ull;
main_limit = main_base + 2000000000ull;
// set list_size_log2
u32 l;
l = 1 + (u32)ceil(sqrt((double)main_limit));
l = 2 + (l >> _sieve_bits_log2_);
for(list_size_log2 = 2;(1 << list_size_log2) < l;list_size_log2++)
;
//set main_lists
for (int i = 0; i < nthreads;i++) {
available_buckets[i] = NULL;
main_lists[i] = (bucket **)get_memory((1 << list_size_log2) * sizeof(bucket *));
for(u32 k = 0;k < (1 << list_size_log2);k++)
{
main_lists[i][k] = NULL;
new_bucket(k, i);
}
}
//set_small_primes
t = (double)clock();
for (int i = 0; i < nthreads;i++) small_base[i] = 0;
set_small_primes();
printf(" %2d",i);
// init main sieve
init_main_sieve(main_base,0, next_prime, current_list);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %6.2f",t);
j = 1 << (_sieve_bits_log2_ - 3);
pi = 0ull;
main_limit = main_base + 1000000000ull;
if(((u32)main_base | (u32)main_limit) & 63)
{
fprintf(stderr,"Warning: prime number counts may be incorrect\n");
fprintf(stderr," main_base and main_limit should be multiples of 64\n");
}
// calculate iteration count fast
t = (double)clock();
u64 main_base_tmp = main_base;
const u64 main_base_const = main_base_tmp;
for(;;)
{
i = (u32)(main_limit - main_base_tmp) >> 4;
if(i <= j)
break;
main_base_tmp += (u64)j << 4;
counter++;
}
{
//prepare values
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const;
u64 end = main_base_private + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
next_prime = next_prime_private;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list);
// calculate results
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
}
while (1) printf("B");
#pragma omp parallel for //private (main_base)
for(u64 c=1;c<counter;c++)
{
//prepare values
u32 current_list_private = current_list;
for (u64 count = 0; count < c; count++)
current_list_private = (current_list_private + 1) & ((1 << list_size_log2) - 1);
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const+((u64)j << 4)*(c);
u64 end = main_base_const+((u64)j << 4)*(c-1) + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list_private);
// calculate results
#pragma omp atomic
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
printf(" %llu",c);
}
main_base = main_base_const+((u64)j << 4)*(counter);
u64 end = main_base + (u64)(2 << _sieve_bits_log2_);
while ((u64) next_prime * (u64) next_prime < end) next_prime += 2;
for (u64 count = 0; count < counter; count++)
current_list = (current_list + 1) & ((1 << list_size_log2) - 1);
do_main_sieve(main_base, 0, next_prime, current_list);
i = (u32)(main_limit - main_base) >> 4;
pi += (u64)count_zero_bits((u08 *)main_sieve[0],i);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %7.2f %8llu\n",t,pi);
return 0;
}
I checked all variables used in this Code, they should be not depending on anything else then the loop variable (and other variables calculated only depending on the loop variable). To be specific
next_prime, main_base, small_base, small_sieve, available_buckets_buckets, main_sieve, current_list should not cause any trouble.
If would really appreciate it if anybody could have a look at it and tell me why I always get the same wrong result if I choose a threadnum > 1.
May some IDE's could tell more about that too but I use Codelite rarely and don't know how to get this information.

speeding up md5 program

This is a example of md5 in C, but the program is very slow it takes a little over a second to encode a simple string, What is slowing the program down?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
// Constants are the integer part of the sines of integers (in radians) * 2^32.
const uint32_t k[64] = {
0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee ,
0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501 ,
0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be ,
0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821 ,
0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa ,
0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8 ,
0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed ,
0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a ,
0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c ,
0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70 ,
0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05 ,
0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665 ,
0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039 ,
0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1 ,
0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1 ,
0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391 };
// r specifies the per-round shift amounts
const uint32_t r[] = {7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22, 7, 12, 17, 22,
5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20, 5, 9, 14, 20,
4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23, 4, 11, 16, 23,
6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21, 6, 10, 15, 21};
// leftrotate function definition
#define LEFTROTATE(x, c) (((x) << (c)) | ((x) >> (32 - (c))))
void to_bytes(uint32_t val, uint8_t *bytes)
{
bytes[0] = (uint8_t) val;
bytes[1] = (uint8_t) (val >> 8);
bytes[2] = (uint8_t) (val >> 16);
bytes[3] = (uint8_t) (val >> 24);
}
uint32_t to_int32(const uint8_t *bytes)
{
return (uint32_t) bytes[0]
| ((uint32_t) bytes[1] << 8)
| ((uint32_t) bytes[2] << 16)
| ((uint32_t) bytes[3] << 24);
}
void md5(const uint8_t *initial_msg, size_t initial_len, uint8_t *digest) {
// These vars will contain the hash
uint32_t h0, h1, h2, h3;
// Message (to prepare)
uint8_t *msg = NULL;
size_t new_len, offset;
uint32_t w[16];
uint32_t a, b, c, d, i, f, g, temp;
// Initialize variables - simple count in nibbles:
h0 = 0x67452301;
h1 = 0xefcdab89;
h2 = 0x98badcfe;
h3 = 0x10325476;
//Pre-processing:
//append "1" bit to message
//append "0" bits until message length in bits ≡ 448 (mod 512)
//append length mod (2^64) to message
for (new_len = initial_len + 1; new_len % (512/8) != 448/8; new_len++)
;
msg = (uint8_t*)malloc(new_len + 8);
memcpy(msg, initial_msg, initial_len);
msg[initial_len] = 0x80; // append the "1" bit; most significant bit is "first"
for (offset = initial_len + 1; offset < new_len; offset++)
msg[offset] = 0; // append "0" bits
// append the len in bits at the end of the buffer.
to_bytes(initial_len*8, msg + new_len);
// initial_len>>29 == initial_len*8>>32, but avoids overflow.
to_bytes(initial_len>>29, msg + new_len + 4);
// Process the message in successive 512-bit chunks:
//for each 512-bit chunk of message:
for(offset=0; offset<new_len; offset += (512/8)) {
// break chunk into sixteen 32-bit words w[j], 0 ≤ j ≤ 15
for (i = 0; i < 16; i++)
w[i] = to_int32(msg + offset + i*4);
// Initialize hash value for this chunk:
a = h0;
b = h1;
c = h2;
d = h3;
// Main loop:
for(i = 0; i<64; i++) {
if (i < 16) {
f = (b & c) | ((~b) & d);
g = i;
} else if (i < 32) {
f = (d & b) | ((~d) & c);
g = (5*i + 1) % 16;
} else if (i < 48) {
f = b ^ c ^ d;
g = (3*i + 5) % 16;
} else {
f = c ^ (b | (~d));
g = (7*i) % 16;
}
temp = d;
d = c;
c = b;
b = b + LEFTROTATE((a + f + k[i] + w[g]), r[i]);
a = temp;
}
// Add this chunk's hash to result so far:
h0 += a;
h1 += b;
h2 += c;
h3 += d;
}
// cleanup
free(msg);
//var char digest[16] := h0 append h1 append h2 append h3 //(Output is in little-endian)
to_bytes(h0, digest);
to_bytes(h1, digest + 4);
to_bytes(h2, digest + 8);
to_bytes(h3, digest + 12);
}
int main(int argc, char **argv) {
char *msg = argv[1];
size_t len;
int i;
uint8_t result[16];
if (argc < 2) {
printf("usage: %s 'string'\n", argv[0]);
return 1;
}
len = strlen(msg);
// benchmark
for (i = 0; i < 1000000; i++) {
md5((uint8_t*)msg, len, result);
}
// display result
for (i = 0; i < 16; i++)
printf("%2.2x", result[i]);
puts("");
return 0;
}
I might be missing something, but:
// benchmark
for (i = 0; i < 1000000; i++) {
md5((uint8_t*)msg, len, result);
}
performs the same code one million times? A good idea to get an idea of how efficient it is, but not to actually do any work, efficiently.

Resources