MD5 Padding and Little Endians - md5

I have been trying to recreate the MD5 algorithm on my own.
I just can't seem to get the algorithm right. It seems that I
have a problem on padding and endianness.
#include <stdio.h>
#include <stdint.h>
/* F, G, H and I are basic MD5 functions.
*/
#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
#define H(x, y, z) ((x) ^ (y) ^ (z))
#define I(x, y, z) ((y) ^ ((x) | (~z)))
/* ROTATE_LEFT rotates x left n bits.
*/
#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
Rotation is separate from addition to prevent recomputation.
*/
#define FF(a, b, c, d, x, s, ac) { \
(a) += F ((b), (c), (d)) + (x) + (uint32_t)(ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define GG(a, b, c, d, x, s, ac) { \
(a) += G ((b), (c), (d)) + (x) + (uint32_t)(ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define HH(a, b, c, d, x, s, ac) { \
(a) += H ((b), (c), (d)) + (x) + (uint32_t)(ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define II(a, b, c, d, x, s, ac) { \
(a) += I ((b), (c), (d)) + (x) + (uint32_t)(ac); \
(a) = ROTATE_LEFT ((a), (s)); \
(a) += (b); \
}
#define S11 7
#define S12 12
#define S13 17
#define S14 22
#define S21 5
#define S22 9
#define S23 14
#define S24 20
#define S31 4
#define S32 11
#define S33 16
#define S34 23
#define S41 6
#define S42 10
#define S43 15
#define S44 21
void MD5_hash(uint32_t *message, uint32_t *digest) {
const uint32_t d0 = 0x67452301;
const uint32_t d1 = 0xEFCDAB89;
const uint32_t d2 = 0x98BADCFE;
const uint32_t d3 = 0x10325476;
uint32_t a, b, c, d, *x;
a = d0;
b = d1;
c = d2;
d = d3;
x = message;
/* Round 1 */
FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
/* Round 2 */
GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
GG (d, a, b, c, x[10], S22, 0x02441453); /* 22 */
GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
/* Round 3 */
HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
HH (b, c, d, a, x[ 6], S34, 0x04881d05); /* 44 */
HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
/* Round 4 */
II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
a += d0;
b += d1;
c += d2;
d += d3;
digest[0] = a;
digest[1] = b;
digest[2] = c;
digest[3] = d;
}
int main(void) {
uint32_t message[16], digest[4];
message[0] = 0x61800000;
message[1] = 0x00000000;
message[2] = 0x00000000;
message[3] = 0x00000000;
message[4] = 0x00000000;
message[5] = 0x00000000;
message[6] = 0x00000000;
message[7] = 0x00000000;
message[8] = 0x00000000;
message[9] = 0x00000000;
message[10] = 0x00000000;
message[11] = 0x00000000;
message[12] = 0x00000000;
message[13] = 0x00000000;
message[14] = 0x08000000;
message[15] = 0x00000000;
digest[0] = 0x00000000;
digest[1] = 0x00000000;
digest[2] = 0x00000000;
digest[3] = 0x00000000;
MD5_hash(message, digest);
printf("%08X %08X %08X %08X\n", digest[0], digest[1], digest[2], digest[3]);
}
The above code is my implementation. Now, my question is, how do i pad a certain message? for example, if my message is 'a', then the message is : 0x61800000...08000000000000. is this correct? (message[0] = 0x61800000 ... message[14] = 0x08000000, message[15] = 0x000000000). I think I might be wrong in my endianness or my interpretation of the padding instructions. can anyone please enlighten me?
(The output of the above code is: 5058CD0E 2476E559 CF86AEA4 8A173599);

Well I did my own implementation and I hit the same wall/obstacle during implementation.
A + Padding is 0x61800000 it's correct, but the 64 bits used for length indication is in big-endianness format. So out of those 64 bits the last 4 bytes are zero and the 4 bytes preceding those contains the length of the message bytes*8 bits, the least significant bits preceding the most significant bits.
Another problem with the code you copied from the RFC standard of the MD5 hash. The MD5Transform function copies the data from char array format (left-to-right) and requires them to be packed in a 32bit integer in (little-endian) format. After performing the computational cycles you need to translate the results from (little-endian) 32bit integer into the char array format requiring you to reverse the encoding step that was done during the initial call of MD5Transform.
Hope this helps.

I've recently had similar trouble with implementing MD5. If you're on Windows or another OS that uses little-endian, try this:
message[0] = 0x00008061;
message[1] = 0x00000000;
message[2] = 0x00000000;
message[3] = 0x00000000;
message[4] = 0x00000000;
message[5] = 0x00000000;
message[6] = 0x00000000;
message[7] = 0x00000000;
message[8] = 0x00000000;
message[9] = 0x00000000;
message[10] = 0x00000000;
message[11] = 0x00000000;
message[12] = 0x00000000;
message[13] = 0x00000000;
message[14] = 0x00000008;
message[15] = 0x00000000;
This might not be your only problem (if it is indeed the case). I have tried using your big-endian paddings on my little-endian OS with a correct MD5 algorithm (that has been confirmed to give correct output with little-endian padding), and it gives me the following output: 0ecd585059e57624a4ae86cf9935178a This probably means that you just have to look through your algorithm again.

I realize this is quite an old question but I came across it while attempting to do my own implementation of RIPEMD-160 and trying to figure out the message block and message schedule construction process.
What solved it for me was appending the padding bits/bytes to the message bytes up to the remaining 64bits designated for the message length.
I then added the message length as a uint64 in Little Endian. For example, for the input string "a", this is what the message block looks like in 32bit groups in hex (from left to right):
61 80 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00 08 00 00 00 00 00 00 00
Following that, I converted all the 32bit groups into uint32 words in Little Endian and ran these values through the compression function.
The final "gotcha" step I ran into was converting the results of the compression function to a 160bit/20byte output value of the 5 registers ALSO IN LITTLE ENDIAN:
final_value := 20byte array
append(final_value, LittleEndian(h0))
append(final_value, LittleEndian(h1))
append(final_value, LittleEndian(h2))
append(final_value, LittleEndian(h3))
append(final_value, LittleEndian(h4))
This final value could then be output as a hex string and it matched the reference values given in the Dobbertin et. al. paper
Cheers

Related

About the adress arithmetic in C

It's from a elementary question from the very beginning of MIT6.s081, Why does the fifth
printf's result is:
5: a[0] = 200, a[1] = 128144, a[2] = 256, a[3] = 302
I really can't imagine how 128144 comes!
Here is the code:
#include <stdio.h>
#include <stdlib.h>
void f(void)
{
int a[4];
int *b = malloc(16);
int *c;
int i;
printf("1: a = %p, b = %p, c = %p\n", a, b, c);
c = a;
for (i = 0; i < 4; i++)
a[i] = 100 + i;
c[0] = 200;
printf("2: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c[1] = 300;
*(c + 2) = 301;
c[3] = 302;
printf("3: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c = c + 1;
*c = 400;
printf("4: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c = (int *)((char *)c + 1);
*c = 500;
printf("5: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
b = (int *)a + 1;
c = (int *)((char *)a + 1);
printf("6: a = %p, b = %p, c = %p\n", a, b, c);
}
int main(int ac, char **av)
{
f();
return 0;
}
A standard integer is 4 bytes in width, meaning that, in memory, your 500 will look like this (assuming Big endian): 0x00 0x00 0x01 0xF4.
If you do c + 1, with c being an int * you are actually moving the pointer by these four bytes.
By casting c to a char *, you change the step width of the pointer to 1, meaning that it will now point to this location: 0x00 0x00 0x01 0xF4
By writing your 500 to that value, the same four bytes will now look like this: 0x00 0x00 0x00 0x01 (the remaining 0xF4 will be written to the next integer).
I am now assuming that your machine is little endian, meaning the bytes are 'reversed'.
0x90 0xF4 0x01 0x00
The 0x90 is leftover from the assignment of 400 in the previous iteration and by putting them all together, you get 0x01f490 or 128144.
After 4: a[0] = 200, a[1] = 400, a[2] = 301, a[3] = 302
The binary value of the array pointed by a and c is as following:
a
c
C8 00 00 00 | 90 01 00 00 | 2D 01 00 00 | 2E 01 00 00
After c = (int *) ((char *) c + 1);
a
| c
| |
C8 00 00 00 | 90 01 00 00 | 2D 01 00 00 | 2E 01 00 00
After *c = 500; // 500 = 0x000001F4
a
| c
| |
C8 00 00 00 | 90 F4 01 00 | 00 01 00 00 | 2E 01 00 00
So a[1] = 0x0001F490; a[2] = 0x00000100;
And 0x0001F490 = 128144, 0x00000100 = 256.

Understanding Pointers (in C): converting int pointer to char and changing values

I have the following code, which deals with pointers in C
void
f(void)
{
int a[4];
int *b = malloc(16);
int *c;
int i;
printf("1: a = %p, b = %p, c = %p\n", a, b, c);
c = a;
for (i = 0; i < 4; i++)
a[i] = 100 + i;
c[0] = 200;
printf("2: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c[1] = 300;
*(c + 2) = 301;
3[c] = 302;
printf("3: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c = c + 1;
*c = 400;
printf("4: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
c = (int *) ((char *) c + 1);
*c = 500;
printf("5: a[0] = %d, a[1] = %d, a[2] = %d, a[3] = %d\n",
a[0], a[1], a[2], a[3]);
b = (int *) a + 1;
c = (int *) ((char *) a + 1);
printf("6: a = %p, b = %p, c = %p\n", a, b, c);
}
int
main(int ac, char **av)
{
f();
return 0;
}
Output:
1: a = 0x7fff5fbff710, b = 0x1003002e0, c = 0x100000000
2: a[0] = 200, a[1] = 101, a[2] = 102, a[3] = 103
3: a[0] = 200, a[1] = 300, a[2] = 301, a[3] = 302
4: a[0] = 200, a[1] = 400, a[2] = 301, a[3] = 302
5: a[0] = 200, a[1] = 128144, a[2] = 256, a[3] = 302
6: a = 0x7fff5fbff710, b = 0x7fff5fbff714, c = 0x7fff5fbff711
Program ended with exit code: 0
I understand why a[1] is 128144 (we're moving a byte forward when we cast the int pointer to char +1 and over-write 500), but I don't understand why a[2] is 256 [the first bit will get over-written, but that doesn't give 256]. I'll highly appreciate help! Thanks in advance :)
Short answer: You modified c so it no longer has 4-byte integer alignment, and setting it to 500 modified two of the array elements. Also, that unaligned access is generally a no-no and will cause alignment faults on some systems.
Long answer: At step 4, a[1] contains 0x00000190 which is 400, and a[2] contains 0x0000012D which is 301.
Let's assume a[0] starts at memory address 0. Here is how the bytes are arranged in a little-endian system (the endianness is very important here), left-to-right from 0 to 15:
ADDR: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
DATA: C8 00 00 00 90 01 00 00 2D 01 00 00 2E 01 00 00
At step 5 you've unaligned c by one byte via your cast to a char *, so it points to address 05 instead of 04. You assign the value 500 (0x000001F4) to it, which overwrites the bytes from 05-08, resulting in this memory:
ADDR: 00 01 02 03 04 05 06 07 08 09 0A 0B 0C 0D 0E 0F
DATA: C8 00 00 00 90 F4 01 00 00 01 00 00 2E 01 00 00
When you read a[1] you get 0x0001F490 which is 128144.
When you read a[2] you get 0x00000100 which is 256.
the thing to realize is that this
c = (int *) ((char *) c + 1);
makes c point to some of a[1] and some of a[2]. You move c over by one byte and that make that a int pointer. This makes *c address the last 3 bytes of a[1] and the first byte of a[2]. Draw out all the bits and see what happens

How can I read and obtain separated data from a file using 'fread' in C?

I've written in a file (using 'fwrite()') the following:
TUS�ABQ���������������(A����������(A��B������(A��B���A��(A��B���A������B���A������0����A������0�ABQ�������0�ABQ�����LAS����������������A�����������A��&B�������A��&B��B���A��&B��B������&B��
B����153���B����153�LAS�����153�LAS�����LAX���������������:A����������:AUUB������:AUUB��B��:
AUUB��B����UUB��B����������B��������LAX���������LAX�����MDW���������������A����������A��(�������A��(����A��A��(����A������(����A����A�89���A����A�89MDW�����A�89MDW�����OAK���������
����������������������#�����������#�����������#�����������#�������������������������OAK���������OAK�����SAN���������������LA����������LA��P#������LA��P#��#A��LA��P#��#A������P#��#A����������#A��������SAN���������SAN�����TPA�ABQ����������������B�����������B��#�����...(continues)
which is translated to this:
TUSLWD2.103.47.775.1904.06.40.03AMBRFD4.63.228.935.0043.09.113.0ASDGHU5.226.47.78.3.26...(The same structure)
and the hexdump of that would be:
00000000 54 55 53 00 41 42 51 00 00 00 00 00 00 00 00 00 |TUS.ABQ.........|
00000010 00 00 00 00 00 00 28 41 00 00 0e 42 00 00 f8 41 |......(A...B...A|
00000020 00 00 00 00 4c 41 53 00 00 00 00 00 00 00 00 00 |....LAS.........|
00000030 00 00 00 00 00 00 88 41 00 00 26 42 9a 99 11 42 |.......A..&B...B|
(Continues...)
the structure is, always 2 words of 3 characters each one (i.e. TUS and LWD) followed by 7 floats, and then it repeats again on a on until end of file.
The key thing is: I just want to read every field separated like 'TUS', 'LWD', '2.10', '3.4', '7.77'...
And I can only use 'fread()' to achieve that! For now, I'm trying this:
aux2 = 0;
fseek(fp, SEEK_SET, 0);
fileSize = 0;
while (!feof(fp) && aux<=2) {
fread(buffer, sizeof(char)*4, 1, fp);
printf("%s", buffer);
fread(buffer, sizeof(char)*4, 1, fp);
printf("%s", buffer);
for(i=0; i<7; i++){
fread(&delay, sizeof(float), 1, fp);
printf("%f", delay);
}
printf("\n");
aux++;
fseek(fp,sizeof(char)*7+sizeof(float)*7,SEEK_SET);
aux2+=36;
}
And I get this result:
TUSABQ0.0000000.0000000.00000010.5000000.0000000.00000010.500000
AB0.0000000.000000-10384675421112248092159136000638976.0000000.0000000.000000-10384675421112248092159136000638976.0000000.000000
AB0.0000000.000000-10384675421112248092159136000638976.0000000.0000000.000000-10384675421112248092159136000638976.0000000.000000
But it does not works correctly...
*Note: forget the arguments of the last 'fseek()', cos I've been trying too many meaningless things!
To write the words (i.e. TUS) into the file, I use this:
fwrite(x->data->key, 4, sizeof(char), fp);
and to write the floats, this:
for (i = 0; i < 7; i++) {
fwrite(&current->data->retrasos[i], sizeof(float), sizeof(float), fp);
}
I'd recommend using a structure to hold each data unit:
typedef struct {
float value[7];
char word1[5]; /* 4 + '\0' */
char word2[5]; /* 4 + '\0' */
} unit;
To make the file format portable, you need a function that packs and unpacks the above structure to/from a 36-byte array. On Intel and AMD architectures, float corresponds to IEEE-754-2008 binary32 format in little-endian byte order. For example,
#define STORAGE_UNIT (4+4+7*4)
#if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
size_t unit_pack(char *target, const size_t target_len, const unit *source)
{
size_t i;
if (!target || target_len < STORAGE_UNIT || !source) {
errno = EINVAL;
return 0;
}
memcpy(target + 0, source->word1, 4);
memcpy(target + 4, source->word2, 4);
for (i = 0; i < 7; i++)
memcpy(target + 8 + 4*i, &(source->value[i]), 4);
return STORAGE_UNIT;
}
size_t unit_unpack(unit *target, const char *source, const size_t source_len)
{
size_t i;
if (!target || !source || source_len < STORAGE_UNIT) {
errno = EINVAL;
return 0;
}
memcpy(target->word1, source, 4);
target->word1[4] = '\0';
memcpy(target->word2, source + 4, 4);
target->word2[4] = '\0';
for (i = 0; i < 7; i++)
memcpy(&(target->value[i]), source + 8 + i*4, 4);
return STORAGE_UNIT;
}
#else
#error Unsupported architecture!
#endif
The above only works on Intel and AMD machines, but it is certainly easy to extend to other architectures if necessary. (Almost all machines currently use IEEE 754-2008 binary32 for float, only the byte order varies. Those that do not, typically have C extensions that do the conversion to/from their internal formats.)
Using the above, you can -- should! must! -- document your file format, for example as follows:
Words are 4 bytes encoded in UTF-8
Floats are IEEE 754-2008 binary32 values in little-endian byte order
A file contains one or more units. Each unit comprises of
Name Description
word1 First word
word2 Second word
value0 First float
value1 Second float
value2 Third float
value3 Fourth float
value4 Fifth float
value5 Sixth float
value6 Second float
There is no padding.
To write an unit, use a char array of size STORAGE_UNIT as a cache, and write that. So, if you have unit *one, you can write it to FILE *out using
char buffer[STORAGE_UNIT];
if (unit_pack(buffer, sizeof buffer, one)) {
/* Error! Abort program! */
}
if (fwrite(buffer, STORAGE_UNIT, 1, out) != 1) {
/* Write error! Abort program! */
}
Correspondingly, reading from FILE *in would be
char buffer[STORAGE_UNIT];
if (fread(buffer, STORAGE_UNIT, 1, in) != 1) {
/* End of file, or read error.
Check feof(in) or/and ferror(in). */
}
if (unit_unpack(one, buffer, STORAGE_UNIT)) {
/* Error! Abort program! */
}
If one is an array of units, and you are writing or reading one[k], use &(one[k]) (or equivalently one + k) instead of one.

Only storing 2 first floats of a __m128 variable in C

I have an array with room for two float numbers, and I have a __m128 variable. I want to only store the two first floats of the __m128 variable.
What I'm doing now is
_mm_storeu_ps((float*)a, m0); //a is the array, m0 is the __m128 variable
this puts the first two floats of m0 into a, but it also continues to store its last two floats beyond the memory of a.
You can use the _mm_storel_pi intrinsic. This intrinsic generates a single movlps instruction. Here is an example. Functions sample1-sample4 demonstrate suggestions so far. Sample5 demonstrates the _mm_storel_pi method.
#include <stdio.h>
#include <intrin.h>
//-----------------------------------------
void sample1 (float *a, __m128 m0)
{
_mm_storeu_ps(a, m0); //a is the array, m0 is the __m128 variable
}
//-----------------------------------------
void sample2 (float *a, __m128 m0)
{
float *p = (float *)&m0;
a[0] = p[0];
a[1] = p[1];
}
//-----------------------------------------
void sample3 (float *a, __m128 m0)
{
_mm_store_ss(&a[0], m0);
_mm_store_ss(&a[1], _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1,1,1,1)));
}
//-----------------------------------------
void sample4 (float *a, __m128 m0)
{
union { __m128 i; float f[4]; } u;
u.i = m0;
a [0] = u.f[0];
a [1] = u.f[1];
}
//-----------------------------------------
void sample5 (float *a, __m128 m0)
{
_mm_storel_pi ((__m64 *)a, m0);
}
//-----------------------------------------
void printa (float *a)
{
printf ("%g %g %g %g\n", a [0], a [1], a [2], a [3]);
}
//-----------------------------------------
int main (void)
{
__m128 m0 = _mm_set_ps (1.0, 2.0, 3.0, 4.0);
float a [4];
memset (a, 0, sizeof a);
sample1 (a, m0);
printa (a);
memset (a, 0, sizeof a);
sample2 (a, m0);
printa (a);
memset (a, 0, sizeof a);
sample3 (a, m0);
printa (a);
memset (a, 0, sizeof a);
sample4 (a, m0);
printa (a);
memset (a, 0, sizeof a);
sample5 (a, m0);
printa (a);
return 0;
}
//-------------------------------------
output:
4 3 2 1
4 3 0 0
4 3 0 0
4 3 0 0
4 3 0 0
Here is gcc 4.8.1 x64 code generation for the functions:
0000000000401510 <sample1>:
401510: 0f 28 02 movaps xmm0,XMMWORD PTR [rdx]
401513: 0f 11 01 movups XMMWORD PTR [rcx],xmm0
401516: c3 ret
0000000000401520 <sample2>:
401520: 0f 28 02 movaps xmm0,XMMWORD PTR [rdx]
401523: f3 0f 11 01 movss DWORD PTR [rcx],xmm0
401527: 0f c6 c0 55 shufps xmm0,xmm0,0x55
40152b: f3 0f 11 41 04 movss DWORD PTR [rcx+0x4],xmm0
401530: c3 ret
0000000000401540 <sample3>:
401540: 0f 28 02 movaps xmm0,XMMWORD PTR [rdx]
401543: f3 0f 11 01 movss DWORD PTR [rcx],xmm0
401547: 0f c6 c0 55 shufps xmm0,xmm0,0x55
40154b: f3 0f 11 41 04 movss DWORD PTR [rcx+0x4],xmm0
401550: c3 ret
0000000000401560 <sample4>:
401560: 48 8b 02 mov rax,QWORD PTR [rdx]
401563: 89 01 mov DWORD PTR [rcx],eax
401565: 48 c1 e8 20 shr rax,0x20
401569: 89 41 04 mov DWORD PTR [rcx+0x4],eax
40156c: c3 ret
0000000000401570 <sample5>:
401570: 0f 28 02 movaps xmm0,XMMWORD PTR [rdx]
401573: 0f 13 01 movlps QWORD PTR [rcx],xmm0
401576: c3 ret
You have a couple of options:
Option 1
You can cast a pointer to the __m128 to a float* and index it accordingly:
float *p = (float *)&m0;
a[0] = p[0];
a[1] = p[1];
Some people prefer to create a union of an array of 4 floats and a __m128, which performance wise would be very similar.
Option 2
If you only want to use the SSE intrinsics, you can use _mm_store_ss and _mm_shuffle_ps:
_mm_store_ss(&a[0], m0);
_mm_store_ss(&a[1], _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(1,1,1,1)));
The shuffle instructions in SSE are extremely useful, read more about them here.

How do I handle byte order differences when reading/writing floating-point types in C?

I'm devising a file format for my application, and I'd obviously like for it to work on both big-endian and little-endian systems. I've already found working solutions for managing integral types using htonl and ntohl, but I'm a bit stuck when trying to do the same with float and double values.
Given the nature of how floating-point representations work, I would assume that the standard byte-order functions won't work on these values. Likewise, I'm not even entirely sure if endianness in the traditional sense is what governs the byte order of these types.
All I need is consistency. A way to write a double out, and ensure I get that same value when I read it back in. How can I do this in C?
Another option could be to use double frexp(double value, int *exp); from <math.h> (C99) to break down the floating-point value into a normalized fraction (in the range [0.5, 1)) and an integral power of 2. You can then multiply the fraction by FLT_RADIXDBL_MANT_DIG to get an integer in the range [FLT_RADIXDBL_MANT_DIG/2, FLT_RADIXDBL_MANT_DIG). Then you save both integers big- or little-endian, whichever you choose in your format.
When you load a saved number, you do the reverse operation and use double ldexp(double x, int exp); to multiply the reconstructed fraction by the power of 2.
This will work best when FLT_RADIX=2 (virtually all systems, I suppose?) and DBL_MANT_DIG<=64.
Care must be taken to avoid overflows.
Sample code for doubles:
#include <limits.h>
#include <float.h>
#include <math.h>
#include <string.h>
#include <stdio.h>
#if CHAR_BIT != 8
#error currently supported only CHAR_BIT = 8
#endif
#if FLT_RADIX != 2
#error currently supported only FLT_RADIX = 2
#endif
#ifndef M_PI
#define M_PI 3.14159265358979324
#endif
typedef unsigned char uint8;
/*
10-byte little-endian serialized format for double:
- normalized mantissa stored as 64-bit (8-byte) signed integer:
negative range: (-2^53, -2^52]
zero: 0
positive range: [+2^52, +2^53)
- 16-bit (2-byte) signed exponent:
range: [-0x7FFE, +0x7FFE]
Represented value = mantissa * 2^(exponent - 53)
Special cases:
- +infinity: mantissa = 0x7FFFFFFFFFFFFFFF, exp = 0x7FFF
- -infinity: mantissa = 0x8000000000000000, exp = 0x7FFF
- NaN: mantissa = 0x0000000000000000, exp = 0x7FFF
- +/-0: only one zero supported
*/
void Double2Bytes(uint8 buf[10], double x)
{
double m;
long long im; // at least 64 bits
int ie;
int i;
if (isnan(x))
{
// NaN
memcpy(buf, "\x00\x00\x00\x00\x00\x00\x00\x00" "\xFF\x7F", 10);
return;
}
else if (isinf(x))
{
if (signbit(x))
// -inf
memcpy(buf, "\x00\x00\x00\x00\x00\x00\x00\x80" "\xFF\x7F", 10);
else
// +inf
memcpy(buf, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F" "\xFF\x7F", 10);
return;
}
// Split double into normalized mantissa (range: (-1, -0.5], 0, [+0.5, +1))
// and base-2 exponent
m = frexp(x, &ie); // x = m * 2^ie exactly for FLT_RADIX=2
// frexp() can't fail
// Extract most significant 53 bits of mantissa as integer
m = ldexp(m, 53); // can't overflow because
// DBL_MAX_10_EXP >= 37 equivalent to DBL_MAX_2_EXP >= 122
im = trunc(m); // exact unless DBL_MANT_DIG > 53
// If the exponent is too small or too big, reduce the number to 0 or
// +/- infinity
if (ie > 0x7FFE)
{
if (im < 0)
// -inf
memcpy(buf, "\x00\x00\x00\x00\x00\x00\x00\x80" "\xFF\x7F", 10);
else
// +inf
memcpy(buf, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F" "\xFF\x7F", 10);
return;
}
else if (ie < -0x7FFE)
{
// 0
memcpy(buf, "\x00\x00\x00\x00\x00\x00\x00\x00" "\x00\x00", 10);
return;
}
// Store im as signed 64-bit little-endian integer
for (i = 0; i < 8; i++, im >>= 8)
buf[i] = (uint8)im;
// Store ie as signed 16-bit little-endian integer
for (i = 8; i < 10; i++, ie >>= 8)
buf[i] = (uint8)ie;
}
void Bytes2Double(double* x, const uint8 buf[10])
{
unsigned long long uim; // at least 64 bits
long long im; // ditto
unsigned uie;
int ie;
double m;
int i;
int negative = 0;
int maxe;
if (!memcmp(buf, "\x00\x00\x00\x00\x00\x00\x00\x00" "\xFF\x7F", 10))
{
#ifdef NAN
*x = NAN;
#else
*x = 0; // NaN is not supported, use 0 instead (we could return an error)
#endif
return;
}
if (!memcmp(buf, "\x00\x00\x00\x00\x00\x00\x00\x80" "\xFF\x7F", 10))
{
*x = -INFINITY;
return;
}
else if (!memcmp(buf, "\xFF\xFF\xFF\xFF\xFF\xFF\xFF\x7F" "\xFF\x7F", 10))
{
*x = INFINITY;
return;
}
// Load im as signed 64-bit little-endian integer
uim = 0;
for (i = 0; i < 8; i++)
{
uim >>= 8;
uim |= (unsigned long long)buf[i] << (64 - 8);
}
if (uim <= 0x7FFFFFFFFFFFFFFFLL)
im = uim;
else
im = (long long)(uim - 0x7FFFFFFFFFFFFFFFLL - 1) - 0x7FFFFFFFFFFFFFFFLL - 1;
// Obtain the absolute value of the mantissa, make sure it's
// normalized and fits into 53 bits, else the input is invalid
if (im > 0)
{
if (im < (1LL << 52) || im >= (1LL << 53))
{
#ifdef NAN
*x = NAN;
#else
*x = 0; // NaN is not supported, use 0 instead (we could return an error)
#endif
return;
}
}
else if (im < 0)
{
if (im > -(1LL << 52) || im <= -(1LL << 53))
{
#ifdef NAN
*x = NAN;
#else
*x = 0; // NaN is not supported, use 0 instead (we could return an error)
#endif
return;
}
negative = 1;
im = -im;
}
// Load ie as signed 16-bit little-endian integer
uie = 0;
for (i = 8; i < 10; i++)
{
uie >>= 8;
uie |= (unsigned)buf[i] << (16 - 8);
}
if (uie <= 0x7FFF)
ie = uie;
else
ie = (int)(uie - 0x7FFF - 1) - 0x7FFF - 1;
// If DBL_MANT_DIG < 53, truncate the mantissa
im >>= (53 > DBL_MANT_DIG) ? (53 - DBL_MANT_DIG) : 0;
m = im;
m = ldexp(m, (53 > DBL_MANT_DIG) ? -DBL_MANT_DIG : -53); // can't overflow
// because DBL_MAX_10_EXP >= 37 equivalent to DBL_MAX_2_EXP >= 122
// Find out the maximum base-2 exponent and
// if ours is greater, return +/- infinity
frexp(DBL_MAX, &maxe);
if (ie > maxe)
m = INFINITY;
else
m = ldexp(m, ie); // underflow may cause a floating-point exception
*x = negative ? -m : m;
}
int test(double x, const char* name)
{
uint8 buf[10], buf2[10];
double x2;
int error1, error2;
Double2Bytes(buf, x);
Bytes2Double(&x2, buf);
Double2Bytes(buf2, x2);
printf("%+.15E '%s' -> %02X %02X %02X %02X %02X %02X %02X %02X %02X %02X\n",
x,
name,
buf[0],buf[1],buf[2],buf[3],buf[4],buf[5],buf[6],buf[7],buf[8],buf[9]);
if ((error1 = memcmp(&x, &x2, sizeof(x))) != 0)
puts("Bytes2Double(Double2Bytes(x)) != x");
if ((error2 = memcmp(buf, buf2, sizeof(buf))) != 0)
puts("Double2Bytes(Bytes2Double(Double2Bytes(x))) != Double2Bytes(x)");
puts("");
return error1 || error2;
}
int testInf(void)
{
uint8 buf[10];
double x, x2;
int error;
x = DBL_MAX;
Double2Bytes(buf, x);
if (!++buf[8])
++buf[9]; // increment the exponent beyond the maximum
Bytes2Double(&x2, buf);
printf("%02X %02X %02X %02X %02X %02X %02X %02X %02X %02X -> %+.15E\n",
buf[0],buf[1],buf[2],buf[3],buf[4],buf[5],buf[6],buf[7],buf[8],buf[9],
x2);
if ((error = !isinf(x2)) != 0)
puts("Bytes2Double(Double2Bytes(DBL_MAX) * 2) != INF");
puts("");
return error;
}
#define VALUE_AND_NAME(V) { V, #V }
const struct
{
double value;
const char* name;
} testData[] =
{
#ifdef NAN
VALUE_AND_NAME(NAN),
#endif
VALUE_AND_NAME(0.0),
VALUE_AND_NAME(+DBL_MIN),
VALUE_AND_NAME(-DBL_MIN),
VALUE_AND_NAME(+1.0),
VALUE_AND_NAME(-1.0),
VALUE_AND_NAME(+M_PI),
VALUE_AND_NAME(-M_PI),
VALUE_AND_NAME(+DBL_MAX),
VALUE_AND_NAME(-DBL_MAX),
VALUE_AND_NAME(+INFINITY),
VALUE_AND_NAME(-INFINITY),
};
int main(void)
{
unsigned i;
int errors = 0;
for (i = 0; i < sizeof(testData) / sizeof(testData[0]); i++)
errors += test(testData[i].value, testData[i].name);
errors += testInf();
// Test subnormal values. A floating-point exception may be raised.
errors += test(+DBL_MIN / 2, "+DBL_MIN / 2");
errors += test(-DBL_MIN / 2, "-DBL_MIN / 2");
printf("%d error(s)\n", errors);
return 0;
}
Output (ideone):
+NAN 'NAN' -> 00 00 00 00 00 00 00 00 FF 7F
+0.000000000000000E+00 '0.0' -> 00 00 00 00 00 00 00 00 00 00
+2.225073858507201E-308 '+DBL_MIN' -> 00 00 00 00 00 00 10 00 03 FC
-2.225073858507201E-308 '-DBL_MIN' -> 00 00 00 00 00 00 F0 FF 03 FC
+1.000000000000000E+00 '+1.0' -> 00 00 00 00 00 00 10 00 01 00
-1.000000000000000E+00 '-1.0' -> 00 00 00 00 00 00 F0 FF 01 00
+3.141592653589793E+00 '+M_PI' -> 18 2D 44 54 FB 21 19 00 02 00
-3.141592653589793E+00 '-M_PI' -> E8 D2 BB AB 04 DE E6 FF 02 00
+1.797693134862316E+308 '+DBL_MAX' -> FF FF FF FF FF FF 1F 00 00 04
-1.797693134862316E+308 '-DBL_MAX' -> 01 00 00 00 00 00 E0 FF 00 04
+INF '+INFINITY' -> FF FF FF FF FF FF FF 7F FF 7F
-INF '-INFINITY' -> 00 00 00 00 00 00 00 80 FF 7F
FF FF FF FF FF FF 1F 00 01 04 -> +INF
+1.112536929253601E-308 '+DBL_MIN / 2' -> 00 00 00 00 00 00 10 00 02 FC
-1.112536929253601E-308 '-DBL_MIN / 2' -> 00 00 00 00 00 00 F0 FF 02 FC
0 error(s)
Depending on the application it could be a good idea to use a plain text data format (a possibility being XML). If you don't want to waste disk space you can compress it.
XML is probably the most portable way to do it.
However, it appears that you already have most of the parser built, but are stuck on the float/double issue. I would suggest writing it out as a string (to whatever precision you desire) and then reading that back in.
Unless all your target platforms use IEEE-754 floats (and doubles), no byte-swapping tricks will work for you.
If you guarantee that your implementations always treat serialized floating point representations in a specified format, then you will be fine (IEEE 754 is common).
Yes, architectures may order floating point numbers differently (e.g. in big or little endian). Therefore, you will want to somehow specify the endianness. This could be in the format's specification or variable and recorded in the file's data.
The last major pitfall is that alignment for builtins may vary. How your hardware/processor handles malaligned data is implementation defined. So you may need to swap the data/bytes, then move it to the destination float/double.
A library like HDF5 or even NetCDF is probably a bit heavyweight for this as High Performance Mark said, unless you also need the other features available in those libraries.
A lighter-weight alternative that only deals with the serialization would be e.g. XDR (see also wikipedia description). Many OS'es supply XDR routines out of the box, if this is not enough free-standing XDR libraries exist as well.

Resources