How to improve Mersenne Twister vor AVX/SSE?

How to improve Mersenne Twister vor AVX/SSE? - c

Today i have started a project having the goal to optimize the generation of random numbers.
I want to wipe several hard drives, using the Mersenne Twister PRNG, but unfortunately i'm only able to produce around 200MB/s of random data, on 8 hard drives, so 25MB/s each.
Is there a way to optimize this code, using AVX, or SSE (for legacy reasons) to improve this code?
Is it something what can be done without having studied computer science?
Unfortunately i'm a simple C programmer, only having the experience of a few years, but not having studied computer science yet.
Could someone provide some information, how to bring this forward?
Which of these processes could be improved? Can someone give some examples, how an enhanced version of it could look like? Can someone provide some good books to retrieve a better knowledge about this matter?
/*
A C-program for MT19937, with initialization improved 2002/1/26.
Coded by Takuji Nishimura and Makoto Matsumoto.
Before using, initialize the state by using init_genrand(seed)
or init_by_array(init_key, key_length).
Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
All rights reserved.
Copyright (C) 2005, Mutsuo Saito,
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
3. The names of its contributors may not be used to endorse or promote
products derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Any feedback is very welcome.
http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
email: m-mat # math.sci.hiroshima-u.ac.jp (remove space)
*/
#include <stdio.h>
#include "mt19937ar.h"
/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */
static unsigned long mt[N]; /* the array for the state vector */
static int mti=N+1; /* mti==N+1 means mt[N] is not initialized */
/* initializes mt[N] with a seed */
void init_genrand(unsigned long s)
{
mt[0]= s & 0xffffffffUL;
for (mti=1; mti<N; mti++) {
mt[mti] =
(1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
/* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
/* In the previous versions, MSBs of the seed affect */
/* only MSBs of the array mt[]. */
/* 2002/01/09 modified by Makoto Matsumoto */
mt[mti] &= 0xffffffffUL;
/* for >32 bit machines */
}
}
/* initialize by an array with array-length */
/* init_key is the array for initializing keys */
/* key_length is its length */
/* slight change for C++, 2004/2/26 */
void init_by_array(unsigned long init_key[], int key_length)
{
int i, j, k;
init_genrand(19650218UL);
i=1; j=0;
k = (N>key_length ? N : key_length);
for (; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
+ init_key[j] + j; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++; j++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
if (j>=key_length) j=0;
}
for (k=N-1; k; k--) {
mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
- i; /* non linear */
mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
i++;
if (i>=N) { mt[0] = mt[N-1]; i=1; }
}
mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
}
/* generates a random number on [0,0xffffffff]-interval */
unsigned long genrand_int32(void)
{
unsigned long y;
static unsigned long mag01[2]={0x0UL, MATRIX_A};
/* mag01[x] = x * MATRIX_A for x=0,1 */
if (mti >= N) { /* generate N words at one time */
int kk;
if (mti == N+1) /* if init_genrand() has not been called, */
init_genrand(5489UL); /* a default initial seed is used */
for (kk=0;kk<N-M;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
for (;kk<N-1;kk++) {
y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
}
y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];
mti = 0;
}
y = mt[mti++];
/* Tempering */
y ^= (y >> 11);
y ^= (y << 7) & 0x9d2c5680UL;
y ^= (y << 15) & 0xefc60000UL;
y ^= (y >> 18);
return y;
}

I think, Mersenne Twister PRNG is not suitable for your purpose, because of it is not cryptographically secure. If cryptographer can guess or recover piece of MT sequence (for example, from initially 0-filled disk part), he can recover your generator state, and recover entire your sequence, by re-run PRNG from that state.
I suggest you to use modification of RC4 PRNG, but works with array of "uint16_t [1 << 16]", rather than original byte-oriented RC4 with array "uint8_t [1 << 8]". This modification will give you 2 benefits:
For each computation step, you will extract 2 bytes, not one, as in the original. By other words, you will get ~2x performance improvement, this is important for you.
Period of this generator would be extremely long, I estimate it as 2^(2^14).
To preserve attack to your KSA (major RC4 vulnerability), I suggest you to "empty intial run" 2^18 times before usage. By other words, you initially, just read and drop initial 2^18 uint16_t values from PRNG.
Also, you can modify the KSA, to make it more hack-resistant.

Related

Frama-C does not recognize valid memory access from bitwise-ANDed index

I am right-shifting an unsigned integer then &ing it with 0b111, so the resulting value must be in the range [0, 7].
When I use that value as an index into an array of length 8, Frama-C is not able to verify the associated rte: mem_access assertion.
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
/*#
requires \valid_read(bitRuleBuf + (0 .. 7));
assigns \nothing;
ensures \forall uint8_t c; (0 <= c < 8) ==> (
((\result >> c) & 0b1) <==> bitRuleBuf[(state >> c) & 0b111]
);
*/
uint8_t GetNewOctet(
const uint16_t state,
const bool bitRuleBuf[const static 8])
{
uint8_t result = 0;
/*
loop invariant 0 <= b <= 8;
loop invariant \forall uint8_t c; (0 <= c < b) ==> (
((result >> c) & 0b1) <==> bitRuleBuf[(state >> c) & 0b111]
);
loop assigns b, result;
loop variant 8 - b;
*/
for (uint8_t b = 0; b < 8; b += 1) {
result |= ((uint8_t)bitRuleBuf[(state >> b) & 0b111]) << b;
// Still seeing an issue if break apart the steps:
/*
const uint16_t shifted = state >> b;
const uint16_t anded = shifted & 0b111;
// "assert rte: mem_access" is not successful here.
const uint8_t value = bitRuleBuf[anded];
const uint8_t shifted2 = value << b;
result |= shifted2;
*/
}
return result;
}
/*#
assigns \nothing;
*/
int main(void) {
// Empty cells with both neighbors empty become alive.
// All other cells become empty.
const bool bitRuleBuf[] = {
1, // 0b000
0, // 0b001
0, // 0b010
0, // 0b011
0, // 0b100
0, // 0b101
0, // 0b110
0 // 0b111
};
const uint8_t newOctet = GetNewOctet(0b0010000100, bitRuleBuf);
//assert(newOctet == 0b00011000); // Can be uncommented to verify.
//# assert newOctet == 0b00011000;
return 0;
}
The failed assertion happens for line 27: result |= ((uint8_t)bitRuleBuf[(state >> b) & 0b111]) << b;.
Changing the & 0b111 to % 8 does not resolve the issue.
I have tried many variations of the code and ACSL involved, but have not been successful.
I'm guessing integer promotion might be involved in the issue.
How can the code/ACSL be modified so that verification is successful?
$ frama-c --version
24.0 (Chromium)
I am running frama-c and frama-c-gui with arguments -wp and -wp-rte.
For background of the included block of code, the least significant 10 bits of the state argument are the state of 10 cells of 1-dimensional cellular automata. The function returns the next state of the middle 8 cells of those 10.
Edit: Alt-Ergo 2.4.1 is used:
$ why3 config detect
Found prover Alt-Ergo version 2.4.1, OK.
1 prover(s) added
Save config to /home/user/.why3.conf

First, a small note: if you're using WP, it is also important to state which solvers you have configured: each of them has strengths and weaknesses, so that it might be easier to complete a proof with the appropriate solver. In particular, my answer is based on the use of Z3 (4.8.14), known as z3-ce by
why3 config detect (note that you have to run this command each time you change the set of solvers you use).
EDIT As mentioned in comments below, the Mod-Mask tactic is not available in Frama-C 24.0, but only in the development version (https://git.frama-c.com/pub/frama-c). As far as I can tell, for a Frama-C 24.0 solution, you need to resort to a Coq script as mentioned at the end of this answer.
Z3 alone is not sufficient to complete the proof, but you can use a WP tactic (see section 2.2 of the WP manual about the interactive proof editor in Frama-C's GUI). Namely, if you select land(7, to_sint32(x)) in the proof editor, the tactics panel will show you a Mod-Mask tactic, which converts bitmasks to modulo operations and vice-versa (see image below). If you apply it, Z3 will complete the two resulting proof obligations, completing the proof of the assertion.
After that, you can save the script in order to be able to replay it later: use e.g. -wp-prover script,z3-ce,alt-ergo to let WP take advantage of existing scripts in addition to automated solvers. The scripts are searched for (and saved to) the script subdirectory of the WP session directory, which defaults to ./.frama-c/wp and can be set with -wp-session.
Another possibility is to use a Coq script to complete the proof. This supposes you have Coq, CoqIDE and the Why3 Coq libraries installed (they are available as opam packages coq, coqide and why3-coq respectively). Once this is done and you have configured Why3 to use Coq (why3 config detect should tell you that it has found Coq), you can use it through the GUI of Frama-C to complete proofs that the built-in interactive prover can't take care of.
For that, you may need to configure Frama-C to display a Coq column in the WP Goals panel: click on the Provers button in this panel, and make sure that Coq is ON, as shown below:
Once this is done, you can double-click in the cell of this column that corresponds to the proof obligation you want to discharge with Coq. This will open a CoqIDE session where you have to complete the proof script at then end of the opened file. Once this is done, save the file and quit CoqIDE. The Coq script will then be saved as part of the WP session, and can be played again if coq is among the arguments given to -wp-prover. For what it's worth, the following script seems to do the trick (Coq 8.13.2, Why3 1.4.1, and of course Frama-C 24.0):
intros alloc mem_int b buf state0 state1 x a1 hpos hval hbmax hbmax1 htyp hlink huint1 huint2 huint3 hvalid htyp1 htyp2 hdef.
assert (0<=land 7 (to_sint32 x) <= 7)%Z.
apply uint_land_range; auto with zarith.
unfold valid_rd.
unfold valid_rd in hvalid.
unfold a1.
unfold shift; simpl.
unfold shift in hvalid; simpl in hvalid.
assert (to_sint32 (land 7 (to_sint32 x)) = land 7 (to_sint32 x))%Z.
- apply id_sint32; unfold is_sint32; auto with zarith.
- intros _; repeat split; auto with zarith.

Fisher Yates algorithm gives back same order of numbers in parallel started programs when seeded over the system time

I start several C / C++ programs in parallel, which rely on random numbers. Fairly new to this topic, I heard that the seed should be done over the time.
Furthermore, I use the Fisher Yates Algorithm to get a list with unique random shuffled values. However, starting the program twice in parallel gives back the same results for both lists.
How can I fix this? Can I use a different, but still relient seed?
My simple test code for this looks like this:
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
static int rand_int(int n) {
int limit = RAND_MAX - RAND_MAX % n;
int rnd;
do {
rnd = rand();
}
while (rnd >= limit);
return rnd % n;
}
void shuffle(int *array, int n) {
int i, j, tmp;
for (i = n - 1; i > 0; i--) {
j = rand_int(i + 1);
tmp = array[j];
array[j] = array[i];
array[i] = tmp;
}
}
int main(int argc,char* argv[]){
srand(time(NULL));
int x = 100;
int randvals[100];
for(int i =0; i < x;i++)
randvals[i] = i;
shuffle(randvals,x);
for(int i=0;i < x;i++)
printf("%d %d \n",i,randvals[i]);
}
I used the implementation for the fisher yates algorithm from here:
http://www.sanfoundry.com/c-program-implement-fisher-yates-algorithm-array-shuffling/
I started the programs in parallel like this:
./randomprogram >> a.txt & ./randomprogram >> b.txt
and then compared both text files, which had the same content.
The end application is for data augmentation in the deep learning field. The machine runs Ubuntu 16.04 with C++11.

You're getting the same results due to how you're seeding the RNG:
srand(time(NULL));
The time function returns the time in seconds since the epoch. If two instances of the program start during the same second (which is likely if start them in quick succession) then both will use the same seed and get the same set of random values.
You need to add more entropy to your seed. A simple way of doing this is to bitwise-XOR the process ID with the time:
srand(time(NULL) ^ getpid());

As I mentioned in a comment, I like to use a Xorshift* pseudo-random number generator, seeded from /dev/urandom if present, otherwise using POSIX.1 clock_gettime() and getpid() to seed the generator.
It is good enough for most statistical work, but obviously not for any kind of security or cryptographic purposes.
Consider the following xorshift64.h inline implementation:
#ifndef XORSHIFT64_H
#define XORSHIFT64_H
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <time.h>
#ifndef SEED_SOURCE
#define SEED_SOURCE "/dev/urandom"
#endif
typedef struct {
uint64_t state[1];
} prng_state;
/* Mixes state by generating 'rounds' pseudorandom numbers,
but does not store them anywhere. This is often done
to ensure a well-mixed state after seeding the generator.
*/
static inline void prng_skip(prng_state *prng, size_t rounds)
{
uint64_t state = prng->state[0];
while (rounds-->0) {
state ^= state >> 12;
state ^= state << 25;
state ^= state >> 27;
}
prng->state[0] = state;
}
/* Returns an uniform pseudorandom number between 0 and 2**64-1, inclusive.
*/
static inline uint64_t prng_u64(prng_state *prng)
{
uint64_t state = prng->state[0];
state ^= state >> 12;
state ^= state << 25;
state ^= state >> 27;
prng->state[0] = state;
return state * UINT64_C(2685821657736338717);
}
/* Returns an uniform pseudorandom number [0, 1), excluding 1.
This carefully avoids the (2**64-1)/2**64 bias on 0,
but assumes that the double type has at most 63 bits of
precision in the mantissa.
*/
static inline double prng_one(prng_state *prng)
{
uint64_t u;
double d;
do {
do {
u = prng_u64(prng);
} while (!u);
d = (double)(u - 1u) / 18446744073709551616.0;
} while (d == 1.0);
return d;
}
/* Returns an uniform pseudorandom number (-1, 1), excluding -1 and +1.
This carefully avoids the (2**64-1)/2**64 bias on 0,
but assumes that the double type has at most 63 bits of
precision in the mantissa.
*/
static inline double prng_delta(prng_state *prng)
{
uint64_t u;
double d;
do {
do {
u = prng_u64(prng);
} while (!u);
d = ((double)(u - 1u) - 9223372036854775808.0) / 9223372036854775808.0;
} while (d == -1.0 || d == 1.0);
return d;
}
/* Returns an uniform pseudorandom integer between min and max, inclusive.
Uses the exclusion method to ensure uniform distribution.
*/
static inline uint64_t prng_range(prng_state *prng, const uint64_t min, const uint64_t max)
{
if (min != max) {
const uint64_t basis = (min < max) ? min : max;
const uint64_t range = (min < max) ? max-min : min-max;
uint64_t mask = range;
uint64_t u;
/* In range, all bits up to the higest bit set in range, must be set. */
mask |= mask >> 1;
mask |= mask >> 2;
mask |= mask >> 4;
mask |= mask >> 8;
mask |= mask >> 16;
mask |= mask >> 32;
/* In all cases, range <= mask < 2*range, so at worst case,
(mask = 2*range-1), this excludes at most 50% of generated values,
on average. */
do {
u = prng_u64(prng) & mask;
} while (u > range);
return u + basis;
} else
return min;
}
static inline void prng_seed(prng_state *prng)
{
#if _POSIX_TIMERS-0 > 0
struct timespec now;
#endif
FILE *src;
/* Try /dev/urandom. */
src = fopen(SEED_SOURCE, "r");
if (src) {
int tries = 16;
while (tries-->0) {
if (fread(prng->state, sizeof prng->state, 1, src) != 1)
break;
if (prng->state[0]) {
fclose(src);
return;
}
}
fclose(src);
}
#if _POSIX_TIMERS-0 > 0
#if _POSIX_MONOTONIC_CLOCK-0 > 0
if (clock_gettime(CLOCK_MONOTONIC, &now) == 0) {
prng->state[0] = (uint64_t)((uint64_t)now.tv_sec * UINT64_C(60834327289))
^ (uint64_t)((uint64_t)now.tv_nsec * UINT64_C(34958268769))
^ (uint64_t)((uint64_t)getpid() * UINT64_C(2772668794075091))
^ (uint64_t)((uint64_t)getppid() * UINT64_C(19455108437));
if (prng->state[0])
return;
} else
#endif
if (clock_gettime(CLOCK_REALTIME, &now) == 0) {
prng->state[0] = (uint64_t)((uint64_t)now.tv_sec * UINT64_C(60834327289))
^ (uint64_t)((uint64_t)now.tv_nsec * UINT64_C(34958268769))
^ (uint64_t)((uint64_t)getpid() * UINT64_C(2772668794075091))
^ (uint64_t)((uint64_t)getppid() * UINT64_C(19455108437));
if (prng->state[0])
return;
}
#endif
prng->state[0] = (uint64_t)((uint64_t)time(NULL) * UINT64_C(60834327289))
^ (uint64_t)((uint64_t)clock() * UINT64_C(34958268769))
^ (uint64_t)((uint64_t)getpid() * UINT64_C(2772668794075091))
^ (uint64_t)((uint64_t)getppid() * UINT64_C(19455108437));
if (!prng->state[0])
prng->state[0] = (uint64_t)UINT64_C(16233055073);
}
#endif /* XORSHIFT64_H */
If it can seed the state from SEED_SOURCE, it is used as-is. Otherwise, if POSIX.1 clock_gettime() is available, it is used (CLOCK_MONOTONIC, if possible; otherwise CLOCK_REALTIME). Otherwise, time (time(NULL)), CPU time spent thus far (clock()), process ID (getpid()), and parent process ID (getppid()) are used to seed the state.
If you wanted the above to also run on Windows, you'd need to add a few #ifndef _WIN32 guards, and either omit the process ID parts, or replace them with something else. (I don't use Windows myself, and cannot test such code, so I omitted such from above.)
The idea is that you can include the above file, and implement other pseudo-random number generators in the same format, and choose between them by simply including different files. (You can include multiple files, but you'll need to do some ugly #define prng_state prng_somename_state, #include "somename.h", #undef prng_state hacking to ensure unique names for each.)
Here is an example of how to use the above:
#include <stdlib.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include "xorshift64.h"
int main(void)
{
prng_state prng1, prng2;
prng_seed(&prng1);
prng_seed(&prng2);
printf("Seed 1 = 0x%016" PRIx64 "\n", prng1.state[0]);
printf("Seed 2 = 0x%016" PRIx64 "\n", prng2.state[0]);
printf("After skipping 16 rounds:\n");
prng_skip(&prng1, 16);
prng_skip(&prng2, 16);
printf("Seed 1 = 0x%016" PRIx64 "\n", prng1.state[0]);
printf("Seed 2 = 0x%016" PRIx64 "\n", prng2.state[0]);
return EXIT_SUCCESS;
}
Obviously, initializing two PRNGs like this is problematic in the fallback case, because it basically relies on clock() yielding different values for consecutive calls (so expects each call to take at least 1 millisecond of CPU time).
However, even a small change in the seeds thus generated is sufficient to yield very different sequences. I like to generate and discard (skip) a number of initial values to ensure the generator state is well mixed:
Seed 1 = 0x8a62585b6e71f915
Seed 2 = 0x8a6259a84464e15f
After skipping 16 rounds:
Seed 1 = 0x9895f664c83ad25e
Seed 2 = 0xa3fd7359dd150e83
The header also implements 0 <= prng_u64() < 2**64, 0 <= prng_one() < 1, -1 < prng_delta() < +1, and min <= prng_range(,min,max) <= max, which should be uniform.
I use the above Xorshift64* variant for tasks where a lot of quite uniform pseudorandom numbers are needed, so the functions also tend to use the faster methods (like max. 50% average exclusion rate rather than 64-bit modulus operation, and so on) (of those that I know of).
Additionally, if you require repeatability, you can simply save a randomly-seeded prng_state structure (a single uint64_t), and load it later, to reproduce the exact same sequence. Just remember to only do the skipping (generate-and-discard) only after randomly seeding, not after loading a new seed from a file.

Converting rather copious comments into an answer.
If two programs are started in the same second, they'll both have the same sequence of random numbers.
Consider whether you need to use a better random number generator than the rand()/srand() duo — that is usually only barely random (better than nothing, but not by a large margin). Do NOT use them for cryptography.
I asked about platform; you responded Ubuntu 16.04 LTS.
Use /dev/urandom or /dev/random to get some random bytes for the seed.
On many Unix-like platforms, there's a device /dev/random — on Linux, there's also a slightly lower-quality device /dev/urandom which won't block whereas /dev/random might. Systems such as macOS (BSD) have /dev/urandom as a synonym for /dev/random for Linux compatibility. You can open it and read 4 bytes (or the relevant number of bytes) of random data, and use that as a seed for the PRNG of your choice.
I often use the drand48() set of functions because they are in POSIX and were in System V Unix. They're usually adequate for my needs.
Look at the manuals across platforms; there are often other random number generators. C++11 provides high-quality PRNG — the header <random> has a number of different ones, such as the MT 19937 (Mersenne Twister). MacOS Sierra (BSD) has random(3) and arc4random(3) as alternatives to rand() – as well as drand48() et al.
Another possibility on Linux is simply to keep a connection to /dev/urandom open, reading more bytes when you need them. However, that gives up any chance of replaying a random sequence. The PRNG systems have the merit of allowing you to replay the same sequence again by recording and setting the random seed that you use. By default, grab a seed from /dev/urandom, but if the user requests it, take a seed from the command line, and report the seed used (at least on request).

Optimising radix-2 FFT C code

I'm a beginner in C programming.
I am current trying to work on a project requiring 1024-point FFT implementation using radix-2, Decimation-in-frequency.
I attach the FFT function C code below.
How can i increase the performance by modifying the C code.
#include "i_cmplx.h" /* definition of the complex type */
#include "twiddle1024.h" /* quantised and scaled Twiddle factors */
#define LL 1024 /* Maximum length of FFT */
/* fft radix-2 funtion using Decimation In Frequency */
//#pragma CODE_SECTION(fft, "mycode"); // Makes the program run from internal memory
void fft(COMPLEX *Y, int N) /* FFT(input sample array, # of points) */
{
int temp1R, temp1I, temp2R,temp2I; /* 32 bits temporary storage for */
/* intermediate results */
short tempR, tempI, c, s; /* 16 bits temporary storages */
/* variables */
int TwFStep, /* Step between twiddle factors */
TwFIndex, /* Index of twiddle factors */
BLStep, /* Step for incrementing butterfly index */
BLdiff, /* Difference between upper and lower butterfly legs */
upperIdx,
lowerIdx, /* upper and lower indexes of buterfly leg */
i, j, k; /* loop control variables */
BLdiff=N;
TwFStep=1;
for(k=N;k>1;k=(k>>1)) /* Do Log(base 2)(N) Stages */
{
BLStep=BLdiff;
BLdiff=BLdiff>>1;
TwFIndex=0;
for(j=0;j<BLdiff;j++)/* Nbr of twiddle factors to use=BLDiff */
{
c=w[TwFIndex].real;
s=w[TwFIndex].imag;
TwFIndex=TwFIndex+TwFStep;
/* Now do N/BLStep butterflies */
for(upperIdx=j;upperIdx<N;upperIdx+=BLStep)
{
/* Calculations inside this loop avoid overflow by shifting left once
the result of every adittion/substration and by shifting left 15
places the result of every multiplication. Double precision temporary
results (32-bit) are used in order to avoid losing information because
of overflow. Final DFT result is scaled by N (number of points), i.e.,
2^(Nbr of stages) =2^(log(base 2) N) = N */
lowerIdx=upperIdx+BLdiff;
temp1R = (Y[upperIdx].real - Y[lowerIdx].real)>>1;
temp2R = (Y[upperIdx].real + Y[lowerIdx].real)>>1;
Y[upperIdx].real = (short) temp2R;
temp1I = (Y[upperIdx].imag - Y[lowerIdx].imag)>>1;
temp2I = (Y[upperIdx].imag + Y[lowerIdx].imag)>>1;
Y[upperIdx].imag = (short) temp2I;
temp2R = (c*temp1R - s*temp1I)>>15;
Y[lowerIdx].real = (short) temp2R;
temp2I = (c*temp1I + s*temp1R)>>15;
Y[lowerIdx].imag = (short) temp2I;
}
}
TwFStep = TwFStep<<1; /* update separation of twiddle factors)*/
}
/* bit reversal for resequencing data */
j=0;
for (i=1;i<(N-1);i++)
{
k=N/2;
while (k<=j)
{
j = j-k;
k=k/2;
}
j=j+k;
if (i<j)
{
tempR=Y[j].real;
tempI=Y[j].imag;
Y[j].real=Y[i].real;
Y[j].imag=Y[i].imag;
Y[i].real=tempR;
Y[i].imag=tempI;
}
}
return;
}

Distributed shared memory [duplicate]

This question already has answers here:
Distributed shared memory library for C++? [closed]
(3 answers)
Closed 8 years ago.
My current system runs on Linux, with the different tasks using shared memory to access the common data (which is defined as a C struct). The size of the shared data is about 100K.
Now, I want to run the main user interface on Windows, while keeping all the other tasks in Linux, and I’m looking for the best replacement for the shared memory. The refresh rate of the UI is about 10 times per second.
My doubt is if it’s better to do it by myself or to use a third party solution. If I do it on my own, my idea is to open a socket and then use some sort of data serialization between the client (Windows) and the server (Linux).
In the case of third party solutions, I’m a bit overwhelmed by the number of options. After some search, it seems to me that the right solution could be MPI, but I would like to consider other options before start working with it: XDR, OpenMP, JSON, DBus, RDMA, memcached, boost libraries … Has anyone got any experience with any of them? Which are the pros and cons of using such solutions for accessing the shared memory on Linux from Windows?
Maybe MPI or the other third party solutions are too elaborated for such a simple use as mine and I should use a do-it-yourself approach? Any advice if I take that solution? Am I missing something? Am I looking in the wrong direction? I wouldn’t like to reinvent the wheel.
For the file sharing, I’m considering Samba.
My development is done with C and C++, and, of course, the solution needs to be compiled in Linux and Windows.

AFAIK (but I don't know Windows) you cannot share memory (à la shm_overview(7)) on both Linux and Windows at once (you could share data, using some network protocol, this is what memcached does).
However, you can make a Linux process answering to network messages from a Windows machine (but that is not shared memory!).
Did you consider using a Web interface? You could add a web interface -with ajax techniques- on Linux (to your Linux software), e.g. using FastCGI or an HTTP server library like libonion or wt. Then your Windows users could use their browser to interact with your program (which would still run on some Linux compute server).
PS. Read the wikipage on distributed shared memory. It means something different than what you are asking! Read also about message passing!

I'd recommend using a plain TCP socket between the user interface and the service, with simple tagged message frames passed between the two in preferably architecture-independent manner (i.e. with specific byte order and integer and floating-point types).
On the server end, I'd use a helper process or thread, that maintains two local copies of the shared state struct per client. First one reflects the state the client knows about, and the second one is used to snapshot the shared state at regular intervals. When the two differ, the helper sends the differing fields to the client. Conversely, the client can send modification requests to the helper.
I would not send the 100k shared data structure as a single chunk; it'd be very inefficient. Also, assuming the state contains multiple fields, having the client send the entire new 100k state would overwrite fields not meant to by the client.
I'd use one message for each field in the state, or atomically manipulated set of fields. The message structure itself should be very simple. At minimum, each message should start with a fixed-size length and type fields, so that it is trivial to receive the messages. It is always a good idea to leave the possibility of later extending the capabilities/fields, without breaking backwards compatibility, too.
You don't need a lot of code to implement the above. You do need some accessor/manipulator code for each different type of field in the state (char, short, int, long, double, float, and any other type or struct you might use), and that (and the helper that compares the active state with the simulated user state) will probably be the bulk of the code.
Although I don't normally recommend reinventing the wheel, in this case I do recommend writing your own message passing code, directly on top of TCP, simply because all the libraries I know of that could be used for this, are rather unwieldy or would impose some design choices I'd rather leave to the application.
If you want, I can provide some example code that would hopefully illustrate this better. I don't have Windows, but Qt does have a QTcpSocket class you can use that works the same in both Linux and Windows, so there shouldn't be (m)any differences. In fact, if you design the message structure well, you could write the UI in some other language like Python, without any differences to the server itself.
Promised examples follow; apologies for maximum-length post.
Paired counters are useful when there is a single writer for a specific field, possibly many readers, and you wish to add minimal overhead to the write path. Reading never blocks, and a reader will see whether they got a valid copy or if an update was in progress and they should not rely on the value they saw. (This also allows reliable write semantics for a field in an async-signal-safe context, where pthread locking is not guaranteed to work.)
This implementation works best on GCC-4.7 and later, but should work with earlier C compilers, too. Legacy __sync built-ins also work with Intel, Pathscale, and PGI C compilers. counter.h:
#ifndef COUNTER_H
#define COUNTER_H
/*
* Atomic generation counter support.
*
* A generation counter allows a single writer thread to
* locklessly modify a value non-atomically, while letting
* any number of reader threads detect the change. Reader
* threads can also check if the value they observed is
* "atomic", or taken during a modification in progress.
*
* There is no protection against multiple concurrent writers.
*
* If the writer gets canceled or dies during an update,
* the counter will get garbled. Reinitialize before relying
* on such a counter.
*
* There is no guarantee that a reader will observe an
* "atomic" value, if the writer thread modifies the value
* more often than the reader thread can read it.
*
* Two typical use cases:
*
* A) A single writer requires minimum overhead/latencies,
* whereas readers can poll and retry if necessary
*
* B) Non-atomic value or structure modified in
* an interrupt context (async-signal-safe)
*
* Initialization:
*
* Counter counter = COUNTER_INITIALIZER;
*
* or
*
* Counter counter;
* counter_init(&counter);
*
* Write sequence:
*
* counter_acquire(&counter);
* (modify or write value)
* counter_release(&counter);
*
* Read sequence:
*
* unsigned int check;
* check = counter_before(&counter);
* (obtain a copy of the value)
* if (check == counter_after(&counter))
* (a copy of the value is atomic)
*
* Read sequence with spin-waiting,
* will spin forever if counter garbled:
*
* unsigned int check;
* do {
* check = counter_before(&counter);
* (obtain a copy of the value)
* } while (check != counter_after(&counter));
*
* All these are async-signal-safe, and will never block
* (except for the spinning loop just above, obviously).
*/
typedef struct {
volatile unsigned int value[2];
} Counter;
#define COUNTER_INITIALIZER {{0U,0U}}
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
/*
* GCC 4.7 and later provide __atomic built-ins.
* These are very efficient on x86 and x86-64.
*/
static inline void counter_init(Counter *const counter)
{
/* This is silly; assignments should suffice. */
do {
__atomic_store_n(&(counter->value[1]), 0U, __ATOMIC_SEQ_CST);
__atomic_store_n(&(counter->value[0]), 0U, __ATOMIC_SEQ_CST);
} while (__atomic_load_n(&(counter->value[0]), __ATOMIC_SEQ_CST) |
__atomic_load_n(&(counter->value[1]), __ATOMIC_SEQ_CST));
}
static inline unsigned int counter_before(Counter *const counter)
{
return __atomic_load_n(&(counter->value[1]), __ATOMIC_SEQ_CST);
}
static inline unsigned int counter_after(Counter *const counter)
{
return __atomic_load_n(&(counter->value[0]), __ATOMIC_SEQ_CST);
}
static inline unsigned int counter_acquire(Counter *const counter)
{
return __atomic_fetch_add(&(counter->value[0]), 1U, __ATOMIC_SEQ_CST);
}
static inline unsigned int counter_release(Counter *const counter)
{
return __atomic_fetch_add(&(counter->value[1]), 1U, __ATOMIC_SEQ_CST);
}
#else
/*
* Rely on legacy __sync built-ins.
*
* Because there is no __sync_fetch(),
* counter_before() and counter_after() are safe,
* but not optimal (especially on x86 and x86-64).
*/
static inline void counter_init(Counter *const counter)
{
/* This is silly; assignments should suffice. */
do {
counter->value[0] = 0U;
counter->value[1] = 0U;
__sync_synchronize();
} while (__sync_fetch_and_add(&(counter->value[0]), 0U) |
__sync_fetch_and_add(&(counter->value[1]), 0U));
}
static inline unsigned int counter_before(Counter *const counter)
{
return __sync_fetch_and_add(&(counter->value[1]), 0U);
}
static inline unsigned int counter_after(Counter *const counter)
{
return __sync_fetch_and_add(&(counter->value[0]), 0U);
}
static inline unsigned int counter_acquire(Counter *const counter)
{
return __sync_fetch_and_add(&(counter->value[0]), 1U);
}
static inline unsigned int counter_release(Counter *const counter)
{
return __sync_fetch_and_add(&(counter->value[1]), 1U);
}
#endif
#endif /* COUNTER_H */
Each shared state field type needs their own structure defined. For our purposes, I'll just use a Value structure to describe one of them. It is up to you to define all the needed Field and Value types to match your needs. For example, a 3D vector of double precision floating-point components would be
typedef struct {
double x;
double y;
double z;
} Value;
typedef struct {
Counter counter;
Value value;
} Field;
where the modifying thread uses e.g.
void set_field(Field *const field, Value *const value)
{
counter_acquire(&field->counter);
field->value = value;
counter_release(&field->counter);
}
and each reader maintains their own local shapshot using e.g.
typedef enum {
UPDATED = 0,
UNCHANGED = 1,
BUSY = 2
} Updated;
Updated check_field(Field *const local, const Field *const shared)
{
Field cache;
cache.counter[0] = counter_before(&shared->counter);
/* Local counter check allows forcing an update by
* simply changing one half of the local counter.
* If you don't need that, omit the local-only part. */
if (local->counter[0] == local->counter[1] &&
cache.counter[0] == local->counter[0])
return UNCHANGED;
cache.value = shared->value;
cache.counter[1] = counter_after(&shared->counter);
if (cache.counter[0] != cache.counter[1])
return BUSY;
*local = cache;
return UPDATED;
}
Note that the cache above constitutes one local copy of the value, so each reader only needs to maintain one copy of the fields it is interested in. Also, the above accessor function only modifies local with a successful snapshot.
When using pthread locking, pthread_rwlock_t is a good option, as long as the programmer understands the priority issues with multiple readers and writers. (see man pthread_rwlock_rdlock and man pthread_rwlock_wrlock for details.)
Personally, I would just stick to mutexes with as short holding times as possible, to reduce overall complexity. The Field structure for a mutex-protected change-counted value I'd use would probably be something like
typedef struct {
pthread_mutex_t mutex;
Value value;
volatile unsigned int change;
} Field;
void set_field(Field *const field, const Value *const value)
{
pthread_mutex_lock(&field->mutex);
field->value = *value;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
__atomic_fetch_add(&field->change, 1U, __ATOMIC_SEQ_CST);
#else
__sync_fetch_and_add(&field->change, 1U);
#endif
pthread_mutex_unlock(&field->mutex);
}
void get_field(Field *const local, Field *const field)
{
pthread_mutex_lock(&field->mutex);
local->value = field->value;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
local->value = __atomic_fetch_n(&field->change, __ATOMIC_SEQ_CST);
#else
local->value = __sync_fetch_and_add(&field->change, 0U);
#endif
pthread_mutex_unlock(&field->mutex);
}
Updated check_field(Field *const local, Field *const shared)
{
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
if (local->change == __atomic_fetch_n(&field->change, __ATOMIC_SEQ_CST))
return UNCHANGED;
#else
if (local->change == __sync_fetch_and_add(&field->change, 0U))
return UNCHANGED;
#endif
pthread_mutex_lock(&shared->mutex);
local->value = shared->value;
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
local->change = __atomic_fetch_n(&field->change, __ATOMIC_SEQ_CST);
#else
local->change = __sync_fetch_and_add(&field->change, 0U);
#endif
pthread_mutex_unlock(&shared->mutex);
}
Note that the mutex field in the local copy is unused, and can be omitted from the local copy (defining a suitable Cache etc. structure type without it).
The semantics with respect to the change field are simple: It is always accessed atomically, and can therefore be read without holding the mutex. It is only modified while holding the mutex (but because readers do not hold the mutex, it must be modified atomically even then).
Note that if your server is restricted to x86 or x86-64 architectures, you can use ordinary accesses to change above (e.g. field->change++), as unsigned int access is atomic on these architectures; no need to use built-in functions. (Although increment is non-atomic, readers will only ever see the old or the new unsigned int, so that's not a problem either.)
On the server, you'll need some kind of "agent" process or thread, acting on behalf of the remote clients, sending field changes to each client, and optionally receiving field contents for update from each client.
Efficient implementation of such an agent is a big question. A properly thought-out recommendation would require more detailed information on the system. The simplest possible implementation, I guess, would be a low-priority thread simply scanning the shared state continuously, reporting any field changes not known to each client as it sees them. If change rate is high -- say, usually more than a dozen changes per second over the entire state -- then that may be quite acceptable. If change rate varies, adding a global change counter (modified whenever a single field is modified, i.e. in set_field()) and sleeping (or at least yielding) when no global change is observed, is better.
If there are multiple remote clients, I would use a single agent, which collects changes into a queue, with a cached copy of the state as seen at the head of the queue (with previous changes being those that lead up to it). Initially connected clients get the entire state, then each queued change following that. Note that the queue entries do not need to represent fields as such, but the messages sent to each client to update that field, and that if a latter update replaces an already queued value, the already queued value can be removed, reducing the amount of data that needs to be sent to clients that are catching up. This is an interesting topic on its own, and would warrant a question of its own, in my opinion.
Like I mentioned above, I would personally use a plain TCP socket between the user interface and the service.
There are four main issues with respect to the TCP messages:
Byte order and data formats (unless both server and remote clients are guaranteed to use the same hardware architecture)
My preferred solution is to use native byte order (but well-defined standard types) when sending, with recipient doing any byte order conversions. This requires that each end sends an initial message with predetermined "prototype values" for each integer and floating-point type used.
I assume integers use two's complement format with no padding bits, and double and float are double- and single-precision IEEE-754 types, respectively. Most current hardware architectures have these natively (although byte ordering does vary). Weird architectures can and do emulate these in software, typically by supplying suitable command-line switches or using some libraries.
Message framing (as TCP is a stream of bytes, not a stream of packets, from the application perspective)
Most robust option is to prefix each message with a fixed size type field and a fixed size length field. (Usually, the length field indicates how many additional bytes are part of this message, and will include any possible padding bytes the sender might have added.)
The recipient will receive TCP input until it has enough cached for the fixed type and length parts to examine the complete packet length. Then it receives more data, until it has a full TCP packet. A minimal receive buffer implementation is something like
Extensibility
Backwards compatibility is essential for making updates and upgrades easier. This means you'll probably need to include some kind of version number in the initial message from server to client, and have both server and client ignore messages they do not know. (This obviously requires a length field in each message, as recipients cannot infer the length of messages they do not recognize.)
In some cases you might need the ability to mark a message important, in the sense that if the recipient does not recognize it, further communications should be aborted. This is easiest to achieve by using some kind of identifier or bit in the type field. For example, PNG file format is very similar to the message format I've described here, with a four-byte (four ASCII character) type field, and a four-byte (32-bit) length field for each message ("chunk"). If the first type character is uppercase ASCII, it means it is a critical one.
Passing initial state
Passing the initial state as a single chunk fixes the entire shared state structure. Instead, I recommend passing each field or record separately, just as if they were updates. Optionally, when all the initial fields have been sent, you can send a message that informs the client they have the entire state; this should let the client first receive the complete state, then construct and render the user interface, without having to dynamically adjust to varying number of fields.
Whether each client requires the full state, or just a smaller subset of the state, is another question. When a remote client connects to a server, it probably should include some kind of identifier the server can use to make that determination.
Here is messages.h, a header file with inline implementation of integer and floating-point type handling and byte order detection:
#ifndef MESSAGES_H
#define MESSAGES_H
#include <stdint.h>
#include <string.h>
#include <errno.h>
#if defined(__GNUC__)
static const struct __attribute__((packed)) {
#else
#pragma pack(push,1)
#endif
const uint64_t u64; /* Offset +0 */
const int64_t i64; /* +8 */
const double dbl; /* +16 */
const uint32_t u32; /* +24 */
const int32_t i32; /* +28 */
const float flt; /* +32 */
const uint16_t u16; /* +36 */
const int16_t i16; /* +38 */
} prototypes = {
18364758544493064720ULL, /* u64 */
-1311768465156298103LL, /* i64 */
0.71948481353325643983254167324048466980457305908203125,
3735928559UL, /* u32 */
-195951326L, /* i32 */
1.06622731685638427734375f, /* flt */
51966U, /* u16 */
-7658 /* i16 */
};
#if !defined(__GNUC__)
#pragma pack(pop)
#endif
/* Add prototype section to a buffer.
*/
size_t add_prototypes(unsigned char *const data, const size_t size)
{
if (size < sizeof prototypes) {
errno = ENOSPC;
return 0;
} else {
memcpy(data, &prototypes, sizeof prototypes);
return sizeof prototypes;
}
}
/*
* Byte order manipulation functions.
*/
static void reorder64(void *const dst, const void *const src, const int order)
{
if (order) {
uint64_t value;
memcpy(&value, src, 8);
if (order & 1)
value = ((value >> 8U) & 0x00FF00FF00FF00FFULL)
| ((value & 0x00FF00FF00FF00FFULL) << 8U);
if (order & 2)
value = ((value >> 16U) & 0x0000FFFF0000FFFFULL)
| ((value & 0x0000FFFF0000FFFFULL) << 16U);
if (order & 4)
value = ((value >> 32U) & 0x00000000FFFFFFFFULL)
| ((value & 0x00000000FFFFFFFFULL) << 32U);
memcpy(dst, &value, 8);
} else
if (dst != src)
memmove(dst, src, 8);
}
static void reorder32(void *const dst, const void *const src, const int order)
{
if (order) {
uint32_t value;
memcpy(&value, src, 4);
if (order & 1)
value = ((value >> 8U) & 0x00FF00FFUL)
| ((value & 0x00FF00FFUL) << 8U);
if (order & 2)
value = ((value >> 16U) & 0x0000FFFFUL)
| ((value & 0x0000FFFFUL) << 16U);
memcpy(dst, &value, 4);
} else
if (dst != src)
memmove(dst, src, 4);
}
static void reorder16(void *const dst, const void *const src, const int order)
{
if (order & 1) {
const unsigned char c[2] = { ((const unsigned char *)src)[0],
((const unsigned char *)src)[1] };
((unsigned char *)dst)[0] = c[1];
((unsigned char *)dst)[1] = c[0];
} else
if (dst != src)
memmove(dst, src, 2);
}
/* Detect byte order conversions needed.
*
* If the prototypes cannot be supported, returns -1.
*
* If prototype record uses native types, returns 0.
*
* Otherwise, bits 0..2 are the integer order conversion,
* and bits 3..5 are the floating-point order conversion.
* If 'order' is the return value, use
* reorderXX(local, remote, order)
* for integers, and
* reorderXX(local, remote, order/8)
* for floating-point types.
*
* For adjusting records sent to server, just do the same,
* but with order obtained by calling this function with
* parameters swapped.
*/
int detect_order(const void *const native, const void *const other)
{
const unsigned char *const source = other;
const unsigned char *const target = native;
unsigned char temp[8];
int iorder = 0;
int forder = 0;
/* Verify the size of the types.
* C89/C99/C11 says sizeof (char) == 1, but check that too. */
if (sizeof (double) != 8 ||
sizeof (int64_t) != 8 ||
sizeof (uint64_t) != 8 ||
sizeof (float) != 4 ||
sizeof (int32_t) != 4 ||
sizeof (uint32_t) != 4 ||
sizeof (int16_t) != 2 ||
sizeof (uint16_t) != 2 ||
sizeof (unsigned char) != 1 ||
sizeof prototypes != 40)
return -1;
/* Find byte order for the largest floating-point type. */
while (1) {
reorder64(temp, source + 16, forder);
if (!memcmp(temp, target + 16, 8))
break;
if (++forder >= 8)
return -1;
}
/* Verify forder works for all other floating-point types. */
reorder32(temp, source + 32, forder);
if (memcmp(temp, target + 32, 4))
return -1;
/* Find byte order for the largest integer type. */
while (1) {
reorder64(temp, source + 0, iorder);
if (!memcmp(temp, target + 0, 8))
break;
if (++iorder >= 8)
return -1;
}
/* Verify iorder works for all other integer types. */
reorder64(temp, source + 8, iorder);
if (memcmp(temp, target + 8, 8))
return -1;
reorder32(temp, source + 24, iorder);
if (memcmp(temp, target + 24, 4))
return -1;
reorder32(temp, source + 28, iorder);
if (memcmp(temp, target + 28, 4))
return -1;
reorder16(temp, source + 36, iorder);
if (memcmp(temp, target + 36, 2))
return -1;
reorder16(temp, source + 38, iorder);
if (memcmp(temp, target + 38, 2))
return -1;
/* Everything works. */
return iorder + 8 * forder;
}
/* Verify current architecture is supportable.
* This could be a compile-time check.
*
* (The buffer contains prototypes for network byte order,
* and actually returns the order needed to convert from
* native to network byte order.)
*
* Returns -1 if architecture is not supported,
* a nonnegative (0 or positive) value if successful.
*/
static int verify_architecture(void)
{
static const unsigned char network_endian[40] = {
254U, 220U, 186U, 152U, 118U, 84U, 50U, 16U, /* u64 */
237U, 203U, 169U, 135U, 238U, 204U, 170U, 137U, /* i64 */
63U, 231U, 6U, 5U, 4U, 3U, 2U, 1U, /* dbl */
222U, 173U, 190U, 239U, /* u32 */
244U, 82U, 5U, 34U, /* i32 */
63U, 136U, 122U, 35U, /* flt */
202U, 254U, /* u16 */
226U, 22U, /* i16 */
};
return detect_order(&prototypes, network_endian);
}
#endif /* MESSAGES_H */
Here is an example function that a sender can use to pack a 3-component vector identified by a 32-bit unsigned integer, into a 36-byte message. This uses a message frame starting with four bytes ("Vec3"), followed by the 32-bit frame length, followed by the 32-bit identifier, followed by three doubles:
size_t add_vector(unsigned char *const data, const size_t size,
const uint32_t id,
const double x, const double y, const double z)
{
const uint32_t length = 4 + 4 + 4 + 8 + 8 + 8;
if (size < (size_t)bytes) {
errno = ENOSPC;
return 0;
}
/* Message identifier, four bytes */
buffer[0] = 'V';
buffer[1] = 'e';
buffer[2] = 'c';
buffer[3] = '3';
/* Length field, uint32_t */
memcpy(buffer + 4, &length, 4);
/* Identifier, uint32_t */
memcpy(buffer + 8, &id, 4);
/* Vector components */
memcpy(buffer + 12, &x, 8);
memcpy(buffer + 20, &y, 8);
memcpy(buffer + 28, &z, 8);
return length;
}
Personally, I prefer shorter, 16-bit identifiers and 16-bit length; that also limits maximum message length to 65536 bytes, making 65536 bytes a good read/write buffer size.
The recipient can handle the received TCP data stream for example thus:
static unsigned char *input_data; /* Data buffer */
static size_t input_size; /* Data buffer size */
static unsigned char *input_head; /* Next byte in buffer */
static unsigned char *input_tail; /* After last buffered byte */
static int input_order; /* From detect_order() */
/* Function to handle "Vec3" messages: */
static void handle_vec3(unsigned char *const msg)
{
uint32_t id;
double x, y, z;
reorder32(&id, msg+8, input_order);
reorder64(&x, msg+12, input_order/8);
reorder64(&y, msg+20, input_order/8);
reorder64(&z, msg+28, input_order/8);
/* Do something with vector id, x, y, z. */
}
/* Function that tries to consume thus far buffered
* input data -- typically run once after each
* successful TCP receive. */
static void consume(void)
{
while (input_head + 8 < input_tail) {
uint32_t length;
/* Get current packet length. */
reorder32(&length, input_head + 4, input_order);
if (input_head + length < input_tail)
break; /* Not all read, yet. */
/* We have a full packet! */
/* Handle "Vec3" packet: */
if (input_head[0] == 'V' &&
input_head[1] == 'e' &&
input_head[2] == 'c' &&
input_head[3] == '3')
handle_vec3(input_head);
/* Advance to next packet. */
input_head += length;
}
if (input_head < input_tail) {
/* Move partial message to start of buffer. */
if (input_head > input_data) {
const size_t have = input_head - input_data;
memmove(input_data, input_head, have);
input_head = input_data;
input_tail = input_data + have;
}
} else {
/* Buffer is empty. */
input_head = input_data;
input_tail = input_data;
}
}
Questions?

CRC-32 on MicroController (Atmel)

I am currently trying to implement a CRC-32 for an incoming datastream (Serial communication) on an ATMEGA1280 and I am a little lost how to do this on the embedded side in C.... If anyone could point me in the proper direction and/or help in anyway i would greatly appreciate it...

There are plenty of CRC-32 implementations in C. The AT MEGA1280 has 128 KB of code space, it shoudn't have any problems running any off-the-shelf implementation.
Here is pretty much the first one I found.

You should know what polynomial you are dealing with. So it is not enough to know that you are using CRC, but you should also know polynomial.
You are looking for function with this kind of prototype
uint32_t crc(uint8_t * data, int len, uint32_t polynomial)
Or even further, this function can also support updating if you getting your data asynchronously, so there is extra parameter for resuming computation.
With CRC32 you stream bits through CRC function and you get 32 bit number that is used for data corruption checking.
I am sure you will be able to find C code of crc online.
EDIT:
It looks like, CRC32 polynomial is sort of standard and is usually unified.
That mean that CRC32 implementation will employ correct polynomial.
http://en.wikipedia.org/wiki/Computation_of_CRC

Copy-paste solution for AVR, ripped from here:
/* crc32.c -- compute the CRC-32 of a data stream
* Copyright (C) 1995-1996 Mark Adler
* For conditions of distribution and use, see copyright notice in zlib.h
*/
/* $Id: crc32.c,v 1.1 2007/10/07 20:47:35 matthias Exp $ */
/* ========================================================================
* Table of CRC-32's of all single-byte values (made by make_crc_table)
*/
static unsigned long crc_table[256] = {
0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
0x2d02ef8dL
};
#define DO1(buf) crc = crc_table[((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8);
#define DO2(buf) DO1(buf); DO1(buf);
#define DO4(buf) DO2(buf); DO2(buf);
#define DO8(buf) DO4(buf); DO4(buf);
unsigned long crc32(crc, buf, len)
unsigned long crc;
const unsigned char *buf;
unsigned int len;
{
if (!buf) return(0L);
crc = crc ^ 0xffffffffL;
while (len >= 8)
{
DO8(buf);
len -= 8;
}
if (len) do {
DO1(buf);
} while (--len);
return(crc^0xffffffffL);
}

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight