parallelize prime sieve segmentation error - c

I changed the first version of the Code linked here to make the loop on line 360 (in the Code I shared below) run in parallel.
For this I replaced the fields\variables where the results are saved by fields of field\variables so that every thread can save them without deleting the results of other threads.
Additionally I replaced the calculation of values used in every loop pass in a way that they aren't depending on the value they had in the last loop pass (So that I can calculate them just depending on the value of the loop variable).
I am going to post the entire code here because a minimal example would only be about 10 lines shorter but would lose any way to check whether the result is wrong. No special compiler features are used; only -fopenmp (under g++) is needed as a compile argument.
//
// prime_sieve.c
//
// Copyright (C) July 2002, Tomás Oliveira e Silva
//
// e-mail: tos#ua.pt
// www: http://www.ieeta.pt/~tos
//
// Comparison of two simple (but serious) implementations of the segmented sieve of
// Eratosthenes. The second implementation can generate primes reasonably fast near
// 1e18 (using around 400Mbytes of memory).
//
// _implementation_=0 gives a classical segmented sieve
// _implementation_=1 gives a cache-friendly segmented sieve
//
// See timing results for the two implementations at the end.
//
// Main idea: use one linked list for each interval of the segmented sieve, putting in it
// the primes that have an odd multiple in that interval (but not in a previous interval);
// this allows a better utilization of the processor data caches, giving significant time
// savings (up to a factor of 6) when working near 1e18. The amount of memory used is
// approximately 8*pi(sqrt(N)), where N is the last number of the interval, and pi(x) is
// the usual prime counting function.
//
// Assumptions: pointers have 4 bytes, gcc compiler
//
//
// Released under the GNU general public license (version 2 or any later version); see the
// gpl.txt file (or the page http://www.gnu.org/licenses/gpl.html) for details.
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
#include <math.h>
#include <omp.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//
// configuration specification
//
// default parameters optimized for integers near 1e18; see tables at the end
//
#define nthreads 2
#ifndef _sieve_bits_log2_
# define _sieve_bits_log2_ 19
#endif
#ifndef _bucket_size_log2_
# define _bucket_size_log2_ 10
#endif
#if _bucket_size_log2_ > 16
# error "_bucket_size_log2_ is too large"
#endif
//
// basic type definitions
//
typedef unsigned char u08;
typedef unsigned int u32;
typedef unsigned long long u64;
//
// memory allocation
//
// Allocate `size` bytes aligned on a 256-byte boundary.
// Exits the program on allocation failure.
// NOTE(review): the original malloc() pointer is discarded, so these blocks
// can never be free()d -- callers treat the allocations as permanent.
static void *get_memory(size_t size)
{
  void *p;
  uintptr_t m;

  p = malloc(size + 255); // 255 spare bytes so we can round up to alignment
  if(p == NULL)
    exit(1);
  // uintptr_t (unlike the old size_t cast) is guaranteed by C99/C11 to
  // round-trip a pointer value, removing the "pointers are 4 bytes" assumption
  m = ((uintptr_t)p + (uintptr_t)255) & ~(uintptr_t)255;
  return (void *)m; // pointer aligned on a 256 byte boundary
}
//
// count the number of zeros
//
// Count the number of zero bits in the `size` bytes starting at `addr`.
// (u08/u32 spelled as their underlying types: unsigned char / unsigned int.)
//
// BUG FIX: the previous version lazily initialized a static lookup table with
// unsynchronized `data[i]++` increments; when first reached from inside an
// OpenMP parallel region two threads could build the table concurrently and
// corrupt it. This version is stateless: it computes an 8-bit population
// count per byte with the classic SWAR bit trick, so it is thread-safe.
static unsigned int count_zero_bits(unsigned char *addr,unsigned int size)
{
  unsigned int total = 0;
  unsigned int i,b;

  for(i = 0;i < size;i++)
  {
    b = addr[i];
    b = b - ((b >> 1) & 0x55u);          // 2-bit pair counts
    b = (b & 0x33u) + ((b >> 2) & 0x33u); // 4-bit nibble counts
    b = (b + (b >> 4)) & 0x0Fu;           // popcount of the byte (0..8)
    total += 8u - b;                      // zeros = 8 - ones
  }
  return total;
}
//
// generation of the (small) primes used by the main sieve
//
#define number_of_small_primes 6541// number of odd primes below 2^16 (65536)
static u32 small_primes[number_of_small_primes];
static u32 small_sieve[nthreads][1024];//65 536 bits
static u32 small_base[nthreads];
// Re-sieve the 65536-number window [small_base[th_id], small_base[th_id]+65536)
// into small_sieve[th_id]; a set bit marks an odd composite. Only odd numbers
// are stored (one bit per odd number: 32768 bits = 1024 u32 words).
static void update_small_sieve(u32 th_id)
{
u32 i,j;
// clear the whole window
for(j = 0;j < 1024;j++)
small_sieve[th_id][j] = 0;
for(i = 0;i < number_of_small_primes;i++)
{
// start crossing out at p*p (smaller multiples have a smaller prime factor)
j = small_primes[i] * small_primes[i];
if(j >= small_base[th_id] + 65536)
break; // p*p is past the window, so it is for every later prime too
if(j < small_base[th_id])
{
// p*p lies before the window: advance j to the first odd multiple of
// small_primes[i] that is >= small_base[th_id]
j = small_base[th_id] / small_primes[i];
j *= small_primes[i];
if(j < small_base[th_id])
j += small_primes[i];
if((j & 1) == 0) // even multiples are not represented; step to the odd one
j += small_primes[i];
}
// mark every odd multiple inside the window (>>1 maps odd numbers to bits)
for(j = (j - small_base[th_id]) >> 1;j < 32768;j += small_primes[i])
small_sieve[th_id][j >> 5] |= 1 << (j & 31);
}
}
//
// main sieve
//
// the following structure is used to record the
// information required to sieve an interval
//
// the value of _bucket_size_log2_ should
// be small (and a multiple of the L1 or L2 data cache line size)
//
#define primes_per_bucket ((1 << (_bucket_size_log2_ - 3)) - 1)
typedef struct bucket
{
struct bucket *next; // pointer to next bucket
u32 count; // count of the number of primes in this bucket
struct
{
u32 p; // prime
u32 o; // the bit number of the first odd multiple (>= main_base) of the prime
}
data[primes_per_bucket];
}
bucket;
static u32 main_sieve[nthreads][1 << (_sieve_bits_log2_ - 5)];
static u64 main_limit; // not changed in parallel sections
static bucket **main_lists[nthreads],*available_buckets[nthreads];
static u32 list_size_log2;
void more_buckets(int th_id) { u32 i,j; i = 1 << (20 - _bucket_size_log2_);
available_buckets[th_id] = (bucket *)get_memory(i * sizeof(bucket)); for(j = 0;j < i;j++)
available_buckets[th_id][j].next = (j < i - 1) ? &available_buckets[th_id][j + 1] : NULL; }
// Move one bucket from thread th_id's free list to the head of its circular
// list slot k, ready to receive primes (its count is reset to zero).
void new_bucket(u64 k,int th_id)
{
  bucket *b;

  if(available_buckets[th_id] == NULL)
    more_buckets(th_id);                 /* refill the free list */
  b = available_buckets[th_id];
  available_buckets[th_id] = b->next;    /* pop from the free list */
  b->count = 0;
  b->next = main_lists[th_id][k];        /* push onto list slot k */
  main_lists[th_id][k] = b;
}
// Scan the small sieve for primes p with p*p < end-of-segment and file each
// one (together with the bit offset of its first odd multiple >= main_base)
// into the matching circular-list slot of thread th_id.
// NOTE(review): next_prime is passed by value, so the caller's copy is not
// advanced by this function -- the caller must advance it itself.
static void init_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
u64 t,end;
u32 i,j;
u32 k;
// end of the segment [main_base, main_base + 2^(_sieve_bits_log2_+1))
end = main_base + (u64)(2 << _sieve_bits_log2_);
// make sure this thread's small-sieve window contains next_prime
if ( small_base[th_id] != (next_prime/65536) * 65536) {
small_base[th_id] = (next_prime/65536) * 65536;
update_small_sieve(th_id);
}
while((t = (u64)next_prime * (u64)next_prime) < end)
{
if(next_prime >= small_base[th_id] + 65536)
{
// next_prime walked past the current window: slide the window forward
small_base[th_id] += 65536;
update_small_sieve(th_id);
}
// primes are (beside two) always odd so they have at least a distance of 2.
// you dont have to save information about even numbers, so divide distance by two.
i = (next_prime - small_base[th_id]) >> 1;
if((small_sieve[th_id][i >> 5] & (1 << (i & 31))) == 0)// is nextprime a prime?
{
if(t < main_base) // set t to the first odd multiple of the prime >= main_base
{
t = main_base / (u64)next_prime;
t *= (u64)next_prime;
if(t < main_base)
t += (u64)next_prime;
if(((u32)t & 1) == 0)
t += (u64)next_prime;
}
i = (u32)((t - main_base) >> 1); // bit number
// circular-list slot of the segment containing that first multiple
k = (current_list + (i >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
if(main_lists[th_id][k]->count == primes_per_bucket){
//#pragma omp critical
new_bucket(k, th_id);
}
j = main_lists[th_id][k]->count++;
main_lists[th_id][k]->data[j].p = next_prime;
main_lists[th_id][k]->data[j].o = i & ((1 << _sieve_bits_log2_) - 1);
}
// atomic add
next_prime += 2;
}
}
// Sieve one segment for thread th_id: set a bit in main_sieve[th_id] for
// every composite offset using the primes filed in circular-list slot
// current_list, re-file each prime into the slot of the next segment it
// touches, then recycle the drained buckets.
static void do_main_sieve(const u64 main_base, const u32 th_id, u32 next_prime, const u32 current_list)
{
bucket *b;
bucket *c;
u32 j,k;
u32 i,p,o;
// file any primes whose square first appears inside this segment
init_main_sieve(main_base, th_id, next_prime, current_list);
for(i = 0;i < (1 << (_sieve_bits_log2_ - 5));i++)
main_sieve[th_id][i] = 0;
b = main_lists[th_id][current_list];
while(b != NULL)
{
for(i = 0;i < b->count;i++)
{
p = b->data[i].p;
for(o = b->data[i].o;o < (1 << _sieve_bits_log2_);o += p)
// pick the u32 word using all bits of o except the last 5, then use
// the last 5 bits of o to select the bit that marks this multiple of p
main_sieve[th_id][o >> 5] |= 1 << (o & 31);
// the loop leaves o just past this segment: re-file p into the list
// slot of the segment that contains its next odd multiple
k = (current_list + (o >> _sieve_bits_log2_)) & ((1 << list_size_log2) - 1);
if(main_lists[th_id][k]->count == primes_per_bucket) {
//#pragma omp critical
new_bucket(k, th_id);
}
j = main_lists[th_id][k]->count++;
main_lists[th_id][k]->data[j].p = p;
main_lists[th_id][k]->data[j].o = o & ((1 << _sieve_bits_log2_) - 1);
}
// return the drained bucket to this thread's free list
c = b;
b = b->next;
c->next = available_buckets[th_id];
available_buckets[th_id] = c;
}
main_lists[th_id][current_list] = NULL;
// NOTE(review): main_lists and available_buckets are indexed per thread, so
// it is unclear what shared state this critical section protects -- confirm
// its purpose before removing it.
#pragma omp critical
new_bucket(current_list, th_id);
//current_list = (current_list + 1) & ((1 << list_size_log2) - 1);
}
// One-time construction of the small_primes table: sieve the odd numbers
// below 65536 (using small_sieve[0] as scratch space) and collect the
// survivors. Idempotent once small_primes[0] has been set.
// NOTE(review): the lazy initialization is not safe to run concurrently;
// main() calls this before entering any parallel region.
void set_small_primes(void)
{
u32 i,j;
if(small_primes[0] == 0)
{ // initialize the small_primes array
for(j = 0;j < 1024;j++)
small_sieve[0][j] = 0;
// cross out odd composites; divisors up to 255 suffice since 256^2 = 65536
for(i = 3;i < 256;i += 2)// 256 ^2 = 65 536
if((small_sieve[0][i >> 6] & (1 << ((i >> 1) & 31))) == 0)
for(j = (i * i) >> 1;j < 32768;j += i)
small_sieve[0][j >> 5] |= 1 << (j & 31);
j = 0;
// collect every odd number whose bit stayed clear (bit index = i >> 1)
for(i = 3;i < 65536;i += 2)
if((small_sieve[0][i >> 6] & (1 << ((i >> 1) & 31))) == 0)
small_primes[j++] = i;
if(j != number_of_small_primes)
exit(2); // this should never happen
}
}
//
// main program
//
int main(int argc,char **argv)
{
double t;
u32 i,j;
u64 pi, counter=0;
u64 main_base;
int ntasks = 1;
u32 next_prime = 3;
u32 current_list = 0;
omp_set_num_threads(nthreads);
if(argc == 1)
i = 15;
else
i = atoi(argv[1]);
if(i < 6)
i = 6;
if(i > 18)
i = 18;
printf("%2u %2u",_sieve_bits_log2_,_bucket_size_log2_);
main_base = 1ull;
for(j = 0;j < i;j++)
main_base *= 10ull;
main_limit = main_base + 2000000000ull;
// set list_size_log2
u32 l;
l = 1 + (u32)ceil(sqrt((double)main_limit));
l = 2 + (l >> _sieve_bits_log2_);
for(list_size_log2 = 2;(1 << list_size_log2) < l;list_size_log2++)
;
//set main_lists
for (int i = 0; i < nthreads;i++) {
available_buckets[i] = NULL;
main_lists[i] = (bucket **)get_memory((1 << list_size_log2) * sizeof(bucket *));
for(u32 k = 0;k < (1 << list_size_log2);k++)
{
main_lists[i][k] = NULL;
new_bucket(k, i);
}
}
//set_small_primes
t = (double)clock();
for (int i = 0; i < nthreads;i++) small_base[i] = 0;
set_small_primes();
printf(" %2d",i);
// init main sieve
init_main_sieve(main_base,0, next_prime, current_list);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %6.2f",t);
j = 1 << (_sieve_bits_log2_ - 3);
pi = 0ull;
main_limit = main_base + 1000000000ull;
if(((u32)main_base | (u32)main_limit) & 63)
{
fprintf(stderr,"Warning: prime number counts may be incorrect\n");
fprintf(stderr," main_base and main_limit should be multiples of 64\n");
}
// calculate iteration count fast
t = (double)clock();
u64 main_base_tmp = main_base;
const u64 main_base_const = main_base_tmp;
for(;;)
{
i = (u32)(main_limit - main_base_tmp) >> 4;
if(i <= j)
break;
main_base_tmp += (u64)j << 4;
counter++;
}
{
//prepare values
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const;
u64 end = main_base_private + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
next_prime = next_prime_private;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list);
// calculate results
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
}
while (1) printf("B");
#pragma omp parallel for //private (main_base)
for(u64 c=1;c<counter;c++)
{
//prepare values
u32 current_list_private = current_list;
for (u64 count = 0; count < c; count++)
current_list_private = (current_list_private + 1) & ((1 << list_size_log2) - 1);
int th_id = omp_get_thread_num();
u64 main_base_private = main_base_const+((u64)j << 4)*(c);
u64 end = main_base_const+((u64)j << 4)*(c-1) + (u64)(2 << _sieve_bits_log2_);
u32 next_prime_private = next_prime;
while ((u64) next_prime_private * (u64) next_prime_private < end) next_prime_private += 2;
// call function
do_main_sieve(main_base_private, th_id, next_prime_private, current_list_private);
// calculate results
#pragma omp atomic
pi += (u64)count_zero_bits((u08 *)main_sieve[th_id],j);
printf(" %llu",c);
}
main_base = main_base_const+((u64)j << 4)*(counter);
u64 end = main_base + (u64)(2 << _sieve_bits_log2_);
while ((u64) next_prime * (u64) next_prime < end) next_prime += 2;
for (u64 count = 0; count < counter; count++)
current_list = (current_list + 1) & ((1 << list_size_log2) - 1);
do_main_sieve(main_base, 0, next_prime, current_list);
i = (u32)(main_limit - main_base) >> 4;
pi += (u64)count_zero_bits((u08 *)main_sieve[0],i);
t = ((double)clock() - t) / (double)CLOCKS_PER_SEC;
printf(" %7.2f %8llu\n",t,pi);
return 0;
}
I checked all variables used in this code; they should not depend on anything other than the loop variable (and on other variables calculated only from the loop variable). To be specific,
next_prime, main_base, small_base, small_sieve, available_buckets, main_sieve, and current_list should not cause any trouble.
I would really appreciate it if anybody could have a look at it and tell me why I always get the same wrong result whenever I choose a thread count > 1.
May some IDE's could tell more about that too but I use Codelite rarely and don't know how to get this information.

Related

(spoj)prime generator using c- time limited exceeded issue?

My code is throwing time exceeded in spoj although it is executed correctly,
Here's my code:
#include <stdio.h>
// SPOJ PRIME1-style: for each of t test cases, print every prime in [n,m].
// Fix for the time-limit-exceeded: trial-divide only by odd numbers up to
// sqrt(i) instead of counting all divisors from 1 to i (O(sqrt(m)) versus
// O(m) work per candidate).
int main() {
    int t;
    if (scanf("%d",&t) != 1)        // BUG FIX: scanf results were unchecked
        return 0;
    while (t--) {
        long int n,m;
        if (scanf("%ld%ld",&n,&m) != 2)
            return 0;
        // BUG FIX: loop variable is long (the old `int i` could overflow
        // when m exceeds INT_MAX, and %d mismatched the printed value)
        for (long int i = n; i <= m; i++) {
            if (i < 2)
                continue;            // 0, 1 and negatives are not prime
            // 2 is prime; any other even number is not; odds get tested
            int isprime = (i == 2) || (i % 2 != 0);
            for (long int d = 3; isprime && d * d <= i; d += 2)
                if (i % d == 0)
                    isprime = 0;     // found a proper divisor
            if (isprime)
                printf("%ld\n",i);
        }
    }
    return 0;
}
From my top comments, there are various speedups ...
The biggest issue is that you iterate by 1 until you reach m. This is much faster if you stop at the int equivalent of sqrt(m).
And, after checking for 2, you only need to test odd values, so you can do: i += 2 instead of i++ (e.g. 3,5,7,9,11,13,15,17,29,...).
And, after 3, primes are numbers that are only of the form: 6n-1 or 6n+1. So, you can test [only] the sequence 5,7 11,13 17,19 23,25 29,31 ...
Thank you for showing interest in solving my doubt. Could you please explain it clearly, point by point?
There are many references for the above speedups. For the first two, look at https://en.wikipedia.org/wiki/Trial_division
For (3), a web search on prime 6n produces:
https://primes.utm.edu/notes/faq/six.html
https://reflectivemaths.wordpress.com/2011/07/22/proof-primes-are-6n-1/
Note that [as others have mentioned], a "sieve" algorithm may be faster if you have sufficient memory: https://en.wikipedia.org/wiki/Sieve_of_Eratosthenes
Note that I first mentioned (1) (on SO) in a comment here: Finding Primes, where did I got wrong?
I had developed a benchmark program for successively faster algorithms for that question, but never got around to adding an answer.
Here is the program that I developed. Note that the function prime1 is equivalent to your algorithm.
// primebnc/primebnc.c -- prime algorithm benchmark
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
//#define MAXCNT 10001
//#define MAXCNT 20002
//#define MAXCNT 30003
//#define MAXCNT 40004
#define MAXCNT 1000000
int opt_f;
typedef unsigned long long val_t;
typedef struct {
int tst_no; // test number
const char *tst_who; // test description
double tst_elap; // elapsed time for test
} tst_t;
int tstnocur; // current test number
int tstnobase; // lowest test number
int tstnomax; // highest test number
int inited; // 1=initialization complete
const char *reason;
tst_t tstlist[30]; // list of test results
int pcntold;
double bestrate;
double tvprev;
int maxcnt;
val_t *pxold;
val_t *pxnow;
// iteratively get square root
#define SQR(_n) \
for (; (xsqrt * xsqrt) < _n; xsqrt += 1)
// tstloc -- map a test number to its control record; 0 means "current test"
tst_t *
tstloc(int tstno)
{
    return &tstlist[tstno ? tstno : tstnocur];
}
// tstset -- record bookkeeping (number, description, lowest/highest test
// number seen so far) for the test that just ran
void
tstset(int tstno,const char *tag)
{
    tst_t *tst;

    tstnocur = tstno;
    if (tstnobase == 0)       // remember the lowest test number
        tstnobase = tstno;
    if (tstno > tstnomax)     // track the highest test number
        tstnomax = tstno;
    reason = tag;

    tst = tstloc(tstno);
    tst->tst_no = tstno;
    tst->tst_who = reason;
}
// prime1 -- baseline algorithm: trial-divide every candidate n by all of
// 2..n-1 (deliberately slow; kept as the benchmark reference)
int
prime1(val_t *primes)
{
    int found = 0;

    for (val_t n = 2; found <= maxcnt; n++) {
        int composite = 0;
        for (val_t d = 2; d < n; d++) {
            if (n % d == 0) {
                composite = 1;
                break;
            }
        }
        if (!composite)
            primes[found++] = n;
    }
    tstset(1,"baseline 2 to n");
    return found;
}
// prime2 -- trial division by every 2..sqrt(n); the integer square root is
// carried across iterations (it only ever grows) and a wraparound check
// guards the 64-bit increment
int
prime2(val_t *primes)
{
    val_t xsqrt = 0;   // running integer sqrt, advanced by SQR()
    int found = 0;

    primes[found++] = 2;
    for (val_t n = 2; found <= maxcnt; ) {
        SQR(n);        // ensure xsqrt * xsqrt >= n
        int composite = 0;
        for (val_t d = 2; d <= xsqrt; d++) {
            if (n % d == 0) {
                composite = 1;
                break;
            }
        }
        if (!composite)
            primes[found++] = n;
        val_t next = n + 1;
        if (next < n)  // unsigned wraparound
            printf("overflow: idx=%d\n",found);
        n = next;
    }
    tstset(2,"2 to sqrt by 2");
    return found;
}
// prime3 -- like prime2 but seeds 2 and 3, then tests only odd candidates
// against odd divisors 3..sqrt(n)
int
prime3(val_t *primes)
{
    val_t xsqrt = 0;   // running integer sqrt, advanced by SQR()
    int found = 0;

    primes[found++] = 2;
    primes[found++] = 3;
    for (val_t n = 5; found <= maxcnt; n += 2) {
        SQR(n);
        int composite = 0;
        for (val_t d = 3; d <= xsqrt; d += 2) {
            if (n % d == 0) {
                composite = 1;
                break;
            }
        }
        if (!composite)
            primes[found++] = n;
    }
    tstset(3,"3 to sqrt by 2");
    return found;
}
// prime4 -- test the 6n-1 and 6n+1 candidates together in a single divisor
// loop; bit 0 of `isprime` tracks 6n-1 and bit 1 tracks 6n+1
int
prime4(val_t *primes)
{
val_t i;
int idx;
int isprime;
val_t xsqrt;
val_t n;
val_t lo;
val_t hi;
xsqrt = 0;
idx = 0;
primes[idx++] = 2;
primes[idx++] = 3;
n = 6;
while (idx <= maxcnt) {
lo = n - 1;
hi = n + 1;
// get sqrt(n) -- bounding by the larger candidate covers both
SQR(hi);
isprime = 3; // assume both candidates prime (bit0 = lo, bit1 = hi)
for (i = 3; i <= xsqrt; i += 2) {
if (isprime & 1) {
if (lo % i == 0)
isprime &= ~1; // lo proved composite
}
if (isprime & 2) {
if (hi % i == 0)
isprime &= ~2; // hi proved composite
}
if (! isprime)
break; // both composite: stop early
}
if (isprime & 1) {
primes[idx] = lo;
idx++;
}
if (isprime & 2) {
primes[idx] = hi;
idx++;
}
// NOTE(review): up to two primes are stored per loop test, so idx can
// reach maxcnt + 2 -- the result buffers must be sized accordingly.
n += 6;
}
tstset(4,"6 to sqrt by 6 (combined 6n-1/6n+1 loops)");
return idx;
}
// prime5 -- 6n±1 wheel with two separate divisor loops (one per candidate),
// both bounded by the integer sqrt of the larger candidate
// NOTE(review): up to two primes are stored per loop test, so the returned
// count can reach maxcnt + 2 -- result buffers must be sized accordingly.
int
prime5(val_t *primes)
{
    val_t xsqrt = 0;
    int found = 0;

    primes[found++] = 2;
    primes[found++] = 3;
    for (val_t n = 6; found <= maxcnt; n += 6) {
        val_t lo = n - 1;
        val_t hi = n + 1;
        SQR(hi);   // one sqrt bound covers both candidates

        int composite = 0;
        for (val_t d = 3; d <= xsqrt; d += 2) {
            if (lo % d == 0) {
                composite = 1;
                break;
            }
        }
        if (!composite)
            primes[found++] = lo;

        composite = 0;
        for (val_t d = 3; d <= xsqrt; d += 2) {
            if (hi % d == 0) {
                composite = 1;
                break;
            }
        }
        if (!composite)
            primes[found++] = hi;
    }
    tstset(5,"6 to sqrt by 6 (separate 6n-1/6n+1 loops)");
    return found;
}
// prime6 -- like prime4 (combined 6n-1/6n+1 bitmask) but trial-divides by
// the primes found so far instead of by every odd number
int
prime6(val_t *primes)
{
int cnt;
int isprime;
val_t xsqrt;
val_t n;
val_t lo;
val_t hi;
val_t pval;
val_t *pptr;
val_t *pend;
xsqrt = 0;
cnt = 0;
primes[cnt++] = 2;
primes[cnt++] = 3;
n = 6;
while (cnt <= maxcnt) {
lo = n - 1;
hi = n + 1;
// get sqrt(n)
SQR(hi);
isprime = 3; // bit 0 = "lo still prime", bit 1 = "hi still prime"
pptr = primes;
pend = primes + cnt;
for (; pptr < pend; ++pptr) {
pval = *pptr;
// early stop if we exceed square root of number being tested
if (pval > xsqrt)
break;
// test 6n - 1
if (isprime & 1) {
if ((lo % pval) == 0)
isprime &= ~1;
}
// test 6n + 1
if (isprime & 2) {
if ((hi % pval) == 0)
isprime &= ~2;
}
// bug out if both are non-prime
if (! isprime)
break;
}
// 6n - 1 is prime
if (isprime & 1) {
primes[cnt] = lo;
cnt++;
}
// 6n + 1 is prime
if (isprime & 2) {
primes[cnt] = hi;
cnt++;
}
// NOTE(review): up to two primes are stored per loop test, so cnt can
// reach maxcnt + 2 -- the result buffers must be sized accordingly.
n += 6;
}
tstset(6,"6 to sqrt by prime list (combined 6n-1/6n+1 loops)");
return cnt;
}
// prime7 -- trial division by the prime list, with separate loops for the
// 6n-1 and 6n+1 candidates; the 6n+1 loop recomputes pend, so a 6n-1 prime
// stored in the same iteration is already available as a divisor candidate
int
prime7(val_t *primes)
{
int cnt;
int isprime;
val_t xsqrt;
val_t n;
val_t lo;
val_t hi;
val_t pval;
val_t *pptr;
val_t *pend;
xsqrt = 0;
cnt = 0;
primes[cnt++] = 2;
primes[cnt++] = 3;
n = 6;
while (cnt <= maxcnt) {
lo = n - 1;
hi = n + 1;
// get sqrt(n)
SQR(hi);
// check for 6n - 1 is prime
isprime = 1;
pptr = primes;
pend = primes + cnt;
for (; pptr < pend; ++pptr) {
pval = *pptr;
// early stop if we exceed square root of number being tested
if (pval > xsqrt)
break;
// test 6n - 1
if ((lo % pval) == 0) {
isprime = 0;
break;
}
}
if (isprime) {
primes[cnt] = lo;
cnt++;
}
// check for 6n + 1 is prime
isprime = 1;
pptr = primes;
pend = primes + cnt;
for (; pptr < pend; ++pptr) {
pval = *pptr;
// early stop if we exceed square root of number being tested
if (pval > xsqrt)
break;
// test 6n + 1
if ((hi % pval) == 0) {
isprime = 0;
break;
}
}
if (isprime) {
primes[cnt] = hi;
cnt++;
}
// NOTE(review): up to two primes are stored per loop test, so cnt can
// reach maxcnt + 2 -- the result buffers must be sized accordingly.
n += 6;
}
tstset(7,"6 to sqrt by prime list (separate 6n-1/6n+1 loops)");
return cnt;
}
// tscgetf -- current wall-clock time as a double, in seconds since the epoch
double
tscgetf(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_REALTIME,&ts);
    return (double)ts.tv_sec + (double)ts.tv_nsec / 1e9;
}
// showrat -- print the speed ratio against a previous test, normalized so
// the printed factor is always >= 1 and labeled faster/slower
void
showrat(double ratio,int tstprev)
{
    const char *tag = "faster";

    if (ratio <= 1.0) {
        tag = "slower";
        ratio = 1.0 / ratio;
    }
    printf(" %.3fx %s than prime%d\n",ratio,tag,tstprev);
}
// timeit -- run one prime generator, report its generation rate, compare its
// elapsed time against every previously-run generator, and verify its output
// matches the reference buffer produced by the first run
void
timeit(int (*pfnc)(val_t *))
{
tst_t *tstcur;
tst_t *tstcmp;
val_t *pnow;
val_t *pold;
int pcntact;
double tvbeg;
double tvend;
double rate;
double ratio;
printf("---------------\n");
pold = pxold;
// the first invocation fills the reference buffer; later ones use pxnow
pnow = inited ? pxnow : pxold;
// load up the cache
for (int i = 0; i < maxcnt; i++)
pnow[i] = 1;
tvbeg = tscgetf();
pcntact = pfnc(pnow);
tvend = tscgetf();
tvend -= tvbeg; // elapsed seconds
tstcur = tstloc(0); // control record of the test that just ran
printf("prime%d: %s\n",tstnocur,reason);
// show prime generation rate
rate = (double) maxcnt / tvend;
printf(" %.9f (%.3f primes/sec)\n",tvend,rate);
do {
if (! inited) {
// first run: just record the baseline, nothing to compare against
pcntold = pcntact;
bestrate = rate;
break;
}
// show time ratio against every other test run so far
for (int tstno = tstnobase; tstno <= tstnomax; ++tstno) {
if (tstno == tstnocur)
continue;
tstcmp = tstloc(tstno);
ratio = tstcmp->tst_elap / tvend;
showrat(ratio,tstno);
}
// verify this generator produced the same primes as the reference run
for (int i = 0; i < maxcnt; i++) {
if (pnow[i] != pold[i]) {
printf("%d: pold=%lld pnow=%lld\n",i,pold[i],pnow[i]);
break;
}
}
} while (0);
tstcur->tst_elap = tvend;
inited = 1;
}
// Parse options (-f forces the slow prime1 baseline, -N<count> sets the
// number of primes per test), allocate the result buffers, and run the suite.
int
main(int argc,char **argv)
{
    char *cp;

    --argc;
    ++argv;
    maxcnt = MAXCNT;
    for (; argc > 0; --argc, ++argv) {
        cp = *argv;
        if (*cp != '-')
            break;
        switch (cp[1]) {
        case 'f':
            opt_f = 1;
            break;
        case 'N':
            maxcnt = strtol(cp + 2,&cp,10);
            break;
        }
    }
    setlinebuf(stdout);
    if (opt_f)
        maxcnt = 40004;
    if (maxcnt < 1)         // guard against "-N0" or a negative count
        maxcnt = 1;
    printf("maxcnt=%d\n",maxcnt);
    // +2 (not +1): prime4..prime7 may store two primes on their final loop
    // pass, so the count can reach maxcnt + 2 entries
    pxold = calloc(maxcnt + 2,sizeof(val_t));
    pxnow = calloc(maxcnt + 2,sizeof(val_t));
    if (pxold == NULL || pxnow == NULL) {   // BUG FIX: results were unchecked
        fprintf(stderr,"out of memory\n");
        return 1;
    }
    // this takes a whole minute
    if (opt_f)
        timeit(prime1);
    // these are _much_ faster
    timeit(prime2);
    timeit(prime3);
    timeit(prime4);
    timeit(prime5);
    timeit(prime6);
    timeit(prime7);
    return 0;
}
Here is the program output with the -f option (to force invocation of prime1):
maxcnt=40004
---------------
prime1: baseline 2 to n
69.242110729 (577.741 primes/sec)
---------------
prime2: 2 to sqrt by 2
0.182171345 (219595.459 primes/sec)
380.093x faster than prime1
---------------
prime3: 3 to sqrt by 2
0.091353893 (437901.424 primes/sec)
757.955x faster than prime1
1.994x faster than prime2
---------------
prime4: 6 to sqrt by 6 (combined 6n-1/6n+1 loops)
0.095818758 (417496.541 primes/sec)
722.636x faster than prime1
1.901x faster than prime2
1.049x slower than prime3
---------------
prime5: 6 to sqrt by 6 (separate 6n-1/6n+1 loops)
0.095270157 (419900.642 primes/sec)
726.797x faster than prime1
1.912x faster than prime2
1.043x slower than prime3
1.006x faster than prime4
---------------
prime6: 6 to sqrt by prime list (combined 6n-1/6n+1 loops)
0.047716141 (838374.591 primes/sec)
1451.126x faster than prime1
3.818x faster than prime2
1.915x faster than prime3
2.008x faster than prime4
1.997x faster than prime5
---------------
prime7: 6 to sqrt by prime list (separate 6n-1/6n+1 loops)
0.040664196 (983764.685 primes/sec)
1702.778x faster than prime1
4.480x faster than prime2
2.247x faster than prime3
2.356x faster than prime4
2.343x faster than prime5
1.173x faster than prime6
Here is the program output without -f:
maxcnt=1000000
---------------
prime2: 2 to sqrt by 2
24.093246222 (41505.407 primes/sec)
---------------
prime3: 3 to sqrt by 2
12.029967308 (83125.745 primes/sec)
2.003x faster than prime2
---------------
prime4: 6 to sqrt by 6 (combined 6n-1/6n+1 loops)
12.633468866 (79154.824 primes/sec)
1.907x faster than prime2
1.050x slower than prime3
---------------
prime5: 6 to sqrt by 6 (separate 6n-1/6n+1 loops)
12.002494335 (83316.015 primes/sec)
2.007x faster than prime2
1.002x faster than prime3
1.053x faster than prime4
---------------
prime6: 6 to sqrt by prime list (combined 6n-1/6n+1 loops)
4.346790791 (230054.780 primes/sec)
5.543x faster than prime2
2.768x faster than prime3
2.906x faster than prime4
2.761x faster than prime5
---------------
prime7: 6 to sqrt by prime list (separate 6n-1/6n+1 loops)
3.761972904 (265817.970 primes/sec)
6.404x faster than prime2
3.198x faster than prime3
3.358x faster than prime4
3.190x faster than prime5
1.155x faster than prime6

Generalizing Radix Sort uint64_t code from 32 MSB and 48 MSB to 40 MSB sort bits

Is it possible to generalize this Radix Sort code to look at only the 40 most significant bits of uint64_t data?
Generalizing the 32 sort bit code from user RGCLGR to 48 and 64 bits and comparing sorting a uint64_t[36M] on 64, 48 and 32 bits:
Time= 3.130 sec = 20.342%, RADIX_SORT_UINT64_REG, hits=4, 0.782 sec each
Time= 2.336 sec = 15.180%, RADIX_SORT_UINT64_48R, hits=4, 0.584 sec each
Time= 1.540 sec = 10.007%, RADIX_SORT_UINT64_32R, hits=4, 0.385 sec each
This confirms the expected linearity between bits sorted and time to sort.
I have a need to sort hundreds of uint64_t[]s on only the 34 Most Significant Bits. The 48 MSB sort works, but sorting on only 40 bits should take ~5/6 as long. This could reduce a 58 second travail to a mere 48 second ordeal for the user.
The difference between the 32 MSB code and the 48 MSB code is mostly slight variations except for one code segment:
32 bit code segment radix sorting mIndex [0, 1, 2, 3]:
for (i = 0; i < count; i++) { /* radix sort */
u = pData[i];
pTemp[mIndex[3][(u >> 32) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pTemp[i];
pData[mIndex[2][(u >> 40) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pData[i];
pTemp[mIndex[1][(u >> 48) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pTemp[i];
pData[mIndex[0][(u >> 56) & 0xff]++] = u;
}
The 48 bit segment prepends this code to handle mIndex [4, 5]:
for (i = 0; i < count; i++) { /* radix sort */
u = pData[i];
pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
}
for (i = 0; i < count; i++) {
u = pTemp[i];
pData[mIndex[4][(u >> 24) & 0xff]++] = u;
}
Converting to full 64 bit sorting adds similar code to operate on matrix indexes [6, 7]
Is it even possible to add mIndex[4] to create a 40 MSB sort?
The pData array is used with even mIndex indexes.
The pTemp array is used with odd mIndex indexes.
Is this method limited to being generalized only for even byte counts?
===================================
Full code for sorting a uint64[] on the 32 most significant bits:
// From code submitted by on stackoverflow.com rcgldr, Nov 3 2017
// Sort pData[0..count) on its 32 most significant bits: four byte-wide LSD
// radix passes over byte lanes 4..7, ping-ponging between pData and pTemp
// (an even number of passes, so the result lands back in pData). pTemp must
// hold count elements; tsa, when non-NULL, receives timing bookkeeping.
void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
EV_TIME_STR *tsa)
{
    size_t mIndex[4][256] = { 0 };  /* one histogram/index row per byte lane */
    size_t i, j;
    uint64_t v;

    if(tsa) time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);

    /* histogram the four high bytes of every element in one pass */
    for (i = 0; i < count; i++) {
        v = pData[i];
        mIndex[3][(v >> 32) & 0xff]++;
        mIndex[2][(v >> 40) & 0xff]++;
        mIndex[1][(v >> 48) & 0xff]++;
        mIndex[0][(v >> 56) & 0xff]++;
    }
    /* exclusive prefix sums turn each histogram into starting indices */
    for (j = 0; j < 4; j++) {
        size_t sum = 0;
        for (i = 0; i < 256; i++) {
            size_t cnt = mIndex[j][i];
            mIndex[j][i] = sum;
            sum += cnt;
        }
    }
    /* four stable distribution passes, least significant lane first */
    for (i = 0; i < count; i++) {
        v = pData[i];
        pTemp[mIndex[3][(v >> 32) & 0xff]++] = v;
    }
    for (i = 0; i < count; i++) {
        v = pTemp[i];
        pData[mIndex[2][(v >> 40) & 0xff]++] = v;
    }
    for (i = 0; i < count; i++) {
        v = pData[i];
        pTemp[mIndex[1][(v >> 48) & 0xff]++] = v;
    }
    for (i = 0; i < count; i++) {
        v = pTemp[i];
        pData[mIndex[0][(v >> 56) & 0xff]++] = v;
    }
} // End Radix_Sort_R64_32().
======================
And the diff between the 32 bit and the 48 bit sort versions:
diff ~/tmp/radix.sort.32.c ~/tmp/radix.sort.48.c
< < void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
< ---
< > void radix_sort_r64_48(uint64_t *pData, uint64_t *pTemp, size_t count,
<
< < size_t mIndex[4][256] = { 0 }; /* index matrix */
< ---
< > size_t mIndex[6][256] = { 0 }; /* index matrix */
<
< < if(tsa) time_event(E_RADIX_SORT_UINT64_32R, tsa, E_TIME_EVENT, 1, 0);
< ---
< > if(tsa) time_event(E_RADIX_SORT_UINT64_48R, tsa, E_TIME_EVENT, 1, 0);
<
< ---
< > mIndex[5][(u >> 16) & 0xff]++; // B2
< > mIndex[4][(u >> 24) & 0xff]++; // B3
<
< < for (j = 0; j < 4; j++) { /* convert to indices */
< ---
< > for (j = 0; j < 6; j++) { /* convert to indices */
<
< > pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
< > }
< > for (i = 0; i < count; i++) { /* radix sort */
< > u = pTemp[i];
< > pData[mIndex[4][(u >> 24) & 0xff]++] = u;
< > }
< > for (i = 0; i < count; i++) { /* radix sort */
< > u = pData[i];
< 44c56
< < } // End Radix_Sort_R64_32().
< ---
< > } // End Radix_Sort_R64_48().
Executive summary of unique differences:
Unique lines from "~/tmp/radix.sort.32.c":
02) void radix_sort_r64_32(uint64_t *pData, uint64_t *pTemp, size_t count,
05) size_t mIndex[4][256] = { 0 }; /* index matrix */
19) for (j = 0; j < 4; j++) { /* convert to indices */
Unique lines from "~/tmp/radix.sort.48.c":
01) void radix_sort_r64_48(uint64_t *pData, uint64_t *pTemp, size_t count,
04) size_t mIndex[6][256] = { 0 }; /* index matrix */
14) mIndex[5][(u >> 16) & 0xff]++; // B2
15) mIndex[4][(u >> 24) & 0xff]++; // B3
22) for (j = 0; j < 6; j++) { /* convert to indices */
34) pTemp[mIndex[5][(u >> 16) & 0xff]++] = u;
38) pData[mIndex[4][(u >> 24) & 0xff]++] = u;

Peculiar problem regarding convolution in PPM format

After debugging, trying different image softwares (xdg, gimp) I persist to have a bug which throws me off completely.
Problem is about convolution in PPM format, for images different in aspect ratio, I'm using 1500x1000px image, where mask of {0,0,0, 0,1,0, 0,0,0} works just fine (it's just copying image), however for mask where first or last row is different than 0 eg. {0,1,0, 0,0,0, 0,0,0} image is moved by 1/3 of its size rightwards. I find it peculiar, because as far as I know, I do not have an overflow or any pointer arithmetic that might cause this problem.
I've narrowed it down to the kernel of convolution. Afaik I do not have any problems saving, reading image, after running i_convolution it just moves image by predefined value?.
// Apply the kernel_size x kernel_size convolution kernel (file-global
// `kernel`) to the packed 0x00BBGGRR image `in` (y_max rows, x_max columns),
// writing the result into *out. The one-pixel border is skipped because the
// neighborhood would read outside the image there, so (*out)'s border pixels
// are left untouched by this function.
// NOTE(review): the neighborhood offsets are hard-coded to -1..1, so this
// effectively supports kernel_size == 3 only -- confirm before generalizing.
void i_convolution(unsigned int **in, unsigned int ***out,
                   int y_max, int x_max, int kernel_size)
{
    int kernel_sum = 0;
    for (int i = 0; i < kernel_size; i++)
    {
        for (int j = 0; j < kernel_size; j++)
        {
            kernel_sum += kernel[i * kernel_size + j];
        }
    }
    printf("kernel sum = %d\n", kernel_sum);
    // BUG FIX: a zero-sum kernel (e.g. edge detectors such as Sobel parts)
    // previously caused a division by zero below, which is undefined
    // behavior; the usual convention is to skip normalization in that case.
    if (kernel_sum == 0)
        kernel_sum = 1;
    for (int i = 1; i < y_max - 1; i++)
    {
        for (int j = 1; j < x_max - 1; j++)
        {
            int r = 0;
            int g = 0;
            int b = 0;
            for (int y_conv = -1; y_conv <= 1; y_conv++)
            {
                for (int x_conv = -1; x_conv <= 1; x_conv++)
                {
                    int y_index = i + y_conv;
                    int x_index = j + x_conv;
                    // unpack the 0x00BBGGRR channels of the neighbor pixel
                    unsigned char rval = (unsigned char)(in[y_index][x_index] & 0xff);
                    unsigned char gval = (unsigned char)((in[y_index][x_index] & 0xff00) >> 8);
                    unsigned char bval = (unsigned char)((in[y_index][x_index] & 0xff0000) >> 16);
                    int kernel_val = kernel[(y_conv + 1)*kernel_size + (x_conv + 1)];
                    r += (int)(rval * kernel_val);
                    g += (int)(gval * kernel_val);
                    b += (int)(bval * kernel_val);
                }
            }
            // normalize by the kernel sum (note: this is normalization,
            // not median filtration as the old comments claimed)
            r /= kernel_sum;
            g /= kernel_sum;
            b /= kernel_sum;
            // clamp each channel to the displayable 0..255 range
            if (r > 255) r = 255;
            else if(r < 0) r = 0;
            if (g > 255) g = 255;
            else if(g < 0) g = 0;
            if (b > 255) b = 255;
            else if(b < 0) b = 0;
            // repack as 0x00BBGGRR
            unsigned int val;
            val = 0;
            val |= b & 0xff;
            val <<= 8;
            val |= g & 0xff;
            val <<= 8;
            val |= r & 0xff;
            (*out)[i][j] = val;
        }
    }
}
let's take kernel {0, 1, 0, 0, 0, 0,
result are like this, with left being original, right after convolution
https://i.imgur.com/rzXKjUY.png
I will be thankful for any help.
Best regards.
I mark it as solved, because there was a problem with me misinterpreting PPM format height and width, which caused this behaviour, swapping y with x (and allocating memory as such) solves it!

How do I include a switch-case statement for this encryption/decryption code?

I just started using a microcontroller and I have to implement encryption/decryption in it. Sorry for the super long post.
This is the python script and do not need to be edited.
# Connection test: probe every /dev/ttyACM* serial device, write INPUT to it,
# and exit 0 as soon as one device echoes INPUT back within the retry budget.
# NOTE(review): this snippet relies on `os`, `sys` and pyserial's `serial`
# being imported above it, and the indentation below was flattened by the
# paste -- restore it from the original script before running.
DEVPATH = "/dev"
TTYPREFIX = "ttyACM"
INPUT = b"Hello!"
#OUTPUT = b"Ifmmp!"
if __name__=='__main__':
# iterate over all /dev entries that look like USB CDC-ACM serial devices
for tty in (os.path.join(DEVPATH,tty) for tty in os.listdir(DEVPATH) \
if tty.startswith(TTYPREFIX)):
try:
ctt = serial.Serial(tty, timeout=1, writeTimeout=1)
except serial.SerialException:
# not openable as a serial port: try the next candidate
continue
ctt.flushInput()
ctt.flushOutput()
# print(ctt)
try:
ctt.write(INPUT)
except serial.SerialTimeoutException:
ctt.__exit__()
continue
for retry in range(3): # Try three times to read connection test result
ret = ctt.read(2*len(INPUT))
print("ret: " + repr(ret))
if INPUT in ret:
# the device echoed our probe: success
sys.exit(0)
break
else:
# no echo after three reads: close and move on (for/else)
ctt.__exit__()
continue
break
else:
# loop exhausted without any device echoing the probe
print("Failed")
sys.exit(1)
This is the main.c file. I know that CDC_Device_BytesReceived will receive the input from the python script. And if there is input, it will run the while loop, since Bytes will be greater than 0.
while (1)
{
/* Check if data received */
Bytes = CDC_Device_BytesReceived(&VirtualSerial_CDC_Interface);
while(Bytes > 0)
{
/* Send data back to the host */
ch = CDC_Device_ReceiveByte(&VirtualSerial_CDC_Interface);
CDC_Device_SendByte(&VirtualSerial_CDC_Interface, ch);
--Bytes;
}
CDC_Device_USBTask(&VirtualSerial_CDC_Interface);
}
return 0;
}
However, in the loop, I was tasked to add a switch case so that it will switch between encryption and decryption. But I have no idea what kind of condition to use to differentiate the encryption and decryption.
This is the code for encryption.
// ASCON-style AEAD encryption.
//   c     : output buffer, receives ciphertext (mlen bytes) || tag (klen bytes)
//   clen  : out-parameter, set to mlen + CRYPTO_KEYBYTES
//   m/mlen: plaintext;  ad/adlen: associated data
//   npub  : public nonce (klen bytes);  k: key;  nsec: unused
// Returns 0 on success.
int crypto_aead_encrypt(unsigned char* c, unsigned long long* clen,
                        const unsigned char* m, unsigned long long mlen,
                        const unsigned char* ad, unsigned long long adlen,
                        const unsigned char* nsec, const unsigned char* npub,
                        const unsigned char* k)
{
    (void)nsec; // unused by this mode
    int klen = CRYPTO_KEYBYTES; // 16 bytes
    int size = 320 / 8;         // state size: 40 bytes
    int rate = 128 / 8;         // rate: 16 bytes
    // Permutation round counts
    int a = 12;
    int b = 8;
    // Number of rate-sized blocks after 10* padding of the associated data
    i64 s = adlen / rate + 1;
    // Number of rate-sized blocks after 10* padding of the plaintext
    i64 t = mlen / rate + 1;
    // Length of the final (partial) plaintext block
    i64 l = mlen % rate;
    u8 S[size];
    // Padded associated data, split into s blocks of rate bytes
    u8 A[s * rate];
    // Padded plaintext, split into t blocks of rate bytes
    u8 P[t * rate];
    i64 i, j;
    // Pad associated data: copy, append the 0x80 marker, zero-fill the rest.
    // (BUGFIX: the copy, the marker and the zero fill were previously nested
    // inside a single loop, so only ad[0] was ever copied before the inner
    // loop hijacked the index.)
    for(i = 0; i < adlen; ++i)
    {
        A[i] = ad[i];
    }
    A[adlen] = 0x80;
    for(i = adlen + 1; i < s * rate; ++i)
    {
        A[i] = 0;
    }
    // Pad plaintext the same way. (Same BUGFIX as above.)
    for(i = 0; i < mlen; ++i)
    {
        P[i] = m[i];
    }
    P[mlen] = 0x80;
    for(i = mlen + 1; i < t * rate; ++i)
    {
        P[i] = 0;
    }
    // Initialization
    // IV = k || r || a || b || 0,  S = IV || K || N
    S[0] = klen * 8;
    S[1] = rate * 8;
    S[2] = a;
    S[3] = b;
    for(i = 4; i < size - 2 * klen; ++i)
    {
        S[i] = 0; // zero padding between IV header and key
    }
    for(i = 0; i < klen; ++i)
    {
        S[size - 2 * klen + i] = k[i];
    }
    for(i = 0; i < klen; ++i)
    {
        S[size - klen + i] = npub[i];
    }
    printstate("Initial Value: ", S);
    permutations(S, 12 - a, a);
    // S ^= (0* || K)
    for(i = 0; i < klen; ++i)
    {
        S[size - klen + i] ^= k[i];
    }
    printstate("Initialization: ", S);
    // Absorb associated data (skipped entirely when adlen == 0).
    if(adlen != 0)
    {
        for(i = 0; i < s; ++i)
        {
            // BUGFIX: this loop previously incremented i instead of j,
            // skipping state bytes and corrupting the outer loop counter.
            for(j = 0; j < rate; ++j)
            {
                S[j] ^= A[i * rate + j];
            }
            permutations(S, 12 - b, b);
        }
    }
    // Domain separation: S ^= (0* || 1)
    S[size - 1] ^= 1;
    printstate("Process Associated Data: ", S);
    // Encrypt all full plaintext blocks.
    for(i = 0; i < t - 1; ++i)
    {
        for(j = 0; j < rate; ++j)
        {
            S[j] ^= P[i * rate + j];     // absorb plaintext
            c[i * rate + j] = S[j];      // emit ciphertext
        }
        permutations(S, 12 - b, b);
    }
    // Absorb the final (padded) block into the state...
    for(j = 0; j < rate; ++j)
    {
        S[j] ^= P[(t - 1) * rate + j];
    }
    // ...but emit only the l real ciphertext bytes.
    // (BUGFIX: this read "for(j = 0; j < 1; ++j);" — the stray semicolon made
    // the loop a no-op and the bound ignored the partial block length.)
    for(j = 0; j < l; ++j)
    {
        c[(t - 1) * rate + j] = S[j];
    }
    printstate("Process Plaintext: ", S);
    // Finalization: S ^= (0^rate || K || 0*), then the a-round permutation.
    for(i = 0; i < klen; ++i)
    {
        S[rate + i] ^= k[i];
    }
    permutations(S, 12 - a, a);
    // Tag T = last klen bytes of S, XORed with K.
    for(i = 0; i < klen; ++i)
    {
        S[size - klen + i] ^= k[i];
    }
    printstate("Finalization: ", S);
    // Append the tag after the ciphertext.
    for(i = 0; i < klen; ++i)
    {
        c[mlen + i] = S[size - klen + i];
    }
    *clen = mlen + klen;
    return 0;
}
and the code for decryption
// ASCON-style AEAD decryption with tag verification.
//   m/mlen : output plaintext buffer; *mlen set to clen - CRYPTO_KEYBYTES
//   c/clen : ciphertext || tag;  ad/adlen: associated data
//   npub   : public nonce;  k: key;  nsec: unused
// Returns 0 on success, -1 if the input is too short or the tag is wrong.
int crypto_aead_decrypt(unsigned char *m, unsigned long long *mlen,
                        unsigned char *nsec, const unsigned char *c,
                        unsigned long long clen, const unsigned char *ad,
                        unsigned long long adlen, const unsigned char *npub,
                        const unsigned char *k)
{
    (void)nsec; // unused by this mode
    *mlen = 0;
    // The ciphertext must at least contain the tag.
    if (clen < CRYPTO_KEYBYTES)
        return -1;
    int klen = CRYPTO_KEYBYTES;
    // int nlen = CRYPTO_NPUBBYTES;
    int size = 320 / 8;  // state size: 40 bytes
    int rate = 128 / 8;  // rate: 16 bytes
    // Permutation round counts
    int a = 12;
    int b = 8;
    i64 s = adlen / rate + 1;          // padded AD block count
    i64 t = (clen - klen) / rate + 1;  // padded ciphertext block count
    i64 l = (clen - klen) % rate;      // length of the final partial block
    u8 S[size];
    u8 A[s * rate];
    u8 M[t * rate];
    i64 i, j;
    // Pad associated data: copy, append 0x80 marker, zero-fill.
    for (i = 0; i < adlen; ++i)
    {
        A[i] = ad[i];
    }
    A[adlen] = 0x80;
    for (i = adlen + 1; i < s * rate; ++i)
    {
        A[i] = 0;
    }
    // Initialization: S = IV || K || N
    S[0] = klen * 8;
    S[1] = rate * 8;
    S[2] = a;
    S[3] = b;
    for (i = 4; i < size - 2 * klen; ++i)
    {
        S[i] = 0;
    }
    for (i = 0; i < klen; ++i)
    {
        S[size - 2 * klen + i] = k[i];
    }
    for (i = 0; i < klen; ++i)
    {
        S[size - klen + i] = npub[i];
    }
    printstate("initial value:", S);
    permutations(S, 12 - a, a);
    for (i = 0; i < klen; ++i)
    {
        S[size - klen + i] ^= k[i];
    }
    printstate("initialization:", S);
    // Absorb associated data.
    if (adlen)
    {
        for (i = 0; i < s; ++i)
        {
            for (j = 0; j < rate; ++j)
            {
                S[j] ^= A[i * rate + j];
            }
            permutations(S, 12 - b, b);
        }
    }
    // Domain separation bit.
    S[size - 1] ^= 1;
    printstate("process associated data:", S);
    // Decrypt all full ciphertext blocks; the state re-absorbs the ciphertext.
    for (i = 0; i < t - 1; ++i)
    {
        for (j = 0; j < rate; ++j)
        {
            M[i * rate + j] = S[j] ^ c[i * rate + j];
            S[j] = c[i * rate + j];
        }
        permutations(S, 12 - b, b);
    }
    // Final partial block: recover the remaining l plaintext bytes...
    for (j = 0; j < l; ++j)
    {
        M[(t - 1) * rate + j] = S[j] ^ c[(t - 1) * rate + j];
    }
    // ...then overwrite the state with those ciphertext bytes and apply the
    // 10* padding marker exactly once, mirroring the encrypt side.
    // (BUGFIX: "S[l] ^= 0x80;" used to sit INSIDE this loop, so it was applied
    // l times — cancelling itself out for even l — and never applied when
    // l == 0.)
    for (j = 0; j < l; ++j)
    {
        S[j] = c[(t - 1) * rate + j];
    }
    S[l] ^= 0x80;
    printstate("process plaintext:", S);
    // Finalization.
    for (i = 0; i < klen; ++i)
    {
        S[rate + i] ^= k[i];
    }
    permutations(S, 12 - a, a);
    for (i = 0; i < klen; ++i)
    {
        S[size - klen + i] ^= k[i];
    }
    printstate("finalization:", S);
    // Verify the tag; reject on mismatch.
    // NOTE(review): this comparison is not constant-time — consider a
    // branch-free accumulate-and-compare if timing side channels matter.
    for (i = 0; i < klen; ++i)
    {
        if (c[clen - klen + i] != S[size - klen + i])
        {
            return -1;
        }
    }
    // Tag verified: release the plaintext.
    *mlen = clen - klen;
    for (i = 0; i < *mlen; ++i)
    {
        m[i] = M[i];
    }
    return 0;
}
Thanks for the help in advance, I am really clueless right now.
However, in the loop, I was tasked to add a switch case so that it
will switch between encryption and decryption. But I have no idea what
kind of condition to use to differentiate the encryption and
decryption.
According to your comments, the calls for encryption and decryption are happening inside of CDC_Device_ReceiveByte and CDC_Device_SendByte, which means you need to create a state machine for sending and receiving of the bytes. The condition that you would use for this is the return value of CDC_Device_BytesReceived.
You can create an enum for the states, and a simple struct for holding the current state along with any other pertinent information. You can create a function for the state machine that maps out what to do given the current state. Your while(1) loop will simply call the function to ensure the state machine moves along. You might implement that like this:
/* States of the echo pipeline: wait for data, read a byte, send it back. */
typedef enum{
    IDLE,
    DECRYPTING,
    ENCRYPTING,
}state_t;

/* Holds the machine's current state (extend with more fields as needed). */
typedef struct{
    state_t current_state;
}fsm_t;

fsm_t my_fsm = {0}; /* zero-initialized, i.e. starts out IDLE */

/* Advance the state machine by one step. Call this repeatedly from the main
 * loop; each call performs at most one CDC operation and picks the next
 * state. Relies on the externally defined Bytes, ch and the LUFA virtual
 * serial interface. */
void myFSM(void){
    state_t now = my_fsm.current_state;
    if(now == IDLE){
        /* Check if data received */
        Bytes = CDC_Device_BytesReceived(&VirtualSerial_CDC_Interface);
        if(Bytes) my_fsm.current_state = DECRYPTING; /* we have data, decrypt it */
    }
    else if(now == DECRYPTING){
        /* Pull one byte from the host */
        ch = CDC_Device_ReceiveByte(&VirtualSerial_CDC_Interface);
        my_fsm.current_state = ENCRYPTING; /* encrypt byte we will send back */
    }
    else if(now == ENCRYPTING){
        /* Send data back to the host */
        CDC_Device_SendByte(&VirtualSerial_CDC_Interface, ch);
        --Bytes;
        my_fsm.current_state = Bytes ? DECRYPTING /* more bytes pending */
                                     : IDLE;      /* buffer drained */
    }
    else{
        asm("nop"); /* unknown state — should never happen */
    }
}
Now your loop is just
/* Main loop: just keep stepping the state machine. */
while(1){
myFSM();
}

Radix Sort Optimization

I was trying to optimize the Radix Sort code, because I felt there was room for it as traditional codes in books and on web seem a direct copy of one another and also they work very slow as they take an arbitrary number such as 10 for modulo operation. I have optimized the code as far as I could go, maybe I might have missed some optimization techniques. In that case please enlighten me.
Motivation for optimization:
http://codercorner.com/RadixSortRevisited.htm
http://stereopsis.com/radix.html
I was unable to implement all the optimizations in the articles, mostly it was beyond my skills and understanding and lack of sufficient time, if you can feel free to implement them.
EDIT 4
This Java version of Radix Sort calculates all histograms in 1 read and does not need to fill array Z with zeros after every LSB sort along with the usual ability to skip sorting and jump to next LSB sorting if all previous LSB's are same. As usual this is only for 32-bit integers but a 64-bit version can be created from it.
// Descending-order LSD radix sort of 32-bit ints using four 8-bit passes.
// All four 256-bucket histograms (stacked into one Z[1024]) are built in a
// single read of A. Each pass is skipped entirely when every element falls
// into the bucket sampled from A[0] (the Jump* short-circuit).
// NOTE(review): the sorted data is handed back via the RETURN value; depending
// on how many passes ran it may be the original array object or tmp.
// NOTE(review): (A[i] >> 24) is not masked with 255, so a negative element
// yields an index in 640..767, colliding with the 3rd histogram — this
// version assumes non-negative inputs (negative support came in a later edit).
protected static int[] DSC(int A[])// Sorts in descending order
{
int tmp[] = new int[A.length] ;
int Z[] = new int[1024] ;
int i, Jump, Jump2, Jump3, Jump4, swap[] ;
// Sample A[0]'s bucket for each byte; used below to skip uniform passes.
Jump = A[0] & 255 ;
Z[Jump] = 1 ;
Jump2 = ((A[0] >> 8) & 255) + 256 ;
Z[Jump2] = 1 ;
Jump3 = ((A[0] >> 16) & 255) + 512 ;
Z[Jump3] = 1 ;
Jump4 = (A[0] >> 24) + 768 ;
Z[Jump4] = 1 ;
// Histograms creation
for (i = 1 ; i < A.length; ++i)
{
++Z[A[i] & 255] ;
++Z[((A[i] >> 8) & 255) + 256] ;
++Z[((A[i] >> 16) & 255) + 512] ;
++Z[(A[i] >> 24) + 768] ;
}
// 1st LSB Byte Sort
// (skipped when all elements share this byte: counting pass changes nothing)
if( Z[Jump] != A.length )
{
// Convert counts to descending start offsets (reverse prefix sums).
Z[0] = A.length - Z[0];
for (i = 1; i < 256; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
// Stable scatter into tmp, then ping-pong the buffers.
for (i = 0; i < A.length; ++i)
{
tmp[Z[A[i] & 255]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 2nd LSB Byte Sort
if( Z[Jump2] != A.length )
{
Z[256] = A.length - Z[256];
for (i = 257; i < 512; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[((A[i] >> 8) & 255) + 256]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 3rd LSB Byte Sort
if( Z[Jump3] != A.length )
{
Z[512] = A.length - Z[512];
for (i = 513; i < 768; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[((A[i] >> 16) & 255) + 512]++] = A[i];
}
swap = A ; A = tmp ; tmp = swap ;
}
// 4th LSB Byte Sort
if( Z[Jump4] != A.length )
{
Z[768] = A.length - Z[768];
for (i = 769; i < Z.length; ++i)
{
Z[i] = Z[i - 1] - Z[i];
}
for (i = 0; i < A.length; ++i)
{
tmp[Z[(A[i] >> 24) + 768]++] = A[i];
}
return tmp ;
}
return A ;
}
The Java version ran faster with != sign than == sign
if( Z[Jump] != A.length )
{
// lines of code
}...
but in C the version below was, on average, 25% faster (with the equal-to sign) than its counterpart with the != sign. Your hardware might react differently.
if( Z[Jump] == A.length );
else
{
// lines of code
}...
Below is the C code ( "long" on my machine is 32 bits )
// Ascending LSD radix sort of N longs using four 8-bit passes, ping-ponging
// between A and the caller-supplied scratch buffer Temp. The sorted data is
// returned (it may live in either buffer, depending on how many passes ran).
// Only NON-NEGATIVE values that fit in 32 bits are ordered correctly; the
// masks below merely keep other inputs from indexing out of bounds.
long* Radix_2_ac_long(long *A, size_t N, long *Temp)// Sorts in ascending order
{
    size_t Z[1024] = {0};
    long *swp;
    size_t i, Jump, Jump2, Jump3, Jump4;
    // Short-circuit set-up: record A[0]'s bucket for every byte. A pass whose
    // histogram puts all N elements in that one bucket is skipped.
    Jump = *A & 255;
    Z[Jump] = 1;
    Jump2 = ((*A >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((*A >> 16) & 255) + 512;
    Z[Jump3] = 1;
    // BUGFIX: mask the top byte with 255. Without it, a negative *A (or a
    // 64-bit long holding a value above 2^31-1) produced an index outside
    // Z[768..1023] — an out-of-bounds write, i.e. undefined behavior.
    Jump4 = ((*A >> 24) & 255) + 768;
    Z[Jump4] = 1;
    // Build all four histograms in a single read of the input.
    for(i = 1 ; i < N ; ++i)
    {
        ++Z[*(A+i) & 255];
        ++Z[((*(A+i) >> 8) & 255) + 256];
        ++Z[((*(A+i) >> 16) & 255) + 512];
        ++Z[((*(A+i) >> 24) & 255) + 768]; // BUGFIX: masked, see above
    }
    // The "if(cond); else" shape is kept deliberately: the author measured it
    // faster than "if(!cond)" on their hardware.
    // Each pass: inclusive prefix sums, then a stable backwards scatter via
    // "--Z[...]", then buffer swap. The loop "i = N-1; i < N; --i" relies on
    // size_t wraparound to terminate after i reaches 0.
    // 1st LSB byte sort
    if( Z[Jump] == N );
    else
    {
        for( i = 1 ; i < 256 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[*(A+i) & 255] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }
    // 2nd LSB byte sort
    if( Z[Jump2] == N );
    else
    {
        for( i = 257 ; i < 512 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 8) & 255) + 256] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }
    // 3rd LSB byte sort
    if( Z[Jump3] == N );
    else
    {
        for( i = 513 ; i < 768 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 16) & 255) + 512] + Temp) = *(A+i);
        }
        swp = A;
        A = Temp;
        Temp = swp;
    }
    // 4th LSB byte sort
    if( Z[Jump4] == N );
    else
    {
        for( i = 769 ; i < 1024 ; ++i )
        {
            Z[i] = Z[i-1] + Z[i];
        }
        for( i = N-1 ; i < N ; --i )
        {
            *(--Z[((*(A+i) >> 24) & 255) + 768] + Temp) = *(A+i); // BUGFIX: masked
        }
        return Temp;
    }
    return A;
}
EDIT 5
The sort now handles negative numbers too. Only some minor/negligible tweaks to the code did it. It runs a little slower as a result but the effect is not significant. Coded in C, below ( "long" on my system is 32 bits )
// Ascending LSD radix sort of N 32-bit longs, four 8-bit passes, with a
// correction pass for negative (two's-complement) values on the top byte.
// Ping-pongs between A and the caller-supplied Temp; the sorted data is
// handed back via the return value (it may be either buffer).
long* Radix_Sort(long *A, size_t N, long *Temp)
{
    size_t Z[1024] = {0};   /* four stacked 256-entry histograms */
    long *hold;
    size_t Jump, Jump2, Jump3, Jump4;
    long i;
    /* Record which bucket A[0] lands in for each byte. When a whole pass
       finds all N elements in that one bucket, the pass is skipped. */
    Jump = A[0] & 255;
    Z[Jump] = 1;
    Jump2 = ((A[0] >> 8) & 255) + 256;
    Z[Jump2] = 1;
    Jump3 = ((A[0] >> 16) & 255) + 512;
    Z[Jump3] = 1;
    Jump4 = ((A[0] >> 24) & 255) + 768;
    Z[Jump4] = 1;
    /* Build all four histograms in one sweep over the input. */
    for (i = 1; i < N; ++i)
    {
        ++Z[A[i] & 255];
        ++Z[((A[i] >> 8) & 255) + 256];
        ++Z[((A[i] >> 16) & 255) + 512];
        ++Z[((A[i] >> 24) & 255) + 768];
    }
    /* Pass 1: least significant byte. Inclusive prefix sums give bucket end
       positions; the backwards scatter with --Z[...] keeps the sort stable. */
    if (Z[Jump] != N)
    {
        for (i = 1; i < 256; ++i)
        {
            Z[i] += Z[i - 1];
        }
        for (i = N - 1; i >= 0; --i)
        {
            Temp[--Z[A[i] & 255]] = A[i];
        }
        hold = A;
        A = Temp;
        Temp = hold;
    }
    /* Pass 2: second byte. */
    if (Z[Jump2] != N)
    {
        for (i = 257; i < 512; ++i)
        {
            Z[i] += Z[i - 1];
        }
        for (i = N - 1; i >= 0; --i)
        {
            Temp[--Z[((A[i] >> 8) & 255) + 256]] = A[i];
        }
        hold = A;
        A = Temp;
        Temp = hold;
    }
    /* Pass 3: third byte. */
    if (Z[Jump3] != N)
    {
        for (i = 513; i < 768; ++i)
        {
            Z[i] += Z[i - 1];
        }
        for (i = N - 1; i >= 0; --i)
        {
            Temp[--Z[((A[i] >> 16) & 255) + 512]] = A[i];
        }
        hold = A;
        A = Temp;
        Temp = hold;
    }
    /* Pass 4: top byte, with the negative-number fixup. Negative values have
       top byte 128..255 (buckets 896..1023) and must precede the
       non-negative ones, so their counts are accumulated first, bucket 768
       is chained after bucket 1023, and the remaining non-negative buckets
       follow. */
    if (Z[Jump4] != N)
    {
        for (i = 897; i < 1024; ++i)
        {
            Z[i] += Z[i - 1];
        }
        Z[768] += Z[1023];
        for (i = 769; i < 896; ++i)
        {
            Z[i] += Z[i - 1];
        }
        for (i = N - 1; i >= 0; --i)
        {
            Temp[--Z[((A[i] >> 24) & 255) + 768]] = A[i];
        }
        return Temp;
    }
    return A;
}
EDIT 6
Below is the pointer optimized version ( accesses array locations via pointers ) that takes on average, approximately 20% less time to sort than the one above. It also uses 4 separate arrays for faster address calculation ( "long" on my system is 32 bits ).
// Pointer-optimized variant of the ascending radix sort with negative-number
// handling. Four separate 256-entry histograms (Z1..Z4) shorten address
// arithmetic; `i` is a long* that is reused both to walk the data and to walk
// the histogram arrays. The sorted data comes back via the return value.
// NOTE(review): the author states "long" is 32 bits on their system; on an
// LP64 platform long is 64 bits and values outside 32 bits would be
// mis-sorted (the & 255 masks keep the indexing in bounds either way).
long* Radix_Sort(long *A, size_t N, long *Temp)
{
long Z1[256] ;
long Z2[256] ;
long Z3[256] ;
long Z4[256] ;
long T = 0 ;
// Manual zero-fill of all four histograms in one loop.
while(T != 256)
{
*(Z1+T) = 0 ;
*(Z2+T) = 0 ;
*(Z3+T) = 0 ;
*(Z4+T) = 0 ;
++T;
}
size_t Jump, Jump2, Jump3, Jump4;
// Sort-circuit set-up
// (sample A[0]'s bucket per byte; a pass whose histogram holds all N
// elements in that bucket is skipped)
Jump = *A & 255 ;
Z1[Jump] = 1;
Jump2 = (*A >> 8) & 255 ;
Z2[Jump2] = 1;
Jump3 = (*A >> 16) & 255 ;
Z3[Jump3] = 1;
Jump4 = (*A >> 24) & 255 ;
Z4[Jump4] = 1;
// Histograms creation
long *swp = A + N;
long *i = A + 1;
for( ; i != swp ; ++i)
{
++Z1[*i & 255];
++Z2[(*i >> 8) & 255];
++Z3[(*i >> 16) & 255];
++Z4[(*i >> 24) & 255];
}
// 1st LSB byte sort
// (inclusive prefix sums, then a stable backwards scatter via --Z1[...],
// then swap the data/scratch pointers)
if( Z1[Jump] == N );
else
{
swp = Z1+256 ;
for( i = Z1+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z1[*i & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 2nd LSB byte sort
if( Z2[Jump2] == N );
else
{
swp = Z2+256 ;
for( i = Z2+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z2[(*i >> 8) & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 3rd LSB byte sort
if( Z3[Jump3] == N );
else
{
swp = Z3 + 256 ;
for( i = Z3+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A-1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z3[(*i >> 16) & 255] + Temp) = *i;
}
swp = A;
A = Temp;
Temp = swp;
}
// 4th LSB byte sort and negative numbers sort
// (negative values occupy buckets 128..255; their counts are accumulated
// first, bucket 0 is chained after bucket 255 so non-negatives follow all
// negatives, then buckets 1..127 are accumulated)
if( Z4[Jump4] == N );
else
{
swp = Z4 + 256 ;
for( i = Z4+129 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
*Z4 = *Z4 + *(Z4+255) ;
swp = Z4 + 128 ;
for( i = Z4+1 ; i != swp ; ++i )
{
*i = *(i-1) + *i;
}
swp = A - 1;
for( i = A+N-1 ; i != swp ; --i )
{
*(--Z4[(*i >> 24) & 255] + Temp) = *i;
}
return Temp;
}
return A;
}
The edit 4 version is good enough if the original and temp arrays fit in cache. If the array size is much greater than cache size, most of the overhead is due to the random order writes to the arrays. A hybrid msb/lsb radix sort can avoid this issue. For example split the array into 256 bins according to the most significant byte, then do a lsb radix sort on each of the 256 bins. The idea here is that a pair (original and temp) of bins will fit within the cache, where random order writes are not an issue (for most cache implementations).
For a 8MB cache, the goal is for each of the bins to be < 4MB in size = 1 million 32 bit integers if the integers evenly distribute into the bins. This strategy would work for array size up to 256 million 32 bit integers. For larger arrays, the msb phase could split up the array into 1024 bins, for up to 1 billion 32 bit integers. On my system, sorting 16,777,216 (2^24) 32 bit integers with a classic 8,8,8,8 lsb radix sort took 0.45 seconds, while the hybrid 8 msb : 8,8,8 lsb took 0.24 seconds.
// split array into 256 bins according to most significant byte
// Hybrid MSB/LSB radix sort of `count` 32-bit values, in place in `a`.
// Phase 1: stable scatter into 256 bins by most significant byte (into the
// scratch buffer b). Phase 2: finish each bin with a 3-byte LSD radix sort
// (RadixSort3), which deposits each bin's sorted data back into `a`.
// NOTE(review): RadixSort3 is defined after this function in the original
// listing; a forward declaration is required for this to compile as-is.
void RadixSort(uint32_t * a, size_t count)
{
size_t aIndex[260] = {0}; // count / array
uint32_t * b = new uint32_t [count]; // allocate temp array
size_t i;
for(i = 0; i < count; i++) // generate histogram
aIndex[1+((size_t)(a[i] >> 24))]++;
for(i = 2; i < 257; i++) // convert to indices
aIndex[i] += aIndex[i-1];
for(i = 0; i < count; i++) // sort by msb
b[aIndex[a[i]>>24]++] = a[i];
for(i = 256; i; i--) // restore aIndex
aIndex[i] = aIndex[i-1];
aIndex[0] = 0;
for(i = 0; i < 256; i++) // radix sort the 256 bins
RadixSort3(&b[aIndex[i]], &a[aIndex[i]], aIndex[i+1]-aIndex[i]);
delete[] b;
}
// sort a bin by 3 least significant bytes
// Sorts one bin of `count` values by its 3 least significant bytes, using
// three stable LSD counting passes. `a` holds the input, `b` is scratch of
// the same size. After the three passes (an odd number of pointer swaps) the
// sorted data ends up in the buffer passed as `b` — which is why the caller
// above passes its scratch as `a` and the real array as `b`.
void RadixSort3(uint32_t * a, uint32_t *b, size_t count)
{
size_t mIndex[3][256] = {0}; // count / matrix
size_t i,j,m,n;
uint32_t u;
if(count == 0)
return;
for(i = 0; i < count; i++){ // generate histograms
u = a[i];
for(j = 0; j < 3; j++){
mIndex[j][(size_t)(u & 0xff)]++;
u >>= 8;
}
}
for(j = 0; j < 3; j++){ // convert to indices
// exclusive prefix sum: each bucket gets its starting offset
m = 0;
for(i = 0; i < 256; i++){
n = mIndex[j][i];
mIndex[j][i] = m;
m += n;
}
}
for(j = 0; j < 3; j++){ // radix sort
for(i = 0; i < count; i++){ // sort by current lsb
u = a[i];
m = (size_t)(u>>(j<<3))&0xff; // j<<3 = 8*j bit shift for pass j
b[mIndex[j][m]++] = u;
}
std::swap(a, b); // swap ptrs
}
}
Example code for classic lsb radix sorts:
Example C++ lsb radix sort using 8,8,8,8 bit fields:
typedef unsigned int uint32_t;
// Classic LSD radix sort of `count` 32-bit values using four 8-bit passes.
// All four histograms are tallied in one sweep, converted to exclusive
// prefix sums (bucket start offsets), then four stable scatter passes
// ping-pong between `a` and a heap-allocated scratch buffer. Four passes
// mean an even number of pointer swaps, so the sorted data finishes back
// in the caller's array.
void RadixSort(uint32_t * a, size_t count)
{
    size_t mIndex[4][256] = {0};         // per-pass bucket counts, then offsets
    uint32_t * b = new uint32_t [count]; // scratch buffer
    // Tally all four byte histograms in a single read of the input.
    for(size_t i = 0; i < count; i++){
        uint32_t v = a[i];
        mIndex[0][v & 0xff]++;
        mIndex[1][(v >> 8) & 0xff]++;
        mIndex[2][(v >> 16) & 0xff]++;
        mIndex[3][(v >> 24) & 0xff]++;
    }
    // Exclusive prefix sums turn counts into starting offsets.
    for(size_t j = 0; j < 4; j++){
        size_t total = 0;
        for(size_t i = 0; i < 256; i++){
            size_t cnt = mIndex[j][i];
            mIndex[j][i] = total;
            total += cnt;
        }
    }
    // Four stable passes, least significant byte first.
    for(size_t j = 0; j < 4; j++){
        for(size_t i = 0; i < count; i++){
            uint32_t v = a[i];
            size_t bucket = (size_t)(v >> (8 * j)) & 0xff;
            b[mIndex[j][bucket]++] = v;
        }
        std::swap(a, b); // ping-pong buffers
    }
    delete[] b;
}
Example C++ code using 16,16 bit fields:
typedef unsigned int uint32_t;
// LSD radix sort of `count` 32-bit values using two 16-bit passes.
// After two passes (an even number of pointer swaps) the sorted data is back
// in the caller's array, which is what the function returns.
// NOTE(review): mIndex is 2 * 65536 entries of size_t (~1 MB on 64-bit
// systems) allocated on the stack — confirm the target's stack budget.
uint32_t * RadixSort(uint32_t * a, size_t count)
{
size_t mIndex[2][65536] = {0}; // count / index matrix
uint32_t * b = new uint32_t [count]; // allocate temp array
size_t i,j,m,n;
uint32_t u;
for(i = 0; i < count; i++){ // generate histograms
u = a[i];
for(j = 0; j < 2; j++){
mIndex[j][(size_t)(u & 0xffff)]++;
u >>= 16;
}
}
for(j = 0; j < 2; j++){ // convert to indices
// exclusive prefix sum: bucket start offsets
m = 0;
for(i = 0; i < 65536; i++){
n = mIndex[j][i];
mIndex[j][i] = m;
m += n;
}
}
for(j = 0; j < 2; j++){ // radix sort
for(i = 0; i < count; i++){ // sort by current lsb
u = a[i];
m = (size_t)(u>>(j<<4))&0xffff; // j<<4 = 16*j bit shift for pass j
b[mIndex[j][m]++] = u;
}
std::swap(a, b); // swap ptrs
}
delete[] b;
return(a);
}
N & 15 , N & 31 , N & 63 .... and so on , which of these bitwise
operations takes least time?
They are the same. Do not take it badly, but optimizing for speed without knowing how long things take may end up quite badly. And even when you know the timing, hardware is very complicated nowadays and quite unpredictable. You program in Java, which is another layer of an insanely complex system. The same code may be faster today and slower tomorrow. You say approximately 2.232891909840167 times faster. In reality, you have a measurement on one hardware and software configuration with one set of data, and you can only hope the measurement is representative enough. Unfortunately, that is not always the case.
I rewrote your function. It is shorter and simpler, yet does not seem to be slower. Compilers tend to like code that is not too clever, as there are many optimizations for simple cases. The correction for negative numbers is not particularly nice; you can delete it if you do not like it. It seems to work best for 8 bits and 11 bits, probably due to cache sizes; have a look at the comments of rcgldr.
EDIT
#ytoamn you are right — if everything is in the first bucket, the loop should continue, not break. That was a bug. As for the other changes, I would rather avoid the contract you have now. I think there are three natural contracts for a sorting function. The first is sorting the original array and returning null. The second is sorting the original array and returning it. The third is returning a new sorted array and keeping the original array intact. I like the first one, as its behaviour is unambiguous. The way you have it now, you should add a big warning to the documentation that the original array is changed and is returned from the function in some cases and in others not. The second thing I would avoid is the old C code style. You should define the loop variable in the loop if you need it only there. Defining it globally injects a dependency that may lead to bugs. And it has no advantage here, as properly defined loop variables would share the space in the end anyway. The compiler is well aware of the scope; you should use the smallest scope you need.
EDIT2
Feel free to comment directly under my post :-) Local variables are just addresses on the stack. You allocate memory when constructing object which is not the case here. As for the array, think about this code:
// Demonstrates Java's pass-by-value of references: reassigning the parameter
// A inside Tst only changes the LOCAL variable, not the caller's array.
public static void Tst(int[] A) {
int[] tmp = new int[A.length];
A[0] = 6;
A = tmp; // changes what parameter A contains
A[0] = 7;
}
public static void main(String[] args) {
int[] A = new int[1];
A[0] = 5;
Tst(A);
System.out.println(A[0]); //prints 6
}
It prints 6. Number 7 is written into tmp array only. Array A in main is not affected.
// Ascending LSD radix sort of 32-bit ints with a configurable digit width.
// `bits` is the digit size in bits (e.g. 8 or 11); one counting pass runs per
// digit. Sorts A in place: if the final ping-pong left the data in tmp, it is
// copied back at the end. Passes whose digit is identical across all elements
// are skipped via the Z[0] check; the last pass applies a correction so that
// negative (two's-complement) values sort before non-negative ones.
protected static void ASC2(int A[], int bits) {
int[] origA = A;
int[] tmp = new int[A.length];
int[] Z = new int[1 << bits];
int mask = (1 << bits) - 1;
for (int shift = 0; shift < 32; shift += bits) {
if (shift > 0) {
Arrays.fill(Z, 0);
}
for (int i = 0; i < A.length; ++i) {
Z[(A[i] >> shift) & mask]++;
}
if (Z[0] == A.length) {
continue; // all in first bucket
}
// Reverse prefix sums give each bucket's START index for an
// ascending, stable forward scatter.
Z[Z.length - 1] = A.length - Z[Z.length - 1];
for (int i = Z.length - 2; i >= 0; --i) {
Z[i] = Z[i + 1] - Z[i];
}
if (shift + bits > 31) { // negative numbers correction
// Top digit contains the sign bit: shift the negative-value
// buckets (upper half) in front of the non-negative ones.
int halfLength = Z.length / 2;
int positSum = Z[halfLength];
int negSum = A.length - positSum;
if (negSum > 0) {
for (int i = 0; i < halfLength; ++i) {
Z[i] += negSum;
}
for (int i = halfLength; i < Z.length; ++i) {
Z[i] -= positSum;
}
}
}
for (int i = 0; i < A.length; ++i) {
tmp[Z[(A[i] >> shift) & mask]++] = A[i];
}
int[] swap = A;
A = tmp;
tmp = swap;
}
if (A != origA) {
System.arraycopy(A, 0, origA, 0, A.length);
}
}
EDIT3
Loop unroll is a valid technique, improving short circuiting is really nice. But with using array lengths as constants you definitely start to be too clever. If you hard coded the base size, why not hard code it all like this:
// Fully unrolled descending LSD radix sort: one hard-coded 8-bit pass per
// byte, reusing a single 256-entry histogram that is refilled (or selectively
// reset via Z[sample] = 0) between passes. A pass is skipped when all
// elements share that byte. The sorted data is handed back via the RETURN
// value — it may be the original array object or tmp.
// NOTE(review): like DSC above, no negative-number correction is applied on
// the 4th byte, so this assumes non-negative inputs.
protected static int[] DSC2(int A[])// sorts in descending order
{
int tmp[] = new int[A.length];
int Z[] = new int[256];
int sample, swap[];
// 1st LSB byte extraction
sample = A[0] & 255;
for (int i = 0; i < A.length; ++i) {
Z[A[i] & 255]++;
}
if (Z[sample] != A.length) {
// Reverse prefix sums give descending start offsets, then a stable
// forward scatter into tmp; Arrays.fill resets Z for the next pass.
Z[0] = A.length - Z[0];
for (int i = 1; i < Z.length; ++i) {
Z[i] = Z[i - 1] - Z[i];
}
for (int i = 0; i < A.length; ++i) {
tmp[Z[A[i] & 255]++] = A[i];
}
swap = A;
A = tmp;
tmp = swap;
Arrays.fill(Z, 0);
} else {
// Pass skipped: only the single used bucket needs clearing.
Z[sample] = 0;
}
// 2nd LSB byte extraction
sample = (A[0] >> 8) & 255;
for (int i = 0; i < A.length; ++i) {
Z[(A[i] >> 8) & 255]++;
}
if (Z[sample] != A.length) {
Z[0] = A.length - Z[0];
for (int i = 1; i < Z.length; ++i) {
Z[i] = Z[i - 1] - Z[i];
}
for (int i = 0; i < A.length; ++i) {
tmp[Z[(A[i] >> 8) & 255]++] = A[i];
}
swap = A;
A = tmp;
tmp = swap;
Arrays.fill(Z, 0);
} else {
Z[sample] = 0;
}
// 3rd LSB byte extraction
sample = (A[0] >> 16) & 255;
for (int i = 0; i < A.length; ++i) {
Z[(A[i] >> 16) & 255]++;
}
if (Z[sample] != A.length) {
Z[0] = A.length - Z[0];
for (int i = 1; i < Z.length; ++i) {
Z[i] = Z[i - 1] - Z[i];
}
for (int i = 0; i < A.length; ++i) {
tmp[Z[(A[i] >> 16) & 255]++] = A[i];
}
swap = A;
A = tmp;
tmp = swap;
Arrays.fill(Z, 0);
} else {
Z[sample] = 0;
}
// 4th LSB byte extraction
sample = (A[0] >> 24) & 255;
for (int i = 0; i < A.length; ++i) {
Z[(A[i] >> 24) & 255]++;
}
if (Z[sample] != A.length) {
Z[0] = A.length - Z[0];
for (int i = 1; i < Z.length; ++i) {
Z[i] = Z[i - 1] - Z[i];
}
for (int i = 0; i < A.length; ++i) {
tmp[Z[(A[i] >> 24) & 255]++] = A[i];
}
A = tmp;
}
return A;
}

Resources