Why is SSE4.2 cmpstr slower than regular code?

Why is SSE4.2 cmpstr slower than regular code? - c

I'm trying to validate a string that must only contain ASCII visible characters, white space and \t.
But it seems that ASCII table lookups are faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs.
I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type and only on the last one is the vectorized version faster.
My test code is here:
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#include <stdlib.h>
#define MIN(a,b) (((a)<(b))?(a):(b))
#define ALIGNED16 alignas(16)
/* MEASURE(msg, stmt): execute stmt once and print how long it took.
 * Time is sampled with gettimeofday() before and after stmt, converted to
 * microseconds, and the difference is printed in milliseconds next to msg.
 * The expansion is a braced block, so anything declared inside stmt is
 * local to that block. */
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
// VWSCHAR(c) is non-zero when byte c is accepted: tab, space, or a visible
// ASCII character. The cast to unsigned char keeps the index in 0..255 even
// where plain char is signed.
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
// YES/NO expand to one initializer plus its trailing comma, so they can be
// chained without separators; the *16/*128 forms fill whole rows.
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
// One entry per byte value, 16 entries per commented row.
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
/*
 * Find the first byte of data[0..len) that is not tab, space or a visible
 * ASCII character, using plain SSE compare/boolean instructions.
 *
 * Returns the index of the first illegal byte, or len if every byte is legal.
 *
 * Full 16-byte blocks are classified with vector compares; the block that
 * contains an illegal byte (and the final partial block) is then scanned
 * with the lookup table to obtain the exact index.
 */
size_t search_logic(const char* data, size_t len) {
    const __m128i ht = _mm_set1_epi8('\t');
    const __m128i del = _mm_set1_epi8(0x7f);
    const __m128i sp_m1 = _mm_set1_epi8(' ' - 1);
    size_t i = 0;
    while (len - i >= 16) {
        const __m128i c = _mm_loadu_si128((const __m128i *) (data + i));
        // A byte is legal when it is not DEL and is either a tab or >= ' '.
        // The comparison is signed, so bytes >= 0x80 are negative and are
        // rejected by the "> ' ' - 1" test as well.
        // The previous predicate, ((c != ht) && (c >= sp) && (c > td)) == 0,
        // could only ever fire for DEL and let control characters and
        // non-ASCII bytes through.
        const __m128i tab_or_printable =
            _mm_or_si128(_mm_cmpeq_epi8(c, ht), _mm_cmpgt_epi8(c, sp_m1));
        const __m128i legal =
            _mm_andnot_si128(_mm_cmpeq_epi8(c, del), tab_or_printable);
        if (!_mm_test_all_ones(legal))
            break;
        i += 16;
    }
    // Locate the exact offending byte, or check the last (up to 15) bytes
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
/*
 * Scalar reference implementation: return the index of the first byte of
 * data[0..len) for which VWSCHAR() is zero, or len if there is none.
 * The main loop is manually unrolled to test 16 bytes per iteration.
 */
size_t search_table(const char* data, size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
while (len - i >= 16) {
// Any miss leaves i at the start of the current 16-byte block; the loop
// below then re-scans that block to find the exact position.
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
/*
 * SSE4.2 explicit-length variant: return the index of the first byte of
 * data[0..len) outside the ranges in legal_ranges, or len if there is none.
 */
size_t search_sse4cmpstr(const char* data, size_t len)
{
// Two inclusive [low, high] byte pairs: tab..tab and space..'~'.
// Only the first 4 bytes are used (length 4 is passed below).
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
// With negative polarity and least-significant index the result is the
// offset of the first byte NOT in any range; the code treats 16 as
// "whole block legal".
unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// Advancing by the instruction's result makes the next load address
// depend on this comparison.
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
/*
 * SSE4.2 implicit-length variant: same contract as search_sse4cmpstr, but
 * uses _mm_cmpistri, which takes no length arguments.
 */
size_t search_sse4cmpstr_implicit(const char* data, size_t len)
{
// Two inclusive [low, high] byte pairs: tab..tab and space..'~'; the
// remaining 12 bytes of the array are zero-initialized.
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
// Offset of the first byte outside the ranges; the code treats 16 as
// "whole block legal".
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// Advancing by the instruction's result makes the next load address
// depend on this comparison.
i += consumed;
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
/*
 * Benchmark driver: fills ~1 GiB with random legal characters, plants one
 * illegal '\n' two bytes before the end, then times each search function
 * and reports any result that does not point at the planted byte.
 */
int main(void)
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    if (data == MAP_FAILED) {
        // mmap reports failure with MAP_FAILED, not NULL; writing through it would crash
        perror("mmap");
        return 1;
    }
    srand(0);
    for (size_t i = 0; i < len; ++i) {
        // 0..94 map to ' '..'~', 95 maps to tab: every byte is legal
        const char v = rand() % 96;
        data[i] = v == 95 ? '\t' : ' ' + v;
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found
    MEASURE("table lookup", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpestr ranges", {
        size_t i = search_sse4cmpstr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpistr ranges", {
        size_t i = search_sse4cmpstr_implicit(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("logic ranges", {
        size_t i = search_logic(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    munmap(data, len);
    return 0;
}
Compiled with gcc -O3 -march=native -pedantic -Wall -Wextra main2.cpp it gives me these results:
Setting up 1GB of data...
table lookup - 476.4820ms
cmpestr ranges - 519.3350ms
cmpistr ranges - 497.5770ms
logic ranges - 153.2650ms
I've also checked the assembly output and search_sse4cmpstr uses vpcmpestri while search_table is non-vectorized.
Am I using it wrong? Or why does this instruction exist at all?
EDIT:
As pointed out in the comments, cmpistr (the implicit-length instruction, which takes fewer parameters) is slightly faster than cmpestr and sometimes faster than the table lookup.
However, SSE2 bitwise and integer operations seem to be even faster.
EDIT2
Peter Cordes found the right answer.
I've added the revised program in a new answer, so please look at this one if you are interested in cmpstr.
DO NOT USE THE CODE ABOVE!

The code has an unnecessary dependency of i on the previous vector, bottlenecking on pcmpestri + L1d load-use latency of about 12 + 5 cycles. (https://agner.org/optimize/ and https://uops.info/) So yes, you are using it wrong, unfortunately.
If you wrote it similar to your scalar loop, doing i+=16 and just checking the pcmpestri result as a loop-exit condition, you'd bottleneck on its throughput of 1 vector per 4 clocks on your Sandybridge-family CPUs. (SnB and IvB specifically).
Or if your input can use pcmpistri, that's somewhat less bad and can go at 1 per 3 clocks on Sandybridge-family.
I didn't notice this problem at first because I wasn't expecting the loop to be written that way, and there was other clutter in the asm loop. :/ I spent a bunch of time profiling with perf to be sure it wasn't a front-end bottleneck from the microcoded (8 uop) instruction on my Skylake CPU. See the now-archived comments.
A throughput bottleneck would let you go at about 4 bytes / cycle, vs.
about 1 for the other way (2 loads per input byte, and Intel since SnB can do 2 loads per clock). So a factor of 4 speedup. Or a factor of 8 on Nehalem with 1/clock load throughput.
The latency bottleneck is just about 1 cycle per input byte, about the same as the table lookup, by coincidence.
Also, don't use len - i >= 16 as the loop condition; gcc actually calculates that inside the loop, costing extra uops. Use i < len-15 once you know that len>=16. (unsigned types make this tricky because they wrap at zero; what you want it to compile to is a cmp/jcc to skip the loop, then a do{}while asm loop structure. So the initial len>=16 really is separate from the normal loop condition.)
Other fun facts about pcmpestri:
How much faster are SSE4.2 string instructions than SSE2 for memcmp? (it's slower, especially with AVX2)
SSE42 & STTNI - PcmpEstrM is twice slower than PcmpIstrM, is it true? Yes, the explicit-length versions are slower than the implicit-length versions. Masking based on extra 2 length inputs is slower and costs more uops than scanning for a 0 byte in the existing inputs, apparently.
Performance doesn't depend on the value of the immediate. At one point I thought it did, but that was with i dependent on the result, so changing the immediate led to cache-line splits, making loop latency even worse. Re-testing with an i+=16 loop shows no effect.
If used with a REX.W prefix (to take inputs in RAX and RDX instead of EAX and EDX) it's much slower (according to https://uops.info/) for Intel, but there's no intrinsic for that so you don't have to worry about compilers doing that.
Or why does this instruction exist at all?
These instructions were introduced in Nehalem. Intel might have had plans to make them faster if they "caught on" and became widely used e.g. for short-string strcmp. But without fault-suppression (for unaligned loads that potentially cross into a new page) they're hard to use without checking stuff about a pointer. If you're going to do checks anyway, you might as well use an efficient pcmpeqb/pmovmskb which is fewer uops. And maybe find the first zero in either string with pminub/pcmpeqb/pmovmskb -> bsf. Maybe there's a use-case for SSE4.2 for the initial startup of a strcmp, but once you get going not so much.
And most of the world cares about UTF-8, not 8-bit character sets. And with UTF-16 not being fixed-width anymore (thanks to 32-bit Unicode), even wide-character stuff is harder to accelerate with these.
Using the ranges features basically requires hand-vectorization, which is a lot of work for something that only handles ASCII.
And as you found, for simple cases you can go even faster with pcmpgtb and boolean logic. With AVX2 you could process 32 bytes at once instead of 16, but there's no AVX2 version of vpcmpistri, only the AVX1 VEX encoding of the 16-byte instruction.

As Peter Cordes has pointed out, the problem was caused by an unnecessary dependency on the output of cmpstr.
This can be solved by simply restructuring this loop:
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
into that one:
if (len >= 16)
while (i <= len - 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
if (consumed < 16) {
return i + consumed;
}
i += 16;
}
The results for my i5-2410M compiled with gcc -pedantic -Wall -Wextra -O3 -march=native sse42cmpstr.c look now far better:
Setting up 1GB of data...
table - 484.5900ms
cmpestr - 231.9770ms
cmpistr - 121.3510ms
logic - 142.3700ms
Now cmpistr is clearly faster than both cmpestr and table search and surpasses even
the hand-crafted SSE2 logical comparisons on most CPUs I've tested.
The full test code is here:
#include <stdio.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#define ALIGNED16 __attribute__((aligned(16)))
/* MEASURE(msg, stmt): execute stmt once and print how long it took.
 * Time is sampled with gettimeofday() before and after stmt, converted to
 * microseconds, and the difference is printed in milliseconds next to msg.
 * The expansion is a braced block, so anything declared inside stmt is
 * local to that block. */
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
// VWSCHAR(c) is non-zero when byte c is accepted: tab, space, or a visible
// ASCII character. The cast to unsigned char keeps the index in 0..255 even
// where plain char is signed.
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
// YES/NO expand to one initializer plus its trailing comma, so they can be
// chained without separators; the *16/*128 forms fill whole rows.
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
// One entry per byte value, 16 entries per commented row.
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
// Search using the ASCII table above
/*
 * Returns the index of the first byte of data[0..len) that the table marks
 * as illegal, or len when every byte is legal.
 */
size_t search_table(const char* data, size_t len)
{
    size_t pos = 0;

    if (len >= 16) {
        const size_t last_block = len - 16;
        while (pos <= last_block) {
            const char *p = data + pos;
            // && short-circuits at the first illegal byte, exactly like a
            // chain of sixteen early exits; pos stays at the block start.
            if (!(VWSCHAR(p[0]) && VWSCHAR(p[1]) && VWSCHAR(p[2]) && VWSCHAR(p[3]) &&
                  VWSCHAR(p[4]) && VWSCHAR(p[5]) && VWSCHAR(p[6]) && VWSCHAR(p[7]) &&
                  VWSCHAR(p[8]) && VWSCHAR(p[9]) && VWSCHAR(p[10]) && VWSCHAR(p[11]) &&
                  VWSCHAR(p[12]) && VWSCHAR(p[13]) && VWSCHAR(p[14]) && VWSCHAR(p[15]))) {
                break;
            }
            pos += 16;
        }
    }

    // Pin down the exact byte inside a failing block, or check the tail.
    while (pos < len && VWSCHAR(data[pos])) {
        ++pos;
    }
    return pos;
}
// Search using SSE4.2 cmpestri (explicit length)
/*
 * Returns the index of the first byte of data[0..len) that lies outside the
 * legal ranges, or len when every byte is legal.
 */
size_t search_sse4cmpestr(const char* data, size_t len)
{
    // Inclusive [low, high] pairs: tab..tab and space..'~'. Only the first
    // four bytes are significant because a length of 4 is passed below.
    ALIGNED16 static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    enum {
        FIRST_ILLEGAL = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                        _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY
    };
    const __m128i ranges = _mm_loadu_si128((const __m128i*) legal_ranges);
    size_t pos = 0;

    if (len >= 16) {
        // The position advances by a fixed 16 per block, independent of the
        // comparison result.
        for (; pos <= len - 16; pos += 16) {
            const __m128i chunk = _mm_loadu_si128((const __m128i*) (data + pos));
            const unsigned hit = _mm_cmpestri(ranges, 4, chunk, 16, FIRST_ILLEGAL);
            if (hit < 16) {
                return pos + hit;
            }
        }
    }

    // Remaining bytes (fewer than 16) go through the lookup table.
    while (pos < len && VWSCHAR(data[pos])) {
        ++pos;
    }
    return pos;
}
// Search using SSE4.2 cmpistri (implicit length)
/*
 * Returns the index of the first byte of data[0..len) that lies outside the
 * legal ranges, or len when every byte is legal.
 */
size_t search_sse4cmpistr(const char* data, size_t len)
{
    // Inclusive [low, high] pairs: tab..tab and space..'~'; the rest of the
    // array is zero-initialized.
    // NOTE(review): the implicit-length form presumably stops at the first
    // zero byte of each operand - confirm against the Intel documentation.
    ALIGNED16 static const char legal_ranges[16] = {
        '\t', '\t',
        ' ', '~',
    };
    enum {
        FIRST_ILLEGAL = _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
                        _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY
    };
    const __m128i ranges = _mm_loadu_si128((const __m128i*) legal_ranges);
    size_t pos = 0;

    if (len >= 16) {
        // The position advances by a fixed 16 per block, independent of the
        // comparison result.
        for (; pos <= len - 16; pos += 16) {
            const __m128i chunk = _mm_loadu_si128((const __m128i*)(data + pos));
            const unsigned hit = _mm_cmpistri(ranges, chunk, FIRST_ILLEGAL);
            if (hit < 16) {
                return pos + hit;
            }
        }
    }

    // Remaining bytes (fewer than 16) go through the lookup table.
    while (pos < len && VWSCHAR(data[pos])) {
        ++pos;
    }
    return pos;
}
// Search using SSE2 logic instructions
/*
 * Find the first byte of data[0..len) that is not tab, space or a visible
 * ASCII character, using plain SSE compare/boolean instructions.
 *
 * Returns the index of the first illegal byte, or len if every byte is legal.
 *
 * Full 16-byte blocks are classified with vector compares; the block that
 * contains an illegal byte (and the final partial block) is then scanned
 * with the lookup table to obtain the exact index.
 */
size_t search_logic(const char* data, size_t len) {
    const __m128i ht = _mm_set1_epi8('\t');
    const __m128i del = _mm_set1_epi8(0x7f);
    const __m128i sp_m1 = _mm_set1_epi8(' ' - 1);
    size_t i = 0;
    if (len >= 16) {
        while (len - 16 >= i) {
            const __m128i c = _mm_loadu_si128((const __m128i *) (data + i));
            // A byte is legal when it is not DEL and is either a tab or
            // >= ' '. The comparison is signed, so bytes >= 0x80 are
            // negative and fail the "> ' ' - 1" test too.
            // The previous predicate,
            // ((c != ht) && (c >= sp) && (c > td)) == 0, could only ever
            // fire for DEL and let control characters and non-ASCII bytes
            // through.
            const __m128i tab_or_printable =
                _mm_or_si128(_mm_cmpeq_epi8(c, ht), _mm_cmpgt_epi8(c, sp_m1));
            const __m128i legal =
                _mm_andnot_si128(_mm_cmpeq_epi8(c, del), tab_or_printable);
            if (!_mm_test_all_ones(legal))
                break;
            i += 16;
        }
    }
    // Locate the exact offending byte, or check the last bytes
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
/*
 * Benchmark driver: fills ~1 GiB with a repeating pattern of legal
 * characters, plants one illegal '\n' two bytes before the end, then times
 * each search function and reports any result that does not point at the
 * planted byte.
 */
int main(void)
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    if (data == MAP_FAILED) {
        // mmap reports failure with MAP_FAILED, not NULL; writing through it would crash
        perror("mmap");
        return 1;
    }
    for (size_t i = 0; i < len; ++i) {
        // 0..94 map to ' '..'~', 95 maps to tab: every byte is legal
        const char v = i % 96;
        data[i] = v == 95 ? '\t' : ' ' + v;
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found
    // size_t must be printed with %zu; %u is undefined where size_t is wider than unsigned
    MEASURE("table", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpestr", {
        size_t i = search_sse4cmpestr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpistr", {
        size_t i = search_sse4cmpistr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("logic", {
        size_t i = search_logic(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    munmap(data, len);
    return 0;
}

Related

Does this code use the Fast Doubling method for Fibonacci number calculation?

I need to understand how the mpz_fib_ui function in fib_ui.c in the GMP repository works.
in their documentation, they state that :
Beyond the table, values are generated with a binary powering
algorithm, calculating a pair F[n] and F[n-1] working from high to low
across the bits of n. The formulas used are
F[2k+1] = 4F[k]^2 - F[k-1]^2 + 2(-1)^k
F[2k-1] = F[k]^2 + F[k-1]^2
F[2k] = F[2k+1] - F[2k-1]
What they mean by " binary powering algorithm"
Is this the same as the "Fast Doubling method"
Do they change the number to binary? and why?
#include <stdio.h>
#include "gmp-impl.h"
#include "longlong.h"
/* change to "#define TRACE(x) x" to get some traces */
#define TRACE(x)
/* In the F[2k+1] below for k odd, the -2 won't give a borrow from the low
limb because the result F[2k+1] is an F[4m+3] and such numbers are always
== 1, 2 or 5 mod 8, whereas an underflow would leave 6 or 7. (This is
the same as in mpn_fib2_ui.)
In the F[2k+1] for k even, the +2 won't give a carry out of the low limb
in normal circumstances. This is an F[4m+1] and we claim that F[3*2^b+1]
== 1 mod 2^b is the first F[4m+1] congruent to 0 or 1 mod 2^b, and hence
if n < 2^GMP_NUMB_BITS then F[n] cannot have a low limb of 0 or 1. No
proof for this claim, but it's been verified up to b==32 and has such a
nice pattern it must be true :-). Of interest is that F[3*2^b] == 0 mod
2^(b+1) seems to hold too.
When n >= 2^GMP_NUMB_BITS, which can arise in a nails build, then the low
limb of F[4m+1] can certainly be 1, and an mpn_add_1 must be used. */
/* Set fn to the Fibonacci number F[n].

   Values up to FIB_TABLE_LIMIT are read from FIB_TABLE.  Beyond that,
   mpn_fib2_ui is called for k = n/2 and one final step combines its two
   results (xp, yp) into F[n], using the odd or even formula quoted in the
   branch comments below depending on the low bit of n.
   NOTE(review): the formulas imply xp holds F[k] and yp holds F[k-1] -
   confirm against mpn_fib2_ui.  */
void
mpz_fib_ui (mpz_ptr fn, unsigned long n)
{
  mp_ptr fp, xp, yp;
  mp_size_t size, xalloc;
  unsigned long n2;
  mp_limb_t c;
  TMP_DECL;

  if (n <= FIB_TABLE_LIMIT)
    {
      MPZ_NEWALLOC (fn, 1)[0] = FIB_TABLE (n);
      SIZ(fn) = (n != 0); /* F[0]==0, others are !=0 */
      return;
    }

  n2 = n/2;
  xalloc = MPN_FIB2_SIZE (n2) + 1;
  /* the result is a product of two operands of at most xalloc limbs each */
  fp = MPZ_NEWALLOC (fn, 2 * xalloc);

  TMP_MARK;
  TMP_ALLOC_LIMBS_2 (xp,xalloc, yp,xalloc);
  size = mpn_fib2_ui (xp, yp, n2);

  TRACE (printf ("mpz_fib_ui last step n=%lu size=%ld bit=%lu\n",
                 n >> 1, size, n&1);
         mpn_trace ("xp", xp, size);
         mpn_trace ("yp", yp, size));

  if (n & 1)
    {
      /* F[2k+1] = (2F[k]+F[k-1])*(2F[k]-F[k-1]) + 2*(-1)^k */
      mp_size_t xsize, ysize;

#if HAVE_NATIVE_mpn_add_n_sub_n
      xp[size] = mpn_lshift (xp, xp, size, 1);
      yp[size] = 0;
      ASSERT_NOCARRY (mpn_add_n_sub_n (xp, yp, xp, yp, size+1));
      xsize = size + (xp[size] != 0);
      ASSERT (yp[size] <= 1);
      ysize = size + yp[size];
#else
      mp_limb_t c2;

      /* fp is used as scratch for the doubled value before the multiply */
      c2 = mpn_lshift (fp, xp, size, 1);
      c = c2 + mpn_add_n (xp, fp, yp, size);
      xp[size] = c;
      xsize = size + (c != 0);
      c2 -= mpn_sub_n (yp, fp, yp, size);
      yp[size] = c2;
      ASSERT (c2 <= 1);
      ysize = size + c2;
#endif

      size = xsize + ysize;
      c = mpn_mul (fp, xp, xsize, yp, ysize);

#if GMP_NUMB_BITS >= BITS_PER_ULONG
      /* no overflow, see comments above */
      ASSERT (n & 2 ? fp[0] >= 2 : fp[0] <= GMP_NUMB_MAX-2);
      fp[0] += (n & 2 ? -CNST_LIMB(2) : CNST_LIMB(2));
#else
      if (n & 2)
        {
          ASSERT (fp[0] >= 2);
          fp[0] -= 2;
        }
      else
        {
          ASSERT (c != GMP_NUMB_MAX); /* because it's the high of a mul */
          c += mpn_add_1 (fp, fp, size-1, CNST_LIMB(2));
          fp[size-1] = c;
        }
#endif
    }
  else
    {
      /* F[2k] = F[k]*(F[k]+2F[k-1]) */
      mp_size_t xsize, ysize;

#if HAVE_NATIVE_mpn_addlsh1_n
      c = mpn_addlsh1_n (yp, xp, yp, size);
#else
      c = mpn_lshift (yp, yp, size, 1);
      c += mpn_add_n (yp, yp, xp, size);
#endif
      yp[size] = c;
      xsize = size;
      ysize = size + (c != 0);
      size += ysize;
      c = mpn_mul (fp, yp, ysize, xp, xsize);
    }

  /* one or two high zeros */
  size -= (c == 0);
  size -= (fp[size-1] == 0);
  SIZ(fn) = size;

  TRACE (printf ("done special, size=%ld\n", size);
         mpn_trace ("fp ", fp, size));

  TMP_FREE;
}

How to speed up printf in C

I have a task to print all the prime numbers between 1 and 1000000 in class and the fastest 10 programs get extra marks. The main problem is the time it takes for the prime numbers to be printed to the console.
Basically using the Sieve of Eratosthenes I produce an array with only boolean values in it. The boolean value Numbers[i] is true if i+2 is a prime number.
for(i = 0; i <= n - 2; ++i)
if (Numbers[i]) // True if the number is prime
printf("%d\n", i+2);
Printf seems to be really slow, as the program can generate the list of primes in about 0.035 s but then takes a further 11 seconds to print the list. Is there any way I can speed this up? Thanks.

Beneath is a slightly unoptimized implementation (although I skipped the intermediate list and print directly) of what I think you were supposed to do. Running that program on an AMD A8-6600K with a small load (mainly a Youtube music-video for some personal entertainment) results in
real 0m1.211s
user 0m0.047s
sys 0m0.122s
averaged over a couple of runs. So the problem lies in your implementation of the sieve or you are hiding some essential facts about your hardware.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <limits.h>
#include <string.h>
/* I call it a general bitset. Others might call it an abomination. YMMV. */
/* Bit n lives in word n / ERAT_BITS of the uint32_t array s, at position
 * n % ERAT_BITS. The shifted constant is an unsigned 32-bit 1 so that bit 31
 * can be reached without shifting into the sign bit of an int, and every
 * macro argument is parenthesised so expressions can be passed safely. */
# define ERAT_BITS (sizeof(uint32_t)*CHAR_BIT)
# define GET_BIT(s,n) ((*((s)+((n)/ERAT_BITS)) & (UINT32_C(1)<<((n) % ERAT_BITS))) != 0)
# define SET_BIT(s,n) (*((s)+((n)/ERAT_BITS)) |= (UINT32_C(1)<<((n) % ERAT_BITS)))
# define CLEAR_BIT(s,n) (*((s)+((n)/ERAT_BITS)) &= ~(UINT32_C(1)<<((n) % ERAT_BITS)))
# define TOG_BIT(s,n) (*((s)+((n)/ERAT_BITS)) ^= (UINT32_C(1)<<((n) % ERAT_BITS)))
/* size is the size in bits, the overall size might be bigger */
typedef struct mp_bitset_t {
    uint32_t size;      /* number of usable bits */
    uint32_t *content;  /* heap storage, owned by the bitset */
} mp_bitset_t;
/* Allocate storage for n bits and record n as the size. The byte count
 * n / sizeof(uint32_t) + 1 is more than the n / CHAR_BIT bytes strictly
 * needed. Exits the program if the allocation fails.
 * bst and n are parenthesised in every macro below so that arguments such
 * as "&set" or "a + b" expand correctly. */
# define mp_bitset_alloc(bst, n) \
do {\
(bst)->content=malloc(( (n) /(sizeof(uint32_t)) + 1 ));\
if ((bst)->content == NULL) {\
fprintf(stderr, "memory allocation for bitset failed");\
exit(EXIT_FAILURE);\
}\
(bst)->size = (n);\
} while (0)
# define mp_bitset_size(bst) ((bst)->size)
/* Set / clear every bit by filling the same byte count that
 * mp_bitset_alloc requested (memset works on bytes, hence 0xFF). */
# define mp_bitset_setall(bst) memset((bst)->content,0xFF,\
((bst)->size /(sizeof(uint32_t) ) +1 ))
# define mp_bitset_clearall(bst) memset((bst)->content,0,\
((bst)->size /(sizeof(uint32_t) ) +1 ))
# define mp_bitset_clear(bst,n) CLEAR_BIT((bst)->content, (n))
# define mp_bitset_set(bst,n) SET_BIT((bst)->content, (n))
# define mp_bitset_get(bst,n) GET_BIT((bst)->content, (n))
/* Releases the storage AND the bitset object itself, so bst must have been
 * obtained from malloc. */
# define mp_bitset_free(bst) \
do {\
free((bst)->content);\
free(bst);\
} while (0)
uint32_t mp_bitset_nextset(mp_bitset_t * bst, uint32_t n);
uint32_t mp_bitset_prevset(mp_bitset_t * bst, uint32_t n);
void mp_eratosthenes(mp_bitset_t * bst);
/* It's called Hallek's method but it has many inventors*/
/* Integer square root: returns the largest r with r * r <= n.
 * Works bit by bit, trying each candidate bit of the result from the
 * highest power of four that fits in 32 bits downwards. */
static uint32_t isqrt(uint32_t n)
{
    uint32_t remainder = n;
    uint32_t result = 0;
    uint32_t bit;

    if (n == 0) {
        return 0;
    }
    for (bit = UINT32_C(1) << 30; bit != 0; bit >>= 2) {
        /* candidate to subtract if this bit belongs in the root */
        const uint32_t trial = bit | result;

        result >>= 1;
        if (remainder >= trial) {
            remainder -= trial;
            result |= bit;
        }
    }
    return result;
}
/* Returns the index of the first set bit at or after n, or the bitset size
 * if no such bit exists. */
uint32_t mp_bitset_nextset(mp_bitset_t *bst, uint32_t n)
{
    for (; n < mp_bitset_size(bst); n++) {
        if (mp_bitset_get(bst, n)) {
            break;
        }
    }
    return n;
}
/*
 * Sieve of Eratosthenes over the whole bitset: on return, bit i is set
 * exactly when i is prime (for i below the bitset size).
 * Even numbers are struck first; afterwards only odd multiples of each
 * prime up to the integer square root of the size are visited.
 */
void mp_eratosthenes(mp_bitset_t *bst)
{
    uint32_t limit, root, even, prime, multiple;

    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0);
    mp_bitset_clear(bst, 1);
    limit = mp_bitset_size(bst);
    root = isqrt(limit);

    for (even = 4; even < limit; even += 2) {
        mp_bitset_clear(bst, even);
    }

    for (prime = mp_bitset_nextset(bst, 1);
         prime < limit && prime <= root;
         prime = mp_bitset_nextset(bst, prime + 1)) {
        /* start at prime^2 and step by 2*prime */
        for (multiple = prime * prime; multiple < limit; multiple += prime * 2) {
            mp_bitset_clear(bst, multiple);
        }
    }
}
#define UPPER_LIMIT 1000000 /* one million */
/*
 * Prints every prime below UPPER_LIMIT, one per line, sieving and printing
 * in a single pass: each set bit found is a prime, is printed immediately,
 * and its odd multiples from its square upwards are then struck.
 */
int main(void) {
    mp_bitset_t *bst;
    uint32_t n, k;
    uint64_t j;
    bst = malloc(sizeof(mp_bitset_t));
    if(bst == NULL) {
        fprintf(stderr, "failed to allocate %zu bytes\n",sizeof(mp_bitset_t));
        exit(EXIT_FAILURE);
    }
    mp_bitset_alloc(bst, UPPER_LIMIT);
    mp_bitset_setall(bst);
    mp_bitset_clear(bst, 0); // 0 is not prime b.d.
    mp_bitset_clear(bst, 1); // 1 is not prime b.d.
    n = mp_bitset_size(bst);
    for (k = 4; k < n; k += 2) {
        mp_bitset_clear(bst, k);
    }
    k = 0;
    while ((k = mp_bitset_nextset(bst, k + 1)) < n) {
        printf("%" PRIu32 "\n", k);
        // The square is formed in 64 bits: as a uint32_t, k * k wraps for
        // k > 65535 and could land below n, striking unrelated numbers.
        // For such k the 64-bit square is >= n and the loop is skipped.
        for (j = (uint64_t)k * k; j < n; j += (uint64_t)k * 2) {
            mp_bitset_clear(bst, j);
        }
    }
    mp_bitset_free(bst);
    return EXIT_SUCCESS;
}
Compiled with
gcc-4.9 -O3 -g3 -W -Wall -Wextra -Wuninitialized -Wstrict-aliasing -pedantic -std=c11 tests.c -o tests
(GCC is gcc-4.9.real (Ubuntu 4.9.4-2ubuntu1~14.04.1) 4.9.4)

By default, console output is line buffered, which is the reason for the increased time.
You can use the setvbuf function to allow printing to console/stdout only in chunks rather than for each iteration.
E.g.
char buffer[256];
setvbuf(stdout, buffer, _IOFBF, sizeof(buffer));
You can alter the size of buffer according to your needs.
The _IOFBF option selects full buffering, i.e. output will be printed once the buffer is full.
See setvbuf for more details

How to execute faster than "snprintf(mystr, 22, "{%+0.4f,%+0.4f}", (double)3.14159265, (double) 2.718281828459);" on a 32 bit mcu

I've tried a few things, and it seems that at best I'm 1.5x slower than the printf() family of functions, which boggles my mind a bit. I think what I'm up against in this situation is that the addressing of my device is 32-bit, and I don't have an FPU. I've tried a couple of "ftoa()" implementations and constrained them to only look for 2 digits on the left of the decimal point, and left myself some breadcrumbs as to the total length of a larger overall string that I'm trying to build. At the end of the day, it seems like the nature of an array of 8-bit elements on a 32-bit system is leading to a bunch of hidden shift operations, bitwise "OR" and bitwise NAND operations that are just slowing things down ridiculously...
Anyone have any general tips for this situation? (other than a re-architect to an 8.24 fixed point design) I've tried the compiler optimizations from wysiwyg to execution speed focused, nothing seems to beat snprintf.
Here's the fastest one that I had tried:
#if (__DEBUG)
#define DATA_FIFO_SIZE (8)
#else
#define DATA_FIFO_SIZE (1024)
#endif
/* One FIFO slot: four int32_t values (rval), four doubles (cval) and a
 * 16-bit index, with explicit padding. */
typedef struct
{
int32_t rval[4];
double cval[4];
uint16_t idx;
uint16_t padding; //#attention the compiler was padding with 2 bytes to align to 32bit
} data_fifo_entry;
const char V_ERR_MSG[7] = "ERROR,\0";
static data_fifo_entry data_fifo[DATA_FIFO_SIZE];
static char embed_text[256];
/****
* float to ASCII, adapted from
* https://stackoverflow.com/questions/2302969/how-to-implement-char-ftoafloat-num-without-sprintf-library-function-i#7097567
*
****/
//#attention the following floating point #defs are linked!!
#define MAX_DIGITS_TO_PRINT_FLOAT (6)
#define MAX_SUPPORTED_PRINTABLE_FLOAT (+999999.99999999999999999999999999)
#define MIN_SUPPORTED_PRINTABLE_FLOAT (-999999.99999999999999999999999999)
#define FLOAT_TEST6 (100000.0)
#define FLOAT_TEST5 (10000.0)
#define FLOAT_TEST4 (1000.0)
#define FLOAT_TEST3 (100.0)
#define FLOAT_TEST2 (10.0)
#define FLOAT_TEST1 (1.0)
static inline int ftoa(char *s, const float f_in, const uint8_t precision)
{
float f_p = 0.0001;
float n = f_in;
int neg = (n < 0.0);
int length = 0;
switch (precision)
{
case (1):
{
f_p = 0.1;
break;
}
case (2):
{
f_p = 0.01;
break;
}
case (3):
{
f_p = 0.001;
break;
}
//case (4) is the default assumption
case (5):
{
f_p = 0.00001;
break;
}
case (6):
{
f_p = 0.000001;
break;
}
default: //already assumed, no assignments here
{
break;
}
} /* switch */
// handle special cases
if (isnan(n))
{
strcpy(s, "nan\0");
length = 4;
}
else if ((isinf(n)) || (n >= MAX_SUPPORTED_PRINTABLE_FLOAT) ||
((-1.0 * n) < MIN_SUPPORTED_PRINTABLE_FLOAT))
{
strcpy(s, "inf\0");
length = 4;
}
else if (n == 0.0)
{
int idx;
s[length++] = '+';
s[length++] = '0';
s[length++] = '.';
for (idx = 0; idx < precision; idx++)
{
s[length++] = '0';
}
s[length++] = '\0';
}
else if (((n > 0.0) && (n < f_p)) || ((n < 0.0) && ((-1.0 * n) < f_p)))
{
int idx;
if (n >= 0.0)
{
s[length++] = '+';
}
else
{
s[length++] = '-';
}
s[length++] = '0';
s[length++] = '.';
for (idx = 1; idx < precision; idx++)
{
s[length++] = '0';
}
s[length++] = '\0';
}
else
{
int digit, m;
if (neg)
{
n = -n;
}
// calculate magnitude
if (n >= FLOAT_TEST6)
{
m = 6;
}
else if (n >= FLOAT_TEST5)
{
m = 5;
}
else if (n >= FLOAT_TEST4)
{
m = 4;
}
else if (n >= FLOAT_TEST3)
{
m = 3;
}
else if (n >= FLOAT_TEST2)
{
m = 2;
}
else if (n >= FLOAT_TEST1)
{
m = 1;
}
else
{
m = 0;
}
if (neg)
{
s[length++] = '-';
}
else
{
s[length++] = '+';
}
// set up for scientific notation
if (m < 1.0)
{
m = 0;
}
// convert the number
while (n > f_p || m >= 0)
{
double weight = pow(10.0, m);
if ((weight > 0) && !isinf(weight))
{
digit = floor(n / weight);
n -= (digit * weight);
s[length++] = '0' + digit;
}
if ((m == 0) && (n > 0))
{
s[length++] = '.';
}
m--;
}
s[length++] = '\0';
}
return (length - 1);
} /* ftoa */
/*
 * Append "<cval[idx1]>,<cval[idx2]>,<idx>," for FIFO entry fifo_idx to
 * embed_text, starting right after the V_PREFIX_LENGTH characters that are
 * already in the buffer. Each value occupies a fixed 7 characters taken
 * from the ftoa() output. If any index is out of range, V_ERR_MSG is
 * written at that position instead.
 */
static inline void print2_and_idx(int8_t idx1, int8_t idx2, uint16_t fifo_idx)
{
    //#attention 10 characters already in the buffer, idx does NOT start at zero
    uint8_t idx = V_PREFIX_LENGTH;
    char scratch[16] = {'\0'};
    char * p_fifo_id;
    // fifo_idx is unsigned, so only its upper bound needs checking
    if ((idx1 >= 0) && (idx1 < MAX_IDX) && (idx2 >= 0) && (idx2 < MAX_IDX) &&
        (fifo_idx < DATA_FIFO_SIZE))
    {
        ftoa(scratch, data_fifo[fifo_idx].cval[idx1], 4);
        // Copy at the current position, THEN advance. Advancing inside the
        // index expression wrote 7 bytes too far and let the comma below
        // overwrite the first copied character.
        memcpy((void *)&embed_text[idx], (void *)scratch, 7);
        idx += 7;
        embed_text[idx++] = ',';
        ftoa(scratch, data_fifo[fifo_idx].cval[idx2], 4);
        memcpy((void *)&embed_text[idx], (void *)scratch, 7);
        idx += 7;
        embed_text[idx++] = ',';
        //!\todo maybe print the .idx as fixed width, zero pad to 5 digits
        p_fifo_id = utoa((char *)&embed_text[idx], (unsigned int)data_fifo[fifo_idx].idx, 10);
        idx += strlen(p_fifo_id);
        embed_text[idx++] = ',';
    }
    else
    {
        memcpy((void *)&embed_text[idx], (void *)V_ERR_MSG, 7);
    }
} /* print2_and_idx */

Instead of using *printf() with FP arguments, convert the FP values first into scaled integers.
With still calling snprintf(), yet with integer and simple character arguments, my code was about 20x faster than the baseline.
Your mileage may vary. YMMV.
//baseline
/* Formats both values with the floating-point conversions of snprintf:
 * sign always shown, four fractional digits, at most 21 characters. */
void format2double_1(char *mystr, double pi, double e) {
    static const char layout[] = "{%+0.4f,%+0.4f}";

    snprintf(mystr, 22, layout, pi, e);
    //puts(mystr);
}
void format2double_2(char *mystr, double pi, double e) {
int pi_i = (int) lrint(pi * 10000.0);
int api_i = abs(pi_i);
int e_i = (int) lrint(e * 10000.0);
int ae_i = abs(e_i);
snprintf(mystr, 22, "{%c%d.%04d,%c%d.%04d}", //
"+-"[pi_i < 0], api_i / 10000, api_i % 10000, //
"+-"[e_i < 0], ae_i / 10000, ae_i % 10000);
//puts(mystr);
}
[edit]
For a proper -0.0 text, use "+-"[!!signbit(pi)]
[edit]
Some idea for OP to consider as a ftoa() replacement. Central code is lrint(f_in * fscale[precision]); which rounds and scales. Untested.
#define PRINTABLE_MAGNITUDE_LIMIT 1000000
int ftoa_1(char *s, const float f_in, const uint8_t precision) {
int n;
sprintf(s, "%+.*f%n", precision, f_in, &n);
return n;
}
int ftoa_2(char *s, const float f_in, const uint8_t precision) {
float fscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
long iscale[] = { 1, 10, 100, 1000, 10000, 100000, 1000000 };
assert(precision > 0 && precision < sizeof fscale / sizeof fscale[0]);
// gross range check
if (f_in > -PRINTABLE_MAGNITUDE_LIMIT && f_in < PRINTABLE_MAGNITUDE_LIMIT) {
long value = lrint(f_in * fscale[precision]);
value = labs(value);
long scale = iscale[precision];
long ipart = value / scale;
long fpart = value % scale;
// fine range check
if (ipart < PRINTABLE_MAGNITUDE_LIMIT) {
int n;
sprintf(s, "%c%ld:%0*ld%n", signbit(f_in) ? '-' : '+', ipart, precision,
fpart, &n);
return n;
}
}
// Out of range values need not be of performance concern for now.
return ftoa_1(s, f_in, precision);
}
[edit]
To convert a positive or 0 integer to a string quickly without the need to shift the buffer or reverse it, see below. It also returns the string length for subsequent string building.
// Write the decimal representation of u into dest as a NUL-terminated
// string and return the number of digits written.
// dest must have room for every digit plus the terminator.
size_t utoa_length(char *dest, unsigned u) {
    // First pass: count the digits so each one can be stored in place.
    size_t digits = 1;
    for (unsigned rest = u; rest >= 10; rest /= 10) {
        digits++;
    }

    // Second pass: fill from the least significant digit backwards.
    dest[digits] = '\0';
    for (size_t pos = digits; pos-- > 0; ) {
        dest[pos] = '0' + u % 10;
        u /= 10;
    }
    return digits;
}

In a similar vein to @chux's answer, if the remaining snprintf is still slow, you can go down the rabbit hole of hand-composing strings and hand-rendering integers.
char *fmtp04f(char *buf, char *lim, double d) {
// if there's no space at all don't bother
if(buf==lim) return buf;
// 10 characters in maximum 32 bit integer, one for the dot,
// one for the terminating NUL in debug prints
char b[12];
// current position in the buffer
char *bp = b;
// scale and round
int32_t i = lrint(d * 10000.);
// write sign and fix i sign
// (we do have at least one character available in buf)
if(signbit(d)) {
*buf++='-';
i = -i;
} else {
*buf++='+';
}
// *always* write down the last 4 digits, even if they are zeroes
// (they'll become the 4 digits after the decimal dot)
for(; bp!=b+4; ) {
*bp++ = '0' + i%10;
i/=10;
}
*bp++='.';
// write down the remaining digits, writing at least one
do {
*bp++ = '0' + i%10;
i/=10;
} while(i != 0);
// bp is at the character after the last, step back
--bp;
// data is now into b *in reversed order*;
// reverse-copy it into the user-provided buffer
while(buf!=lim) {
*buf++ = *bp;
// check before decrementing, as a pointer to one-before-first
// is not allowed in C
if(bp == b) break;
--bp;
}
if(buf!=lim) *buf=0; // "regular" case: terminate *after*
else lim[-1]=0; // bad case: truncate
return buf;
}
// Format the pair as "{<a>,<b>}" into the window [buf, lim), each value
// rendered by fmtp04f(). Nothing is written when the window is empty;
// otherwise the result is NUL-terminated, truncated if it does not fit.
void doformat(char *buf, char *lim, double a, double b) {
    if (lim == buf) {
        return; // cannot do anything
    }

    // Each `break` means the window filled up after a literal character;
    // each `return` means fmtp04f() filled it and already terminated it.
    do {
        *buf++ = '{';
        if (buf == lim) break;

        buf = fmtp04f(buf, lim, a);
        if (buf == lim) return;

        *buf++ = ',';
        if (buf == lim) break;

        buf = fmtp04f(buf, lim, b);
        if (buf == lim) return;

        *buf++ = '}';
        if (buf == lim) break;

        *buf++ = 0;
    } while (0);

    // Always terminate at the very end of the window as well.
    lim[-1] = 0;
}
It passes some random tests, so I'm reasonably confident that it is not too wrong.
For some reason, @chux's version on my machine (64-bit Linux, gcc 6.3) is generally 2–3 times faster than the baseline, while my version is usually 10–30 times faster than the baseline. I don't know if this is because my snprintf is particularly good or particularly bad. As said above, YMMV.

How to avoid sprintf when joining variables

I am working on code to get USB device details into single String, and have following code,
// Walk every device on every USB bus and build one identifying string per
// device: "<bus dirname>_<device filename>_<vendor id hex>_<product id hex>".
// NOTE(review): usb_init/usb_find_busses/usb_find_devices/usb_busses look
// like the libusb-0.1 API; confirm against the headers actually included.
struct usb_bus *bus;
struct usb_device *dev;
usb_init();
usb_find_busses();
usb_find_devices();
// Outer loop: linked list of buses; inner loop: linked list of devices.
for (bus = usb_busses; bus; bus = bus->next)
for (dev = bus->devices; dev; dev = dev->next)
{
// working outputs
printf("Trying device %s/%s\n", bus->dirname, dev->filename);
printf("Trying device2 %0x\n", dev->descriptor.idVendor);
printf("Trying device3 %0x\n", dev->descriptor.idProduct);
// All four fields joined with '_' into a single string; sprintf() does not
// check the 150-byte capacity.
char deviceDetailsStr[150];
sprintf(deviceDetailsStr, "%s_%s_%0x_%0x", bus->dirname,
dev->filename,dev->descriptor.idVendor,dev->descriptor.idProduct);
... have other code here that works on "deviceDetailsStr"
}
I have been reading that "sprintf" has performance issues, since it supports lots of transforms.
Can you please suggest a better alternative to "sprintf", so that the data from all 4 variables is written into the variable "deviceDetailsStr"?
The end goal is that the "deviceDetailsStr" char array holds all 4 entries as a single string.
Thanks

If you want the best performance, I would say you need to write something custom. Here's an example for your specific requirements.
// Write value into buffer as lower-case hexadecimal without leading zeroes
// (a value of 0 is written as "0") and return the number of characters
// written (1..8). No NUL terminator is appended; the caller continues
// writing at buffer + <return value>.
uint32_t printHex( char * buffer, uint32_t value ) {
    static const char digits[] = "0123456789abcdef";

    // One digit per nibble, with a minimum of one digit.
    uint32_t sz = 1;
    for( uint32_t rest = value >> 4; rest != 0; rest >>= 4 ) {
        sz++;
    }

    // Fill from the last position down to and including index 0; stopping
    // at index 1 would leave the most significant digit unwritten.
    for( uint32_t i = sz; i-- > 0; ) {
        buffer[ i ] = digits[ value & 0xF ];
        value >>= 4;
    }
    return sz;
}
// Build "<bus dirname>_<device filename>_<vendor hex>_<product hex>" by
// hand. There are no size checks (just like sprintf).
char buffer[150];
unsigned bi=0;
for( char * ptr = bus->dirname ; *ptr; ptr++ ) buffer[bi++] = *ptr;
buffer[bi++] = '_';
// The file name belongs to the device, not to the bus.
for( char * ptr = dev->filename; *ptr; ptr++ ) buffer[bi++] = *ptr;
buffer[bi++] = '_';
// printHex() returns how many characters it wrote and adds no terminator.
bi += printHex( buffer + bi, dev->descriptor.idVendor );
buffer[bi++] = '_';
bi += printHex( buffer + bi, dev->descriptor.idProduct );
buffer[bi] = '\0';
Note: There are no size checks (just like sprintf). Again if you want best performance, this is a trade off.

Step 1: Determine the maximum buffer size needs.
Assume bus->dirname, dev->filename are arrays.
/* Worst-case size of the joined string, terminator included:
 * the characters of each name array (its size minus its own terminator),
 * one byte after each of the first three fields for the '_' separator,
 * the hex digits of each id (one per 4 bits, rounded up),
 * and a final byte for the NUL.
 * Assumes dirname and filename are arrays, so sizeof yields their capacity. */
#define Mark_SZ ((sizeof bus->dirname - 1) + 1 + \
(sizeof dev->filename - 1) + 1 + \
((sizeof dev->descriptor.idVendor * CHAR_BIT + 3) /4) + 1 + \
((sizeof dev->descriptor.idProduct * CHAR_BIT + 3) /4) + 1)
#define Extra (depends on: "other code here that works on "deviceDetailsStr"")
char deviceDetailsStr[Mark_SZ + Extra];
Step 2: Copy in each part
// Some untested code to give you an idea.
char *p = deviceDetailsStr;
size_t n = strlen(bus->dirname);
memcpy(p, bus->dirname, n);
p += n;
*p++ = '_';
n = strlen(dev->filename);
memcpy(p, dev->filename, n);
p += n;
*p++ = '_';
p += sprintf(p, "%x", dev->descriptor.idVendor);
*p++ = '_';
sprintf(p, "%x", dev->descriptor.idProduct);
I coded sprintf(p, "%x", dev->descriptor.idVendor) and sprintf(p, "%x", dev->descriptor.idProduct) by themselves as hoping even a modest compiler will recognize this and replace with the equivalent itoa() like function calls. Otherwise, simply code up a replacement unsigned to string.
I see no value with "0" in "%0x"

A possible algorithm for determining whether two strings are anagrams of one another? [closed]

Closed. This question is off-topic. It is not currently accepting answers.
Want to improve this question? Update the question so it's on-topic for Stack Overflow.
Closed 10 years ago.
Improve this question
I have this idea (using C language) for checking whether two strings formed from ASCII letters are anagrams of one another:
Check if the strings are the same length.
Check if the sum of the ASCII values of all chars is the same for both strings.
Check if the product of the ASCII values of all chars is the same for both strings.
I believe that if all three are correct, then the strings must be anagrams of one another. However, I can't prove it. Can someone help me prove or disprove that this would work?
Thanks!

I wrote a quick program to brute-force search for conflicts and found that this approach does not always work. The strings ABFN and AAHM have the same ASCII sum and product, but are not anagrams of one another. Their ASCII sum is 279 and ASCII product is 23,423,400.
There are a lot more conflicts than this. My program, searching over all length-four strings, found 11,737 conflicts.
For reference, here's the C++ source code:
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;
int main() {
/* Sparse 2D table where used[sum][prod] is either nothing or is a string
* whose characters sum to "sum" and whose product is "prod".
*/
map<int, map<int, string> > used;
/* List of all usable characters in the string. */
vector<char> usable;
for (char ch = 'A'; ch <= 'Z'; ch++) {
usable.push_back(ch);
}
for (char ch = 'a'; ch <= 'z'; ch++) {
usable.push_back(ch);
}
/* Brute-force search over all possible length-four strings. To avoid
* iterating over anagrams, the search only explores strings whose letters
* are in increasing ASCII order.
*/
for (int a = 0; a < usable.size(); a++) {
for (int b = a; b < usable.size(); b++) {
for (int c = b; c < usable.size(); c++) {
for (int d = c; d < usable.size(); d++) {
/* Compute the sum and product. */
int sum = usable[a] + usable[b] + usable[c] + usable[d];
int prod = usable[a] * usable[b] * usable[c] * usable[d];
/* See if we have already seen this. */
if (used.count(sum) &&
used[sum].count(prod)) {
cout << "Conflict found: " << usable[a] << usable[b] << usable[c] << usable[d] << " conflicts with " << used[sum][prod] << endl;
}
/* Update the table. */
used[sum][prod] = string() + usable[a] + usable[b] + usable[c] + usable[d];
}
}
}
}
}
Hope this helps!

Your approach is false; I can't explain why because I don't understand it, but there are different sets at least for cardinality 3 that have the same sum and product: https://math.stackexchange.com/questions/38671/two-sets-of-3-positive-integers-with-equal-sum-and-product

The letters a-z and A-Z are used to index an array of 26 primes, and the product of these primes is used as a hash value for the word. Equal product <--> same letters.
(The order of the values in the primes26[] array in the fragment below is based on the letter frequencies in the Dutch language, in an attempt to minimise the expected product.)
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/* Number of elements in a true array (not a pointer). */
#define COUNTOF(a) (sizeof (a)/ sizeof (a)[0])
/* Type of the per-word product-of-primes value. */
typedef unsigned long long HashVal;
HashVal hashmem (char *str, size_t len);
/* One prime per letter of the alphabet, indexed by letter - 'A' (or - 'a'). */
unsigned char primes26[] =
{
5,71,79,19,2,83,31,43,11,53,37,23,41,3,13,73,101,17,29,7,59,47,61,97,89,67,
};
/* One stored word, chained per hash-table slot. */
struct anahash {
struct anahash *next; /* next entry in the same slot's chain */
unsigned freq; /* incremented by main() each time the word is seen */
HashVal hash; /* value computed by hashmem() for this word */
char word[1]; /* NUL-terminated text; new_word() allocates len extra bytes */
};
/* Table of chain heads; the slot is hash % COUNTOF(hashtab). */
struct anahash *hashtab[1024*1024] = {NULL,};
struct anahash *new_word(char *str, size_t len);
struct anahash **hash_find(struct anahash *wp);
/*********************************************/
/* Multiply together the primes assigned to the letters in str[0..len).
 * Upper and lower case map to the same prime; any other character is
 * ignored. Returns 0 for an empty range, otherwise the product.
 */
HashVal hashmem (char *str, size_t len)
{
    HashVal product = 1;
    char *cursor;
    char *stop;

    if (len == 0) return 0;

    stop = str + len;
    for (cursor = str; cursor != stop; cursor++) {
        int letter = -1;    /* stays -1 for non-letters */

        if (*cursor >= 'A' && *cursor <= 'Z') letter = *cursor - 'A';
        else if (*cursor >= 'a' && *cursor <= 'z') letter = *cursor - 'a';

        if (letter >= 0) product *= primes26[letter];
    }
    return product;
}
/* Allocate and initialise a table entry for the len characters at str.
 * A len of 0 means "use strlen(str)". The entry gets its hash from
 * hashmem(), a NULL next pointer and a frequency of 0.
 * Returns NULL if the allocation fails; the caller owns the entry.
 */
struct anahash *new_word(char *str, size_t len)
{
    struct anahash *wp;

    if (!len) len = strlen(str);

    /* sizeof *wp already includes word[1], which pays for the terminator. */
    wp = malloc(len + sizeof *wp );
    if (!wp) return NULL;

    wp->hash = hashmem(str, len);
    wp->next = NULL;
    wp->freq = 0;
    memcpy (wp->word, str, len);
    wp->word[len] = 0;
    return wp;
}
/* Locate the position for wp inside its slot's chain.
 * Returns the address of the link at which the scan stopped: the first
 * entry whose hash is not smaller than wp's and whose word does not sort
 * before wp's, or the terminating NULL link if there is no such entry.
 */
struct anahash **hash_find(struct anahash *wp)
{
    unsigned slot = wp->hash % COUNTOF(hashtab);
    struct anahash **link = &hashtab[slot];

    while (*link != NULL
           && ((*link)->hash < wp->hash
               || strcmp( wp->word, (*link)->word ) > 0)) {
        link = &(*link)->next;
    }
    return link;
}
char buff [16*4096];
/* Read text from stdin, split it into runs of ASCII letters, store each
 * distinct word in hashtab with an occurrence count, then dump the table:
 * a "Slot" header whenever the hash value changes, followed by one line
 * per word with its hash, frequency and text.
 */
int main (void)
{
    size_t pos,end;
    struct anahash *wp, **pp;
    /* Last hash printed; 0 never matches a stored word, since every word
     * consists only of letters and so has a product of at least 2. */
    HashVal val = 0;

    memset(hashtab, 0, sizeof hashtab);
    while (fgets(buff, sizeof buff, stdin)) {
        for (pos=0; pos < sizeof buff && buff[pos]; ) {
            /* Extend end over a run of letters starting at pos. */
            for(end = pos; end < sizeof buff && buff[end]; end++ ) {
                if (buff[end] < 'A' || buff[end] > 'z') break;
                if (buff[end] > 'Z' && buff[end] < 'a') break;
            }
            if (end > pos) {
                wp = new_word(buff+pos, end-pos);
                if (!wp) {pos=end; continue; }
                pp = hash_find(wp);
                if (!*pp) *pp = wp;                 /* end of chain: append */
                else if ((*pp)->hash == wp->hash
                    && !strcmp((*pp)->word , wp->word)) free(wp); /* already stored */
                else { wp->next = *pp; *pp = wp; }  /* insert before *pp */
                (*pp)->freq +=1;
            }
            pos = end;
            /* Skip everything up to the next letter of either case. */
            for(end = pos; end < sizeof buff && buff[end]; end++ ) {
                if (buff[end] >= 'A' && buff[end] <= 'Z') break;
                if (buff[end] >= 'a' && buff[end] <= 'z') break;
            }
            pos = end;
        }
    }

    for (pos = 0; pos < COUNTOF(hashtab); pos++) {
        if (!hashtab[pos]) continue;    /* empty slot */
        for (pp = &hashtab[pos]; (wp = *pp) != NULL; pp = &wp->next) {
            if (val != wp->hash) {
                fprintf (stdout, "\nSlot:%zu:\n", pos );
                val = wp->hash;
            }
            fprintf (stdout, "\t%llx:%u:%s\n", wp->hash, wp->freq, wp->word);
        }
    }
    return 0;
}

Thanks for such a great question! Instead of trying to disprove your proposition altogether, I spent some time trying to find ways to augment it so that it becomes true. I have the sense that if the standard deviations are equal, then the two strings are anagrams. But instead of testing that far, I did a simpler test and have not found a counterexample yet. Here is what I have tested:
In addition to the conditions you mentioned before,
ASCII square-root of the sum of the squares must be equal:
I use the following python program. I have no complete proof, but maybe my response will help. Anyway, take a look.
from math import sqrt
class Nothing:
    """Experiment: do equal sum, product and root-sum-of-squares of the
    character codes imply that two strings are anagrams of each other?"""

    def _signature(self, text):
        """Return (product, sum, sqrt of sum of squares) of text's character codes."""
        prod, total, squares = 1, 0, 0
        for ch in text:
            i = ord(ch)
            prod *= i
            total += i
            squares += i ** 2
        # The square root is taken once, after all squares are accumulated.
        return prod, total, sqrt(squares)

    def equalString(self, strA, strB):
        """Return True when both strings have identical signatures."""
        return self._signature(strA) == self._signature(strB)

    def compareStrings(self):
        """Check every sorted four-character string over 'A'..'z' against its
        reverse and report any pair that equalString() rejects."""
        first, last = ord('A'), ord('z')
        for a in range(first, last + 1):
            for b in range(a, last + 1):
                for c in range(b, last + 1):
                    for d in range(c, last + 1):
                        strA = chr(a) + chr(b) + chr(c) + chr(d)
                        strB = chr(d) + chr(c) + chr(b) + chr(a)
                        if not self.equalString(strA, strB):
                            # Parenthesised single argument: valid on both
                            # Python 2 and Python 3.
                            print("%s and %s should be equal.\n" % (strA, strB))
        print("Done")

If you don't mind modifying the strings, sort each of them and compare the two signatures.

Develop Reference

c reactjs sql-server angularjs arrays wpf database batch-file google-app-engine silverlight