How to avoid sprintf when joining variables

How to avoid sprintf when joining variables - c

I am working on code to get USB device details into single String, and have following code,
struct usb_bus *bus;
struct usb_device *dev;
usb_init();
usb_find_busses();
usb_find_devices();
for (bus = usb_busses; bus; bus = bus->next)
for (dev = bus->devices; dev; dev = dev->next)
{
// working outputs
printf("Trying device %s/%s\n", bus->dirname, dev->filename);
printf("Trying device2 %0x\n", dev->descriptor.idVendor);
printf("Trying device3 %0x\n", dev->descriptor.idProduct);
char deviceDetailsStr[150];
sprintf(deviceDetailsStr, "%s_%s_%0x_%0x", bus->dirname,
dev->filename,dev->descriptor.idVendor,dev->descriptor.idProduct);
... have other code here that works on "deviceDetailsStr"
}
Been reading that "sprintf" has performance issues, since it supports lots of transforms.
Can you please suggest what is better alternative to using "sprintf", so that all 4 variables data gets read into variable "deviceDetailsStr"
End goal is "deviceDetailsStr" char array needs to have all 4 entries as single string.
Thanks

If you want the best performance, I would say you need to write something custom. Here's an example for your specific requirements.
/*
 * Write `value` into `buffer` as lowercase hexadecimal without leading zeros.
 *
 * No terminating NUL is written and no bounds are checked: the caller must
 * provide room for up to 8 characters.
 *
 * Returns the number of characters written (1..8).
 */
uint32_t printHex( char * buffer, uint32_t value ) {
    uint32_t sz = value <= 0xF       ? 1 :
                  value <= 0xFF      ? 2 :
                  value <= 0xFFF     ? 3 :
                  value <= 0xFFFF    ? 4 :
                  value <= 0xFFFFF   ? 5 :
                  value <= 0xFFFFFF  ? 6 :
                  value <= 0xFFFFFFF ? 7 : 8;
    /* Emit digits from least to most significant. The `i-- > 0` form also
       visits index 0; stopping at i == 1 would leave the leading digit
       unwritten (and write nothing at all for single-digit values). */
    for( uint32_t i = sz; i-- > 0; ) {
        uint32_t nibble = value & 0xF;
        buffer[ i ] = (char)( nibble <= 9 ? '0' + nibble : 'a' + ( nibble - 10 ) );
        value >>= 4;
    }
    return sz;
}
/* Build "<dirname>_<filename>_<idVendor>_<idProduct>" by hand.
   No size checks are performed on `buffer`. */
char buffer[150];
unsigned bi = 0;
for( char * ptr = bus->dirname; *ptr; ptr++ ) buffer[bi++] = *ptr;
buffer[bi++] = '_';
/* The file name belongs to the device, not the bus. */
for( char * ptr = dev->filename; *ptr; ptr++ ) buffer[bi++] = *ptr;
buffer[bi++] = '_';
bi += printHex( buffer + bi, dev->descriptor.idVendor );
buffer[bi++] = '_';
bi += printHex( buffer + bi, dev->descriptor.idProduct );
buffer[bi] = '\0';
Note: There are no size checks (just like sprintf). Again if you want best performance, this is a trade off.

Step 1: Determine the maximum buffer size needs.
Assume bus->dirname, dev->filename are arrays.
/* Worst-case size of "<dirname>_<filename>_<idVendor>_<idProduct>":
   each name without its NUL, one '_' after each of the first three parts,
   one hex digit per 4 bits of each id, and a final +1 for the terminating NUL.
   Only valid where `bus` and `dev` are in scope and the two names are arrays
   (sizeof on a pointer would give the pointer size). Needs <limits.h> for
   CHAR_BIT. */
#define Mark_SZ ((sizeof bus->dirname - 1) + 1 + \
                 (sizeof dev->filename - 1) + 1 + \
                 ((sizeof dev->descriptor.idVendor * CHAR_BIT + 3) / 4) + 1 + \
                 ((sizeof dev->descriptor.idProduct * CHAR_BIT + 3) / 4) + 1)
/* Placeholder: additional room needed by the "other code here that works on
   deviceDetailsStr"; raise it if that code appends to the string. */
#define Extra 0
char deviceDetailsStr[Mark_SZ + Extra];
Step 2: Copy in each part
// Some untested code to give you an idea.
// p always points at the next free byte of deviceDetailsStr.
char *p = deviceDetailsStr;
// Copy the bus directory name without its terminating NUL.
size_t n = strlen(bus->dirname);
memcpy(p, bus->dirname, n);
p += n;
*p++ = '_';
// Copy the device file name the same way.
n = strlen(dev->filename);
memcpy(p, dev->filename, n);
p += n;
*p++ = '_';
// sprintf returns the number of characters written, which advances p.
p += sprintf(p, "%x", dev->descriptor.idVendor);
*p++ = '_';
// The final sprintf also writes the terminating NUL of the whole string.
sprintf(p, "%x", dev->descriptor.idProduct);
I coded sprintf(p, "%x", dev->descriptor.idVendor) and sprintf(p, "%x", dev->descriptor.idProduct) by themselves as hoping even a modest compiler will recognize this and replace with the equivalent itoa() like function calls. Otherwise, simply code up a replacement unsigned to string.
I see no value with "0" in "%0x"

Related

Why is SSE4.2 cmpstr slower than regular code?

I'm trying to validate a string that must only contain ASCII visible characters, white space and \t.
But it seems that ASCII table lookups are faster than the _mm_cmpestri instruction with _SIDD_CMP_RANGES on most CPUs.
I've tested it on an i5-2410M, an i7-3720QM, an i7-5600U and a KVM-virtualized Xeon of unknown type and only on the last one is the vectorized version faster.
My test code is here:
#include <stdio.h>
#include <string.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#include <stdlib.h>
#define MIN(a,b) (((a)<(b))?(a):(b))
#define ALIGNED16 alignas(16)
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
// Lookup table indexed by byte value: 1 for a legal byte (HT and 0x20-0x7E),
// 0 for everything else. YES/NO expand to "1," and "0,", so the entries are
// written without separators; the row comments name the bytes of each row.
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
/*
 * Return the index of the first byte in data[0..len) that is neither a tab
 * nor in ' '..'~', or len when every byte is legal.
 *
 * Full 16-byte chunks are classified with SSE2 compares; a rejected chunk and
 * the trailing bytes are resolved with the scalar table lookup.
 */
size_t search_logic(const char* data, size_t len) {
    const __m128i ht    = _mm_set1_epi8('\t');
    const __m128i td    = _mm_set1_epi8('~');
    const __m128i sp_m1 = _mm_set1_epi8(' ' - 1);
    size_t i = 0;
    while (len - i >= 16) {
        __m128i c = _mm_loadu_si128((const __m128i *) (data + i));
        // The compares are signed, so bytes >= 0x80 are negative and fail
        // (c > sp_m1). printable = (c >= ' ') && !(c > '~').
        __m128i printable = _mm_andnot_si128(_mm_cmpgt_epi8(c, td),
                                             _mm_cmpgt_epi8(c, sp_m1));
        // legal = (c == '\t') || printable. The earlier form ANDed the
        // conditions together and therefore only ever rejected DEL.
        __m128i legal = _mm_or_si128(_mm_cmpeq_epi8(c, ht), printable);
        if (_mm_movemask_epi8(legal) != 0xFFFF)
            break;
        i += 16;
    }
    // Scalar scan: locates the offending byte inside a rejected chunk and
    // checks the last (up to 15) bytes.
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
// Scalar baseline: returns the index of the first byte of data[0..len) whose
// vis_ws_chars entry is 0, or len if there is none. The 16 lookups per
// iteration are unrolled by hand; breaking out of the while loop leaves i at
// the start of the chunk, and the tail loop then finds the exact position.
size_t search_table(const char* data, size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
while (len - i >= 16) {
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
// Explicit-length SSE4.2 variant: returns the index of the first illegal byte
// of data[0..len), or len if there is none.
size_t search_sse4cmpstr(const char* data, size_t len)
{
// Two inclusive ranges, '\t'..'\t' and ' '..'~'; the remaining 12 bytes are
// zero and excluded by the explicit length 4 passed below.
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// NOTE: i is advanced by the compare result, so the next load address
// depends on this instruction's output (a loop-carried dependency).
i += consumed;
// A result below 16 is treated as the offset of an illegal byte.
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
// Implicit-length SSE4.2 variant: same contract as the explicit-length
// version, but _mm_cmpistri takes no length arguments.
size_t search_sse4cmpstr_implicit(const char* data, size_t len)
{
// Two inclusive ranges, '\t'..'\t' and ' '..'~', followed by zero bytes.
static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*)legal_ranges);
size_t i = 0;
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// NOTE: i is advanced by the compare result, so the next load address
// depends on this instruction's output (a loop-carried dependency).
i += consumed;
// A result below 16 is treated as the offset of an illegal byte.
if (consumed < 16) {
return i;
}
}
// Check last 15 bytes
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
/*
 * Benchmark driver: fills ~1 GiB with pseudo-random legal bytes, plants one
 * illegal '\n' two bytes before the end and times each search function.
 */
int main()
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    // mmap reports failure with MAP_FAILED, not NULL; writing through it
    // would crash.
    if (data == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    srand(0);
    for (size_t i = 0; i < len; ++i) {
        // 0..94 map to ' '..'~', 95 maps to tab: every byte is legal.
        const char v = rand() % 96;
        data[i] = v == 95 ? '\t' : ' ' + v;
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found
    MEASURE("table lookup", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("cmpestr ranges", {
        size_t i = search_sse4cmpstr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("cmpistr ranges", {
        size_t i = search_sse4cmpstr_implicit(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    MEASURE("logic ranges", {
        size_t i = search_logic(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu", i, end);
    });
    munmap(data, len);
    return 0;
}
Compiled with gcc -O3 -march=native -pedantic -Wall -Wextra main2.cpp it gives me these results:
Setting up 1GB of data...
table lookup - 476.4820ms
cmpestr ranges - 519.3350ms
cmpistr ranges - 497.5770ms
logic ranges - 153.2650ms
I've also checked the assembly output and search_sse4cmpstr uses vpcmpestri while search_table is non-vectorized.
Am I using it wrong? Or why does this instruction exist at all?
EDIT:
As pointed out in the comments, cmpistr (implicit length instruction with less parameters) is slightly faster than cmpestr and sometimes faster than the table lookup.
However, SSE2 bitwise and integer operations seem to be even faster.
EDIT2
Peter Cordes found the right answer.
I've added the revised program in a new answer, so please look at this one if you are interested in cmpstr.
DO NOT USE THE CODE ABOVE!

The code has an unnecessary dependency of i on the previous vector, bottlenecking on pcmpestri + L1d load-use latency of about 12 + 5 cycles. (https://agner.org/optimize/ and https://uops.info/) So yes, you are using it wrong, unfortunately.
If you wrote it similar to your scalar loop, doing i+=16 and just checking the pcmpestri result as a loop-exit condition, you'd bottleneck on its throughput of 1 vector per 4 clocks on your Sandybridge-family CPUs. (SnB and IvB specifically).
Or if your input can use pcmpistri, that's somewhat less bad and can go at 1 per 3 clocks on Sandybridge-family.
I didn't notice this problem at first because I wasn't expecting the loop to be written that way, and there was other clutter in the asm loop. :/ I spent a bunch of time profiling with perf to be sure it wasn't a front-end bottleneck from the microcoded (8 uop) instruction on my Skylake CPU. See the now-archived comments.
A throughput bottleneck would let you go at about 4 bytes / cycle, vs.
about 1 for the other way (2 loads per input byte, and Intel since SnB can do 2 loads per clock). So a factor of 4 speedup. Or a factor of 8 on Nehalem with 1/clock load throughput.
The latency bottleneck is just about 1 cycle per input byte, about the same as the table lookup, by coincidence.
Also, don't use len - i < 16; gcc actually calculates that inside the loop costing extra uops. Use i < len-15 once you know that len>=16. (unsigned types make this tricky because they wrap at zero; what you want it to compile to is a cmp/jcc to skip the loop, then a do{}while asm loop structure. So the initial len>=16 really is separate from the normal loop condition.)
Other fun facts about pcmpestri:
How much faster are SSE4.2 string instructions than SSE2 for memcmp? (it's slower, especially with AVX2)
SSE42 & STTNI - PcmpEstrM is twice slower than PcmpIstrM, is it true? Yes, the explicit-length versions are slower than the implicit-length versions. Masking based on extra 2 length inputs is slower and costs more uops than scanning for a 0 byte in the existing inputs, apparently.
Performance doesn't depend on the value of the immediate. At one point I thought it did, but that was with i dependent on the result, so changing the immediate led to cache-line splits, making loop latency even worse. Re-testing with an i+=16 loop shows no effect.
If used with a REX.W prefix (to take inputs in RAX and RDX instead of EAX and EDX) it's much slower (according to https://uops.info/) for Intel, but there's no intrinsic for that so you don't have to worry about compilers doing that.
Or why does this instruction exist at all?
These instructions were introduced in Nehalem. Intel might have had plans to make them faster if they "caught on" and became widely used e.g. for short-string strcmp. But without fault-suppression (for unaligned loads that potentially cross into a new page) they're hard to use without checking stuff about a pointer. If you're going to do checks anyway, you might as well use an efficient pcmpeqb/pmovmskb which is fewer uops. And maybe find the first zero in either string with pminub/pcmpeqb/pmovmskb -> bsf. Maybe there's a use-case for SSE4.2 for the initial startup of a strcmp, but once you get going not so much.
And most of the world cares about UTF-8, not 8-bit character sets. And with UTF-16 not being fixed-width anymore (thanks to 32-bit Unicode), even wide-character stuff is harder to accelerate with these.
Using the ranges features basically requires hand-vectorization, which is a lot of work for something that only handles ASCII.
And as you found, for simple cases you can go even faster with pcmpgtb and boolean logic. With AVX2 you could process 32 bytes at once instead of 16, but there's no AVX2 version of vpcmpistri, only the AVX1 VEX encoding of the 16-byte instruction.

As Peter Cordes has pointed out, the problem was caused by an unnecessary dependency on the output of cmpstr.
This can be solved by simply restructuring this loop:
while (len - i >= 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
i += consumed;
if (consumed < 16) {
return i;
}
}
into that one:
if (len >= 16)
while (i <= len - 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
if (consumed < 16) {
return i + consumed;
}
i += 16;
}
The results for my i5-2410M compiled with gcc -pedantic -Wall -Wextra -O3 -march=native sse42cmpstr.c look now far better:
Setting up 1GB of data...
table - 484.5900ms
cmpestr - 231.9770ms
cmpistr - 121.3510ms
logic - 142.3700ms
Now cmpistr is clearly faster than both cmpestr and table search and surpasses even
the hand-crafted SSE2 logical comparisons on most CPUs I've tested.
The full test code is here:
#include <stdio.h>
#include <inttypes.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <immintrin.h>
#include <stdalign.h>
#define ALIGNED16 __attribute__((aligned(16)))
#define MEASURE(msg,stmt) { \
struct timeval tv; \
gettimeofday(&tv, NULL); \
uint64_t us1 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
stmt; \
gettimeofday(&tv, NULL); \
uint64_t us2 = tv.tv_sec * (uint64_t)1000000 + tv.tv_usec; \
printf("%-20s - %.4fms\n", msg, ((double)us2 - us1) / 1000); \
}
// Character table
#define VWSCHAR(c) (vis_ws_chars[(unsigned char)(c)]) // Visible characters and white space
#define YES 1,
#define NO 0,
#define YES16 YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES
#define NO16 NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO NO
#define NO128 NO16 NO16 NO16 NO16 NO16 NO16 NO16 NO16
// Visible ASCII characters with space and tab
// Lookup table indexed by byte value: 1 for a legal byte (HT and 0x20-0x7E),
// 0 for everything else. YES/NO expand to "1," and "0,", so the entries are
// written without separators; the row comments name the bytes of each row.
ALIGNED16 static const int vis_ws_chars[256] = {
// NUL SOH STX ETX EOT ENQ ACK BEL BS HT LF VT FF CR SO SI
NO NO NO NO NO NO NO NO NO YES NO NO NO NO NO NO
// DLE DC1 DC2 DC3 DC4 NAK SYN ETB CAN EM SUB ESC FS GS RS US
NO16
// SP ! " # $ % & ' ( ) * + , - . /
// 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
// @ A B C D E F G H I J K L M N O
// P Q R S T U V W X Y Z [ \ ] ^ _
// ` a b c d e f g h i j k l m n o
YES16 YES16 YES16 YES16 YES16
// p q r s t u v w x y z { | } ~ DEL
YES YES YES YES YES YES YES YES YES YES YES YES YES YES YES NO
// Non-ASCII characters
NO128
};
// Search using the ASCII table above
// Scalar baseline: returns the index of the first byte of data[0..len) whose
// vis_ws_chars entry is 0, or len if there is none.
size_t search_table(const char* data, size_t len)
{
// Search non-matching character via table lookups
size_t i = 0;
// The len >= 16 guard keeps the unsigned expression len - 16 from wrapping.
if(len >= 16) {
while (i <= len - 16) {
if (!VWSCHAR(data[i + 0])) break;
if (!VWSCHAR(data[i + 1])) break;
if (!VWSCHAR(data[i + 2])) break;
if (!VWSCHAR(data[i + 3])) break;
if (!VWSCHAR(data[i + 4])) break;
if (!VWSCHAR(data[i + 5])) break;
if (!VWSCHAR(data[i + 6])) break;
if (!VWSCHAR(data[i + 7])) break;
if (!VWSCHAR(data[i + 8])) break;
if (!VWSCHAR(data[i + 9])) break;
if (!VWSCHAR(data[i + 10])) break;
if (!VWSCHAR(data[i + 11])) break;
if (!VWSCHAR(data[i + 12])) break;
if (!VWSCHAR(data[i + 13])) break;
if (!VWSCHAR(data[i + 14])) break;
if (!VWSCHAR(data[i + 15])) break;
i += 16;
}
}
// Check last bytes
// (also pinpoints the offending byte after a break above, since i still
// points at the start of that chunk)
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
break;
}
}
return i;
}
// Search using SSE4.2 cmpestri (explicit length)
// Returns the index of the first illegal byte of data[0..len), or len if
// there is none. i advances by a fixed 16 per iteration, so the load address
// does not depend on the compare result.
size_t search_sse4cmpestr(const char* data, size_t len)
{
// Two inclusive ranges, '\t'..'\t' and ' '..'~'; the remaining 12 bytes are
// zero and excluded by the explicit length 4 passed below.
ALIGNED16 static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*) legal_ranges);
size_t i = 0;
// The len >= 16 guard keeps the unsigned expression len - 16 from wrapping.
if(len >= 16) {
while (i <= len - 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*) (data + i));
unsigned consumed = _mm_cmpestri(v1, 4, v2, 16, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// A result below 16 is treated as the offset of an illegal byte.
if (consumed < 16) {
return i + consumed;
}
i += 16;
}
}
// Remaining (fewer than 16) bytes go through the lookup table.
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
return i;
}
}
return i;
}
// Search using SSE4.2 cmpistri (implicit length)
// Returns the index of the first illegal byte of data[0..len), or len if
// there is none. Same structure as the explicit-length version, but
// _mm_cmpistri takes no length arguments.
size_t search_sse4cmpistr(const char* data, size_t len)
{
// Two inclusive ranges, '\t'..'\t' and ' '..'~', followed by zero bytes.
ALIGNED16 static const char legal_ranges[16] = {
'\t', '\t',
' ', '~',
};
__m128i v1 = _mm_loadu_si128((const __m128i*) legal_ranges);
size_t i = 0;
// The len >= 16 guard keeps the unsigned expression len - 16 from wrapping.
if (len >= 16) {
while (i <= len - 16) {
__m128i v2 = _mm_loadu_si128((const __m128i*)(data + i));
unsigned consumed = _mm_cmpistri(v1, v2, _SIDD_LEAST_SIGNIFICANT|_SIDD_CMP_RANGES|_SIDD_UBYTE_OPS|_SIDD_NEGATIVE_POLARITY);
// A result below 16 is treated as the offset of an illegal byte.
if (consumed < 16) {
return i + consumed;
}
i += 16;
}
}
// Remaining (fewer than 16) bytes go through the lookup table.
for (; i < len; ++i) {
if (!VWSCHAR(data[i])) {
return i;
}
}
return i;
}
// Search using SSE2 logic instructions
/*
 * Return the index of the first byte in data[0..len) that is neither a tab
 * nor in ' '..'~', or len when every byte is legal.
 *
 * Full 16-byte chunks are classified with SSE2 compares; a rejected chunk and
 * the trailing bytes are resolved with the scalar table lookup.
 */
size_t search_logic(const char* data, size_t len) {
    const __m128i ht    = _mm_set1_epi8('\t');
    const __m128i td    = _mm_set1_epi8('~');
    const __m128i sp_m1 = _mm_set1_epi8(' ' - 1);
    size_t i = 0;
    // The len >= 16 guard keeps the unsigned expression len - 16 from wrapping.
    if (len >= 16) {
        while (len - 16 >= i) {
            __m128i c = _mm_loadu_si128((const __m128i *) (data + i));
            // The compares are signed, so bytes >= 0x80 are negative and fail
            // (c > sp_m1). printable = (c >= ' ') && !(c > '~').
            __m128i printable = _mm_andnot_si128(_mm_cmpgt_epi8(c, td),
                                                 _mm_cmpgt_epi8(c, sp_m1));
            // legal = (c == '\t') || printable. The earlier form ANDed the
            // conditions together and therefore only ever rejected DEL.
            __m128i legal = _mm_or_si128(_mm_cmpeq_epi8(c, ht), printable);
            if (_mm_movemask_epi8(legal) != 0xFFFF)
                break;
            i += 16;
        }
    }
    // Scalar scan: locates the offending byte inside a rejected chunk and
    // checks the last bytes.
    for (; i < len; ++i) {
        if (!VWSCHAR(data[i])) {
            break;
        }
    }
    return i;
}
/*
 * Benchmark driver: fills ~1 GiB with a repeating pattern of legal bytes,
 * plants one illegal '\n' two bytes before the end and times each search
 * function.
 */
int main()
{
    printf("Setting up 1GB of data...\n");
    size_t len = 1024 * 1024 * 1024 + 3;
    char* data = (char*)mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_POPULATE, -1, 0); // Aligned
    // mmap reports failure with MAP_FAILED, not NULL; writing through it
    // would crash.
    if (data == MAP_FAILED) {
        perror("mmap");
        return 1;
    }
    for (size_t i = 0; i < len; ++i) {
        // 0..94 map to ' '..'~', 95 maps to tab: every byte is legal.
        const char v = i % 96;
        data[i] = v == 95 ? '\t' : ' ' + v;
    }
    size_t end = len - 2;
    data[end] = '\n'; // Illegal character to be found
    // size_t must be printed with %zu; %u with a size_t argument is undefined.
    MEASURE("table", {
        size_t i = search_table(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpestr", {
        size_t i = search_sse4cmpestr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("cmpistr", {
        size_t i = search_sse4cmpistr(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    MEASURE("logic", {
        size_t i = search_logic(data, len);
        if (i != end) printf("INCORRECT RESULT: %zu instead of %zu\n", i, end);
    });
    munmap(data, len);
    return 0;
}

A possible algorithm for determining whether two strings are anagrams of one another? [closed]

Closed. This question is off-topic. It is not currently accepting answers.
Want to improve this question? Update the question so it's on-topic for Stack Overflow.
Closed 10 years ago.
Improve this question
I have this idea (using C language) for checking whether two strings formed from ASCII letters are anagrams of one another:
Check if the strings are the same length.
Check if the sum of the ASCII values of all chars is the same for both strings.
Check if the product of the ASCII values of all chars is the same for both strings.
I believe that if all three are correct, then the strings must be anagrams of one another. However, I can't prove it. Can someone help me prove or disprove that this would work?
Thanks!

I wrote a quick program to brute-force search for conflicts and found that this approach does not always work. The strings ABFN and AAHM have the same ASCII sum and product, but are not anagrams of one another. Their ASCII sum is 279 and ASCII product is 23,423,400.
There are a lot more conflicts than this. My program, searching over all length-four strings, found 11,737 conflicts.
For reference, here's the C++ source code:
#include <iostream>
#include <map>
#include <string>
#include <vector>
using namespace std;
int main() {
    /* seen[sum][prod] remembers the most recent string whose character codes
     * add up to "sum" and multiply to "prod".
     */
    map<int, map<int, string> > seen;

    /* Alphabet: 'A'..'Z' followed by 'a'..'z'. */
    vector<char> letters;
    for (int k = 0; k < 26; k++) letters.push_back('A' + k);
    for (int k = 0; k < 26; k++) letters.push_back('a' + k);
    const size_t count = letters.size();

    /* Enumerate every length-four string with non-decreasing letters, so
     * that no two enumerated strings are anagrams of each other.
     */
    for (size_t i = 0; i < count; i++) {
        for (size_t j = i; j < count; j++) {
            for (size_t k = j; k < count; k++) {
                for (size_t l = k; l < count; l++) {
                    const char w = letters[i], x = letters[j];
                    const char y = letters[k], z = letters[l];

                    string word;
                    word += w;
                    word += x;
                    word += y;
                    word += z;

                    const int sum  = w + x + y + z;
                    const int prod = w * x * y * z;

                    /* Report a clash with an earlier string, then record
                     * this one (overwriting the earlier entry).
                     */
                    map<int, string>& bySum = seen[sum];
                    map<int, string>::iterator hit = bySum.find(prod);
                    if (hit != bySum.end()) {
                        cout << "Conflict found: " << word << " conflicts with " << hit->second << endl;
                    }
                    bySum[prod] = word;
                }
            }
        }
    }
}
Hope this helps!

Your approach is false; I can't explain why because I don't understand it, but there are different sets at least for cardinality 3 that have the same sum and product: https://math.stackexchange.com/questions/38671/two-sets-of-3-positive-integers-with-equal-sum-and-product

The letters a-z and A-Z are used to index an array of 26 primes, and the product of these primes is used as a hash value for the word. Equal product <--> same letters.
(the order of the hashvalues in the primes26[] array in the below fragment is based on the letter frequencies in the Dutch language, as an attempt to minimise the expected product)
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define COUNTOF(a) (sizeof (a)/ sizeof (a)[0])
typedef unsigned long long HashVal;
HashVal hashmem (char *str, size_t len);
unsigned char primes26[] =
{
5,71,79,19,2,83,31,43,11,53,37,23,41,3,13,73,101,17,29,7,59,47,61,97,89,67,
};
struct anahash {
struct anahash *next;
unsigned freq;
HashVal hash;
char word[1];
};
struct anahash *hashtab[1024*1024] = {NULL,};
struct anahash *new_word(char *str, size_t len);
struct anahash **hash_find(struct anahash *wp);
/*********************************************/
/* Multiply together the primes26[] entries of every letter in str[0..len),
   ignoring case and skipping non-letters. Returns 0 for an empty input. */
HashVal hashmem (char *str, size_t len)
{
    HashVal product = 1;
    const char *cursor;
    const char *stop;

    if (len == 0) return 0;

    cursor = str;
    stop = str + len;
    while (cursor != stop) {
        const char c = *cursor++;
        if (c >= 'A' && c <= 'Z') {
            product *= primes26[c - 'A'];
        } else if (c >= 'a' && c <= 'z') {
            product *= primes26[c - 'a'];
        }
        /* anything else leaves the product unchanged */
    }
    return product;
}
/* Allocate a node holding a NUL-terminated copy of str[0..len) and its hash.
   A len of 0 means "use strlen(str)". Returns NULL if allocation fails; the
   caller owns the node and releases it with free(). */
struct anahash *new_word(char *str, size_t len)
{
    struct anahash *wp;
    if (!len) len = strlen(str);
    /* sizeof *wp already contains word[1], which provides room for the NUL. */
    wp = malloc(len + sizeof *wp );
    if (!wp) return NULL;
    wp->hash = hashmem(str, len);
    wp->next = NULL;
    wp->freq = 0;
    memcpy (wp->word, str, len);
    wp->word[len] = 0;
    return wp;
}
/* Walk the chain of wp's slot and return the link (address of the pointer)
   at which the walk stops: the first node whose hash is not below wp->hash
   and whose word does not sort before wp->word, or the terminating NULL link. */
struct anahash **hash_find(struct anahash *wp)
{
    struct anahash **link = &hashtab[wp->hash % COUNTOF(hashtab)];

    while (*link != NULL) {
        struct anahash *node = *link;
        int keep_going = node->hash < wp->hash
                      || strcmp(wp->word, node->word) > 0;
        if (!keep_going) break;
        link = &node->next;
    }
    return link;
}
char buff [16*4096];
/* Read text from stdin, split it into runs of ASCII letters, count each
   distinct word in the hash table, then dump the table grouped by hash. */
int main (void)
{
    size_t pos, end;
    struct anahash *wp, **pp;
    HashVal val = 0;
    int have_val = 0;   /* set once val holds the hash of a printed group */

    memset(hashtab, 0, sizeof hashtab);
    while (fgets(buff, sizeof buff, stdin)) {
        for (pos = 0; pos < sizeof buff && buff[pos]; ) {
            /* Find the end of the run of letters starting at pos. */
            for (end = pos; end < sizeof buff && buff[end]; end++ ) {
                if (buff[end] < 'A' || buff[end] > 'z') break;
                if (buff[end] > 'Z' && buff[end] < 'a') break;
            }
            if (end > pos) {
                wp = new_word(buff+pos, end-pos);
                if (!wp) { pos = end; continue; }
                pp = hash_find(wp);
                if (!*pp) *pp = wp;
                else if ((*pp)->hash == wp->hash
                    && !strcmp((*pp)->word , wp->word)) free(wp);
                else { wp->next = *pp; *pp = wp; }
                (*pp)->freq += 1;
            }
            pos = end;
            /* Skip ahead to the next letter of either case. The lowercase
               test used to read ">= 'z' && <= 'a'", which is never true, so
               lowercase words after the first one were skipped. */
            for (end = pos; end < sizeof buff && buff[end]; end++ ) {
                if (buff[end] >= 'A' && buff[end] <= 'Z') break;
                if (buff[end] >= 'a' && buff[end] <= 'z') break;
            }
            pos = end;
        }
    }
    for (pos = 0; pos < COUNTOF(hashtab); pos++) {
        if (!hashtab[pos]) continue;    /* empty slot */
        for (pp = &hashtab[pos]; (wp = *pp) != NULL; pp = &wp->next) {
            /* Print a header whenever the hash differs from the previous
               printed entry. */
            if (!have_val || val != wp->hash) {
                fprintf (stdout, "\nSlot:%zu:\n", pos );
                val = wp->hash;
                have_val = 1;
            }
            fprintf (stdout, "\t%llx:%u:%s\n", wp->hash, wp->freq, wp->word);
        }
    }
    return 0;
}

Thanks for such a great question! Instead of trying to disprove your proposition altogether, I spent sometime trying to find ways to augment it so it becomes true. I have the sense that if the standard deviations are equal then the two are equal. But instead of testing that far, I do a simpler test and have not found a counter example as yet. Here is what I have tested:
In addition to the conditions you mentioned before,
ASCII square-root of the sum of the squares must be equal:
I use the following python program. I have no complete proof, but maybe my response will help. Anyway, take a look.
from math import sqrt
class Nothing:
    """Experiment: compare strings by product, sum and root-sum-of-squares of
    their character codes."""

    def equalString(self, strA, strB):
        """Return True when strA and strB have the same product, the same sum
        and the same square root of the sum of squares of their ord() values."""
        prodA, prodB = 1, 1
        sumA, sumB = 0, 0
        geoA, geoB = 0, 0
        for a in strA:
            i = ord(a)
            prodA *= i
            sumA += i
            geoA += i ** 2
        geoA = sqrt(geoA)
        for b in strB:
            i = ord(b)
            prodB *= i
            sumB += i
            geoB += i ** 2
        geoB = sqrt(geoB)
        return prodA == prodB and sumA == sumB and geoA == geoB

    def compareStrings(self):
        """Run equalString over every non-decreasing four-character string
        drawn from 'A'..'z', paired with its own reversal."""
        # NOTE(review): strB is always the reverse of strA, i.e. an anagram
        # of it, so this loop cannot find a non-anagram collision - confirm
        # whether a search over distinct letter multisets was intended.
        first, last = ord('A'), ord('z')
        for a in range(first, last + 1):
            for b in range(a, last + 1):
                for c in range(b, last + 1):
                    for d in range(c, last + 1):
                        strA = chr(a) + chr(b) + chr(c) + chr(d)
                        strB = chr(d) + chr(c) + chr(b) + chr(a)
                        if not self.equalString(strA, strB):
                            # Parenthesised form is valid in Python 2 and 3.
                            print("%s and %s should be equal.\n" % (strA, strB))
        print("Done")

If you don't mind modifying the strings, sort each of them and compare the two signatures.

Data types conversion (unsigned long long to char)

Can anyone tell me what is wrong with the following code?
// Copies the bytes of nb, least significant first, into a newly malloc'ed
// buffer of sizeof(nb) bytes and returns it; the caller must free() it.
// NOTE(review): the buffer holds raw byte values, not digit characters, and
// has no terminating '\0', so it is not a C string (strcat / %s will misread
// it). The malloc result is not checked.
__inline__
char* ut_byte_to_long (ulint nb) {
char* a = malloc(sizeof(nb));
int i = 0;
for (i=0;i<sizeof(nb);i++) {
// Shift byte i down to the low 8 bits and keep only those.
a[i] = (nb>>(i*8)) & 0xFF;
}
return a;
}
This string is then concatenated as part of a larger one using strcat. The string prints fine but for the integers which are represented as character symbols. I'm using %s and fprintf to check the result.
Thanks a lot.
EDIT
I took one of the comments below (I was adding the terminating \0 separately, before calling fprintf, but after strcat. Modifying my initial function...
/* Copies the bytes of nb, least significant first, into a newly malloc'ed
   buffer followed by a terminating '\0'. Returns NULL if allocation fails;
   the caller must free() the result.
   The buffer holds raw byte values, not digit characters, and any zero byte
   inside nb ends the string early when it is read with %s or strcat. */
__inline__
char* ut_byte_to_long (ulint nb) {
    char* a = malloc(sizeof(nb) + 1);
    size_t i;
    if (a == NULL) return NULL;
    for (i = 0; i < sizeof(nb); i++) {
        a[i] = (nb >> (i * 8)) & 0xFF;
    }
    /* The terminator goes right after the last copied byte; indexing with
       the value nb would write out of bounds. */
    a[sizeof(nb)] = '\0';
    return a;
}
This sample code still isn't printing out a number...
char* tmp;
tmp = ut_byte_to_long(start->id);
fprintf(stderr, "Value of node is %s \n ", tmp);

strcat is expecting a null byte terminating the string.
Change your malloc size to sizeof(nb) + 1 and append '\0' to the end.

You have two problems.
The first is that the character array a contains numbers, such as 2, instead of ASCII codes representing those numbers, such as '2' (=50 on ASCII, might be different in other systems). Try modifying your code to
a[i] = ((nb>>(i*8)) & 0xFF) + '0'; // mask first: '+' binds tighter than '&'
The second problem is that the result of the above computation can be anything between 0 and 255, or in other words, a number which requires more than one digit to print.
If you want to print hexadecimal numbers (0-9, A-F), two digits per such computation will be enough, and you can write something like
a[2*i + 0] = int2hex( (nb>>(i*8)) & 0x0F ); //right hexa digit
a[2*i + 1] = int2hex( (nb>>(i*8+4)) & 0x0F ); //left hexa digit
where
/* Map 0..9 to '0'..'9' and any other value n to 'A' + (n - 10),
   so 10..15 become 'A'..'F'. */
char int2hex(int n) {
    const int is_decimal = (n >= 0) && (n <= 9);
    return is_decimal ? n + '0' : (n - 10) + 'A';
}

if you dont want to use sprintf(target_string,"%lu",source_int) or the non standard itoa(), here is a version of the function that transform a long to a string :
/* Convert nb to its decimal representation in a newly malloc'ed,
   NUL-terminated string. Returns NULL if allocation fails; the caller must
   free() the result.
   The 22-byte buffer assumes ulint has at most 64 bits (20 digits) - TODO
   confirm against the ulint typedef. */
__inline__
char* ut_byte_to_long (ulint nb) {
    char* a = malloc(22);
    int i = 21;
    int j;
    if (a == NULL) return NULL;
    /* Produce digits from the least significant one, filling backwards. */
    do
    {
        i--;
        a[i] = nb % 10 + '0';
        nb = nb / 10;
    } while (nb > 0);
    /* The digits now occupy a[i] .. a[20]; move them to the front. */
    for (j = 0; i < 21; j++, i++)
    {
        a[j] = a[i];
    }
    a[j] = '\0';
    return a;
}
I assumed that an unsigned long contain less than 21 digits. (biggest number is 18,446,744,073,709,551,615 which equals 2^64 − 1 : 20 digits)

What does "Unsigned modulo 256" mean in the context of image decoding

Because I'm masochistic I'm trying to write something in C to decode an 8-bit PNG file (it's a learning thing, I'm not trying to reinvent libpng...)
I've got to the point when the stuff in my deflated, unfiltered data buffer unmistakably resembles the source image (see below), but it's still quite, erm, wrong, and I'm pretty sure there's something askew with my implementation of the filtering algorithms. Most of them are quite simple, but there's one major thing I don't understand in the docs, not being good at maths or ever having taken a comp-sci course:
Unsigned arithmetic modulo 256 is used, so that both the inputs and outputs fit into bytes.
What does that mean?
If someone can tell me that I'd be very grateful!
For reference, (and I apologise for the crappy C) my noddy implementation of the filtering algorithms described in the docs look like:
/* Paeth predictor: returns whichever of a (left), b (above) and c (upper
   left) is closest to a + b - c, preferring a, then b, then c on ties. */
unsigned char paeth_predictor (unsigned char a, unsigned char b, unsigned char c) {
    // a = left, b = above, c = upper left
    // The estimate ranges over -255..510 and the distances over 0..510, so
    // they must be kept in int; storing them in char truncates them and
    // selects the wrong neighbour.
    int p = a + b - c; // initial estimate
    int pa = abs(p - a); // distances to a, b, c
    int pb = abs(p - b);
    int pc = abs(p - c);
    // return nearest of a,b,c,
    // breaking ties in order a,b,c.
    if (pa <= pb && pa <= pc) return a;
    else if (pb <= pc) return b;
    else return c;
}
/* Undo the "Sub" filter for one row: each output byte is the input byte plus
   the output byte bpp positions to its left (0 for the first bpp bytes). */
void unfilter_sub(char* out, char* in, int bpp, int row, int rowlen) {
    int pos = 0;
    while (pos < rowlen) {
        char left = 0;
        if (pos >= bpp) {
            left = out[pos - bpp];
        }
        out[pos] = in[pos] + left;
        pos++;
    }
}
/* Undo the "Up" filter for one row: each output byte is the input byte plus
   the byte rowlen positions before it in out (0 when row is 0), so out must
   point into a buffer where the previous row directly precedes this one. */
void unfilter_up(char* out, char* in, int bpp, int row, int rowlen) {
    const int first_row = (row == 0);
    int pos = 0;
    while (pos < rowlen) {
        char above = 0;
        if (!first_row) {
            above = out[pos - rowlen];
        }
        out[pos] = in[pos] + above;
        pos++;
    }
}
/* Undo the "Paeth" filter for one row: each output byte is the input byte
   plus the Paeth prediction from its left, above and upper-left neighbours
   in out; neighbours that fall outside the image count as 0. */
void unfilter_paeth(char* out, char* in, int bpp, int row, int rowlen) {
    for (int pos = 0; pos < rowlen; pos++) {
        const int has_left = (pos >= bpp);
        char left = 0;
        char above = 0;
        char upper_left = 0;

        if (has_left) {
            left = out[pos - bpp];
        }
        if (!(row < 1)) {
            above = out[pos - rowlen];
        }
        if (has_left && row != 0) {
            upper_left = out[pos - rowlen - bpp];
        }
        out[pos] = in[pos] + paeth_predictor(left, above, upper_left);
    }
}
And the images I'm seeing:
Source
Source http://img220.imageshack.us/img220/8111/testdn.png
Output
Output http://img862.imageshack.us/img862/2963/helloworld.png

It means that, in the algorithm, whenever an arithmetic operation is performed, it is performed modulo 256, i.e. if the result is greater than 255 then it "wraps" around. The result is that all values will always fit into 8 bits and not overflow.
Unsigned types already behave this way by mandate, and if you use unsigned char (and a byte on your system is 8 bits, which it probably is), then your calculation results will naturally just never overflow beyond 8 bits.

It means only the last 8 bits of the result is used. 2^8=256, the last 8 bits of unsigned value v is the same as (v%256).
For example, 2+255=257, or 100000001, last 8 bits of 257 is 1, and 257%256 is also 1.

In 'simple language' it means that you never go "out" of your byte size.
For example in C# if you try this it will fail:
byte test = 255 + 255;
(1,13): error CS0031: Constant value '510' cannot be converted to a
'byte'
byte test = (byte)(255 + 255);
(1,13): error CS0221: Constant value '510' cannot be converted to a
'byte' (use 'unchecked' syntax to override)
For every calculation you have to do modulo 256 (C#: % 256).
Instead of writing % 256 you can also do AND 255:
(175 + 205) mod 256 = (175 + 205) AND 255
Some C# samples:
byte test = ((255 + 255) % 256);
// test: 254
byte test = ((255 + 255) & 255);
// test: 254
byte test = ((1 + 379) % 256);
// test: 124
byte test = ((1 + 379) & 0xFF);
// test: 124
Note that you sometimes can simplify a byte-series:
(byteVal1 + byteVal2 + byteVal3) % 256
= (((byteVal1 % 256) + (byteVal2 % 256)) % 256 + (byteVal3 % 256)) % 256

Print large base 256 array in base 10 in c

I have an array of unsigned chars in c I am trying to print in base 10, and I am stuck. I think this will be better explained in code, so, given:
/* Base-256 digits, least significant first: 1*256^0 + 2*256^1 + 3*256^2 */
unsigned char n[3] = {1, 2, 3};
I would like to print 197121.
This is trivial with small base 256 arrays: one can simply compute 1 * 256^0 + 2 * 256^1 + 3 * 256^2.
However, if my array was 100 bytes large, then this quickly becomes a problem. There is no integral type in C that is 100 bytes large, which is why I'm storing numbers in unsigned char arrays to begin with.
How am I supposed to efficiently print this number out in base 10?
I am a bit lost.

There's no easy way to do it using only the standard C library. You'll either have to write the function yourself (not recommended), or use an external library such as GMP.
For example, using GMP, you could do:
unsigned char n[100]; // number to print
mpz_t num;
mpz_init(num); // an mpz_t must be initialized before any GMP function stores into it
// 100 words of 1 byte each, least significant word first (order = -1),
// native endianness within a word (0), no nail bits (0)
mpz_import(num, 100, -1, 1, 0, 0, n); // convert byte array into GMP format
mpz_out_str(stdout, 10, num); // print num to stdout in base 10
mpz_clear(num); // free memory for num

When I saw this question I intended to solve it, but at that moment I was very busy.
This last weekend I gained a few precious hours of free time, so I took up my pending challenge.
First of all, I suggest you consider the answer above. I have never used the GMP library, but I'm sure it is a better solution than handmade code.
You may also be interested in analyzing the code of the bc calculator; it can work with big numbers, and I used it to test my own code.
OK, if you are still interested in code you write yourself (using only the C language and the standard C library), maybe I can give you something.
First of all, a little bit of theory. In basic number theory (at the level of modular arithmetic) there is an algorithm that inspired my solution: the multiply-and-power (square-and-multiply) algorithm for computing a^N modulo m:
Result := 1;
for i := k until i = 0
if n_i = 1 then Result := (Result * a) mod m;
if i != 0 then Result := (Result * Result) mod m;
end for;
Where k is the number of binary digits of N minus one, and n_i is the i-th binary digit of N. For instance (N is the exponent):
N = 44 -> 1 0 1 1 0 0
k = 5
n_5 = 1
n_4 = 0
n_3 = 1
n_2 = 1
n_1 = 0
n_0 = 0
When we apply a modulo operation, as with integer division, we can lose part of the number, so we only have to modify the algorithm so that it does not lose relevant data.
Here is my code (be aware that it is ad hoc code with a strong dependency on my computer architecture. Basically I rely on the sizes of the C data types, so be careful, because your data sizes may not be the same as mine):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/*
 * SHF:    bit index used to build BMASK
 * MODULE: radix of one array element (each element stores 9 decimal digits)
 * LIMIT:  main() computes i^i for 1 <= i < LIMIT
 */
enum { SHF = 31, MODULE = 1000000000UL, LIMIT = 1024 };
/*
 * Mask with only bit SHF set. It is built from an unsigned 1 because
 * shifting a signed 1 into bit 31 overflows int, and the resulting value
 * is not representable as an (int) enumeration constant.
 */
#define BMASK (1u << SHF)
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num);
unsigned int pow2BigNum(const unsigned int lim, unsigned int *nsrc, unsigned int *ndst);
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2);
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num);
/*
 * Print i^i in decimal for every i from 1 to LIMIT - 1, one result per
 * line in the form "i^i == <digits>".
 */
int main(void)
{
    int n;

    for (n = 1; n < LIMIT; ++n) {
        unsigned int *limbs;
        unsigned int count = bigNum(n, n, &limbs);
        unsigned int idx;

        printf("%i^%i == ", n, n);
        /* walk from the highest array element down to element 0; every
           element is printed zero-padded to 9 digits */
        for (idx = count; idx-- > 0; )
            printf("%09u", limbs[idx]);
        printf("\n");
        free(limbs);
    }
    return 0;
}
/*
bigNum: Compute base^exp and store it in a newly allocated array
#base: Base number
#exp: Exponent number (base^0 is 1, including for base 0)
#num: Receives the array holding the result, least significant element
      first, each element below MODULE. The caller must free() it.
Return: Array length of the result, or 0 (with *num set to NULL) if the
        memory could not be allocated
*/
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num)
{
    unsigned int m, lim, mem;
    unsigned int *v, *w, *k;
    double digits;

    /* Decimal digits of the result; log10() is only meaningful for base > 1
       (base 0 or 1 yields a single-element result). */
    digits = (base > 1) ? exp * log10((double) base) : 0.0;
    /* 9 digits per element, plus slack for the extra high elements that
       scaleBigNum and pow2BigNum write beyond the current length */
    mem = ((unsigned int) (digits / 9)) + 3;

    v = malloc((size_t) mem * sizeof *v);
    w = malloc((size_t) mem * sizeof *w);
    if (v == NULL || w == NULL) {
        free(v);
        free(w);
        *num = NULL;
        return 0;
    }

    /* m = highest set bit of exp (m ends up 0 only when exp == 0) */
    for (m = BMASK; m && ((m & exp) == 0); m >>= 1)
        ;

    /* The running product starts at 1, so exp == 0 correctly gives 1 */
    v[0] = 1;

    /* For every bit of exp above bit 0, from the top down:
       multiply by base if the bit is set, then square. */
    for (lim = 1; m > 1; m >>= 1) {
        if (exp & m)
            lim = scaleBigNum(base, lim, v);
        lim = pow2BigNum(lim, v, w);
        /* the square was written to w: swap the two buffers */
        k = v;
        v = w;
        w = k;
    }
    /* bit 0 only multiplies; there is no further squaring */
    if (exp & 0x1)
        lim = scaleBigNum(base, lim, v);

    free(w);
    *num = v;
    return lim;
}
/*
scaleBigNum: Make an (num[] <- scale*num[]) big number operation
#scale: Scalar that multiplies the big number
#lim: Length of source big number
#num: Source big number (array of unsigned int, least significant element
      first). Updated in place with the new value.
Return: Array length of operation result (lim, or lim + 1 if the product
        needed one more element)
Warning: num[lim] is always written, so the array must have room for
         lim + 1 elements. bigNum allocates enough for us.
*/
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num)
{
    unsigned int i;
    unsigned long long int carry = 0;

    for (i = 0; i < lim; ++i) {
        /* Add the incoming carry BEFORE reducing: reducing first and adding
           the carry afterwards can leave an element >= MODULE. The 64-bit
           product of a 16-bit and a 32-bit value plus a small carry cannot
           overflow. */
        unsigned long long int prod = (unsigned long long int) scale * num[i] + carry;

        num[i] = (unsigned int) (prod % MODULE);
        carry = prod / MODULE;
    }
    num[i] = (unsigned int) carry;
    return ( (num[i]) ? lim + 1 : lim );
}
/*
pow2BigNum: Make a (dst[] <- src[] * src[]) big number operation
#lim: Length of source big number
#src: Source big number (array of unsigned int, index 0 is multiplied into
      the lowest positions of dst)
#dst: Destination big number (array of unsigned int)
Return: Array length of operation result: 2 * lim if dst[2 * lim - 1] is
        non-zero, otherwise 2 * lim - 1
Warning: dst is indexed up to 2 * lim - 1, so it must have room for at
         least 2 * lim elements. bigNum allocates enough for us.
*/
unsigned int pow2BigNum(const unsigned int lim, unsigned int *src, unsigned int *dst)
{
unsigned int i, j;
unsigned long long int n, t;
// c is the highest index of dst that has been assigned directly so far.
// The expression ( (k > c) ? ((c = k), 0) : dst[k] ) used below treats a
// slot above c as 0 on its first use (and raises c) instead of reading it.
unsigned int k, c;
for(c = 0, dst[0] = 0, i = 0; i < lim; ++i)
{
// only pairs with j >= i are visited; the mirrored pair (j, i) is
// accounted for by doubling in the i != j branch
for(j = i, n = 0; j < lim; ++j)
{
// 64-bit product of two elements
n = ( (unsigned long long int) src[i] * src[j] );
k = i + j;
if(i != j)
{
// cross term counted twice: t is the doubled low part, n the doubled
// high part; the result is spread over three consecutive elements
t = 2 * (n % MODULE);
n = 2 * (n / MODULE);
// (i + j)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (t % MODULE);
++k; // (i + j + 1)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + ( (t / MODULE) + (n % MODULE) );
++k; // (i + j + 2)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
else
{
// square term src[i] * src[i]: low part at index i + j, high part above it
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n % MODULE);
++k; // (i + j + 1)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
// carry pass: reduce elements i + j .. lim + j - 1 below MODULE, pushing
// the excess into the next element
// NOTE(review): this can read and update dst[k + 1] at indices above c,
// i.e. slots not yet assigned; they are reset to 0 on their first direct
// use above -- confirm no real carry is ever stored there
for(k = i + j; k < (lim + j); ++k)
{
dst[k + 1] += (dst[k] / MODULE);
dst[k] %= MODULE;
}
}
}
// the square of a lim-element number has 2 * lim or 2 * lim - 1 elements
i = lim << 1;
return ((dst[i - 1]) ? i : i - 1);
}
/*
addBigNum: Make a (num2[] <- num1[] + num2[]) big number operation
#lim1: Length of source num1 big number. Must not exceed lim2
#num1: First source operand big number (array of unsigned int)
#lim2: Length of source num2 big number
#num2: Second source operand big number (array of unsigned int). Receives
       the sum
Return: Array length of operation result, or 0 if lim1 > lim2 (no
        operation is performed in that case)
Warning: num2[lim2] is always written, so num2 must have room for
         lim2 + 1 elements
*/
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2)
{
    unsigned long long int sum;
    unsigned int carry = 0;
    unsigned int i;

    if (lim1 > lim2)
        return 0;

    /* clear the element just above num2 so a final carry can land there */
    num2[lim2] = 0;

    /* add element by element, in 64 bits so the sum cannot wrap */
    for (i = 0; i < lim1; ++i) {
        sum = (unsigned long long int) num2[i] + num1[i] + carry;
        num2[i] = (unsigned int) (sum % MODULE);
        carry = (unsigned int) (sum / MODULE);
    }

    /* ripple the remaining carry upwards; each touched element is reduced
       below MODULE again (leaving it unreduced stored values >= MODULE) */
    for (; carry; ++i) {
        sum = (unsigned long long int) num2[i] + carry;
        num2[i] = (unsigned int) (sum % MODULE);
        carry = (unsigned int) (sum / MODULE);
    }

    return (lim2 > i) ? lim2 : i;
}
To compile:
gcc -o bgn <name>.c -Wall -O3 -lm //Math library if you wants to use log func
To check result, use direct output as and input to bc. Easy shell script:
#!/bin/bash
# Checks the program's output with bc: standard input is expected to carry
# lines of the form "<expression> == <number>".
# awk splits each line on "==" and prints the two sides joined by " == ";
# that text is piped to bc.
# NOTE(review): presumably bc prints 1 for a comparison that holds and 0
# otherwise -- confirm with the bc in use.
# NOTE(review): the words printed by bc become the item list of `select`,
# and the loop body runs a command literally named 0 -- confirm this is
# what was intended.
select S in ` awk -F '==' '{print $1 " == " $2 }' | bc`;
do
0;
done;
echo "Test Finished!";
We have an array of unsigned int (4 bytes each) where each element stores 9 decimal digits (% 1000000000UL); hence num[0] holds the first 9 digits (the least significant ones), num[1] holds digits 10 to 18, num[2]...
I use conventional memory to work with, but an improvement would be to use dynamic memory. OK, but how long should the array be (or how much memory do we need to allocate)? Using the bc calculator (bc -l, with the math library) we can determine how many digits a number has:
l(a^N) / l(10) // Natural logarith to Logarithm base 10
If we know digits, we know amount integers we needed:
( l(a^N) / (9 * l(10)) ) + 1 // Truncate result
If you work with value such as (2^k)^N you can resolve it logarithm with this expression:
( k*N*l(2)/(9*l(10)) ) + 1 // Truncate result
to determine the exactly length of integer array. Example:
256^800 = 2^(8*800) ---> l(2^(8*800))/(9*l(10)) + 1 = 8*800*l(2)/(9*l(10)) + 1
The constant 1000000000UL (10^9) is very important. A constant like 10000000000UL (10^10) doesn't work because it can produce an undetected overflow (try what happens with the number 16^16 and the 10^10 constant), and a smaller constant such as 100000000UL (10^8) is correct, but we need to reserve more memory and do more steps. 10^9 is the key constant for a 32-bit unsigned int and a 64-bit unsigned long long int.
The code has two parts: Multiply (easy) and Power by 2 (harder). Multiply just scales each element and propagates the integer overflow into the next one. It uses the distributive property: for k(A + B + C) we want kA + kB + kC, where the number will be k*A*10^18 + k*B*10^9 + k*C. Obviously, the k*C operation can generate a number bigger than 999 999 999, but never bigger than 0xFF FF FF FF FF FF FF FF. A number wider than 64 bits can never occur in a multiplication because C is a 32-bit unsigned integer and k is a 16-bit unsigned short. In the worst case, we will have this number:
k = 0x FF FF;
C = 0x 3B 9A C9 FF; // 999999999
n = k*C = 0x 3B 9A | 8E 64 36 01;
n % 1000000000 = 0x 3B 99 CA 01;
n / 1000000000 = 0x FF FE;
After the multiplication kB we need to add the 0x FF FE carried over from the previous multiplication of C ( B = kB + (C / module) ), and so on (we have 18 bits of arithmetic headroom, enough to guarantee correct values).
Power is more complex but is, in essence, the same problem (multiplication and addition), so here are some tips about the power code:
Data types are important, very important
If you multiply an unsigned integer by an unsigned integer, you get another unsigned integer. Use an explicit cast to unsigned long long int so that you don't lose data.
Always use the unsigned modifier, don't forget it!
Power by 2 can directly modify 2 index ahead of current index
gdb is your friend
I've developed another function that adds big numbers. I haven't tested this last one as much, but I think it works well. Don't be cruel to me if it has a bug.
...and that's all!
PD1: Developed in a
Intel(R) Pentium(R) 4 CPU 1.70GHz
Data length:
unsigned short: 2
unsigned int: 4
unsigned long int: 4
unsigned long long int: 8
Numbers such as 256^1024 it spend:
real 0m0.059s
user 0m0.033s
sys 0m0.000s
A loop that computes i^i for i = 1 ... 1024:
real 0m40.716s
user 0m14.952s
sys 0m0.067s
For numbers such as 65355^65355, spent time is insane.
PD2: My response is so late but I hope my code it will be usefull.
PD3: Sorry, explain me in english is one of my worst handicaps!
Last update: I have just had an idea: the same algorithm with a different implementation could improve the response time and reduce the amount of memory used (we could use all the bits of each unsigned int). The secret: n^2 = n * n = n * (n - 1 + 1) = n * (n - 1) + n.
(I will not write this new code myself, but if someone is interested, maybe after exams... )

I don't know if you still need a solution, but I wrote an article about this problem. It shows a very simple algorithm which can be used to convert an arbitrarily long number in base X to the corresponding number in base Y. The algorithm is written in Python, but it is really only a few lines long and doesn't use any Python magic. I needed such an algorithm for a C implementation, too, but decided to describe it using Python for two reasons. First, Python is very readable by anyone who understands algorithms written in a pseudo programming language and, second, I am not allowed to post the C version, because I wrote it for my company. Just have a look and you will see how easily this problem can be solved in general. An implementation in C should be straightforward...

Here is a function that does what you want:
#include <math.h>
#include <stddef.h> // for size_t
/*
 * getval: value of a base-256 number stored least significant byte first.
 * arr: the digits, arr[0] being the least significant
 * len: number of digits in arr
 * Returns the sum of arr[i] * 256^i as a double. A double cannot represent
 * every integer once the value needs more than its mantissa bits, so the
 * result is approximate for long arrays.
 */
double getval(unsigned char *arr, size_t len)
{
    double ret = 0;
    /* 256^cur, kept as a running product instead of calling pow() on every
       iteration; scaling by 256 (a power of two) is exact in a binary
       floating-point double */
    double weight = 1;
    size_t cur;

    for (cur = 0; cur < len; cur++) {
        ret += arr[cur] * weight;
        weight *= 256;
    }
    return ret;
}
That looks perfectly readable to me. Just pass the unsigned char * array you want to convert and the size. Note that it won't be perfect - for arbitrary precision, I suggest looking into the GNU MP BigNum library, as has been suggested already.
As a bonus, I don't like your storing your numbers in little-endian order, so here's a version if you want to store base-256 numbers in big-endian order:
#include <stddef.h> // for size_t
/*
 * getval_big_endian: value of a base-256 number stored most significant
 * byte first.
 * arr: the digits, arr[0] being the most significant
 * len: number of digits in arr
 * Returns the value as a double, accumulated digit by digit
 * (multiply the running value by 256, then add the next digit).
 */
double getval_big_endian(unsigned char *arr, size_t len)
{
    const unsigned char *digit = arr;
    size_t remaining = len;
    double acc = 0;

    while (remaining-- > 0) {
        acc *= 256;
        acc += *digit;
        digit++;
    }
    return acc;
}
Just things to consider.

It may be too late or too irrelevant to make this suggestion, but could you store each byte as two base 10 digits (or one base 100) instead of one base 256? If you haven't implemented division yet, then that implies all you have is addition, subtraction, and maybe multiplication; those shouldn't be too hard to convert. Once you've done that, printing it would be trivial.

As I was not satisfied with the other answers provided, I decided to write an alternative solution myself:
#include <stdlib.h>
#define BASE_256 256
/*
 * largenum2str: render a base-256 number as a decimal string.
 * num:     the base-256 digits, least significant first (num[len_num - 1]
 *          is the most significant digit); may be NULL only if len_num is 0
 * len_num: number of digits in num
 * Returns a newly allocated, NUL-terminated string with no leading zeros
 * ("0" for the value zero or for len_num == 0), or NULL if memory cannot
 * be allocated. The caller owns the string and must free() it.
 *
 * The value is built with Horner's scheme on an array of decimal digits:
 * starting from 0, for each base-256 digit from most to least significant,
 * value = value * 256 + digit.
 */
char *largenum2str(unsigned char *num, unsigned int len_num)
{
    /* Every base-256 digit contributes fewer than 3 decimal digits
       (256 < 1000); the extra slot covers the lone "0" when len_num is 0. */
    size_t cap = (size_t)len_num * 3 + 1;
    unsigned char *digits;   /* decimal digits, least significant first */
    size_t ndigits = 1;
    size_t d;
    unsigned int pos;
    char *str;

    digits = malloc(cap);
    if (digits == NULL)
        return NULL;
    digits[0] = 0;

    for (pos = len_num; pos > 0; pos--) {
        /* multiply the decimal number by 256 and add the next digit; the
           digit rides in as the initial carry */
        unsigned int carry = num[pos - 1];

        for (d = 0; d < ndigits; d++) {
            unsigned int v = digits[d] * (unsigned int)BASE_256 + carry;

            digits[d] = (unsigned char)(v % 10);
            carry = v / 10;
        }
        /* whatever carry is left becomes new high-order digits; the last
           one stored is non-zero, so no leading zeros are ever created */
        while (carry > 0) {
            digits[ndigits++] = (unsigned char)(carry % 10);
            carry /= 10;
        }
    }

    str = malloc(ndigits + 1);
    if (str != NULL) {
        /* digits are stored least significant first: reverse them */
        for (d = 0; d < ndigits; d++)
            str[d] = (char)('0' + digits[ndigits - 1 - d]);
        str[ndigits] = '\0';
    }
    free(digits);
    return str;
}
The idea behind it all derives from simple math. For any base-256 number, its base-10 representation can be calculated as:
num[i]*256^i + num[i-1]*256^(i-1) + (···) + num[2]*256^2 + num[1]*256^1 + num[0]*256^0
which expands to:
(((((num[i])*256 + num[i-1])*256 + (···))*256 + num[2])*256 + num[1])*256 + num[0]
So all we have to do is to multiply, step-by step, each element of the number array by 256 and add to it the next element, and so on... That way we can get the base-10 number.