I have got a vector of 16 char that contains either 0 or 1 and I would like to add each 4 non-overlapping elements using SSE.
A simplified version of the code without vectorization looks like this
/* Sixteen flags (each 0 or 1), viewed as four groups of four bytes. */
char a[16]={1,0,0,1 ,0,0,1,0, 0,1,0,0, 0,0,0,1};
/* One sum per non-overlapping group of four consecutive elements. */
char sum1 = a[0] + a[1] + a[2] + a[3];
char sum2 = a[4] + a[5] + a[6] + a[7];
char sum3 = a[8] + a[9] + a[10] + a[11];
char sum4 = a[12] + a[13] + a[14] + a[15];
In my application, the length of the vector is much larger than 16 but it is always a multiple of 16. I get this vector using other SSE logical operations that provide me with a good speedup so I would like to know how I could vectorise those additions. Below is the full code where vec1, vec2 and vec3 have the same length n (multiple of 16) and vector counts is n/4.
/*
 * For every 16-byte chunk, writes vec1 & vec2 into vec3. Each 4-byte group of
 * the result whose bytes sum to zero is then overwritten with the matching
 * group of vec1 | vec2, and the counter for that group is incremented.
 *
 * vec1, vec2, vec3: n bytes each; the pointers are dereferenced as __m128i,
 *                   so they must be 16-byte aligned.
 * counts:           n/4 counters, one per 4-byte group (incremented in place).
 * n:                number of bytes; the loop advances in steps of 16.
 *
 * Fix: the group index k was used without ever being declared; it is now a
 * local starting at 0. The four copy-pasted group tests are folded into a loop.
 */
void myfunc( const char *vec1, const char *vec2, char *vec3, int *counts, int n){
    const __m128i *r1 = (const __m128i*)vec1;
    const __m128i *r2 = (const __m128i*)vec2;
    char *a = vec3;
    char temp[16] __attribute__ ((aligned (16)));
    int k = 0; /* index of the current 4-byte group in counts */

    for ( int i = 0; i < n; i+=16, r1++, r2++, a+=16 ) {
        _mm_store_si128((__m128i*)a, _mm_and_si128(*r1, *r2));
        _mm_store_si128((__m128i*)temp, _mm_or_si128(*r1, *r2));

        for ( int g = 0; g < 16; g += 4, k++ ) {
            /* The question states the bytes are 0 or 1, so a zero sum means an all-zero group. */
            char size = a[g] + a[g+1] + a[g+2] + a[g+3];
            if( size == 0 ){
                memcpy(a + g, temp + g, 4*sizeof(char));
                counts[k]++;
            }
        }
    }
}
Any help would be greatly appreciated.
Instead of comparing bytes you can compare integers. Load four integers from a, temp and counts into SSE registers ( call them a4, tmp4, and counts4 in the code below). Then you can process four integers at once with SSE. This assumes that counts is a int32 array.
For example let's assume a4 = {0,3,0,4}, counts4 = {1,2,3,4}, and tmp4 = {5,6,7,8}. In the code below
test will be {-1, 0, -1, 0}. Subtracting that from counts gives counts = {2,2,4,4}. Logical AND of test with tmp4 is {5,0,7,0}. Adding that to a4 gives a4 = {5,3,7,4}. This should do what you want.
/*
 * Vectorised form of the per-group test: each 4-byte group is treated as one
 * 32-bit integer, so "all four bytes are zero" becomes "the integer is zero".
 * Fixes relative to the first draft:
 *   - the OR result is loaded from temp (the buffer it was stored to), not tmp;
 *   - the updated counters are stored back to &counts[k], not to counts[0];
 *   - counts is accessed with unaligned load/store because an int array is
 *     not guaranteed to be 16-byte aligned.
 */
for ( int i = 0; i < n; i+=16, r1++, r2++, a+=16, k+=4 ) {
    _mm_store_si128((__m128i*)a, _mm_and_si128(*r1, *r2));
    _mm_store_si128((__m128i*)temp, _mm_or_si128(*r1, *r2));
    __m128i a4 = _mm_load_si128((__m128i*)a);
    __m128i tmp4 = _mm_load_si128((__m128i*)temp);
    __m128i counts4 = _mm_loadu_si128((__m128i*)&counts[k]);
    /* test lane is all ones (-1) where the AND group is zero, else 0 */
    __m128i test = _mm_cmpeq_epi32(_mm_set1_epi32(0), a4);
    /* zero groups receive the OR group; non-zero groups are left unchanged */
    a4 = _mm_add_epi32(a4, _mm_and_si128(tmp4,test));
    /* subtracting -1 increments the counter of every zero group */
    counts4 = _mm_sub_epi32(counts4, test);
    _mm_store_si128((__m128i*)a, a4);
    _mm_storeu_si128((__m128i*)&counts[k], counts4);
}
Related
I am trying to make a convolution algorithm for grayscale bmp image. The below code is from Image processing course on Udemy, but the explanation about the variables and formula used was little short. The issue is in 2D discrete convolution part, im not able to understand the formula implemented here
/* Convolution kernel stored as a flat, row-major buffer. */
struct Mask{
int Rows; /* number of kernel rows */
int Cols; /* number of kernel columns */
/* Rows*Cols coefficients; the code in this file writes and reads them
   through a signed char pointer, so negative weights are stored here. */
unsigned char *Data;
};
/*
 * Builds a 5x5 mask (all -1 with 24 in the centre), convolves the input
 * bitmap with it and writes the result to a new file.
 *
 * Fixes: the centre of a 5x5 row-major mask is element 2*5+2 = 12 (the
 * original wrote 24 to element 13, one column to the right); the mask
 * allocation is now checked and released.
 */
int main()
{
    int imgWidth, imgHeight, imgBitDepth;
    unsigned char imgHeader[BMP_HEADER_SIZE];
    unsigned char imgColorTable[BMP_COLOR_TABLE_SIZE];
    unsigned char imgBuffer[CUSTOM_IMG_SIZE];
    unsigned char imgBuffer2[CUSTOM_IMG_SIZE];
    const char imgName[] = "images/cameraman.bmp";
    const char newImgName[] = "images/cameraman_new.bmp";
    struct Mask lpMask;
    signed char *tmp;
    int i;
    int maskSize;

    lpMask.Cols = lpMask.Rows = 5;
    maskSize = lpMask.Rows * lpMask.Cols;
    lpMask.Data = (unsigned char *)malloc(maskSize);
    if (lpMask.Data == NULL)
    {
        printf("Out of memory!\n");
        return 1;
    }

    /* -1 -1 -1 -1 -1
       -1 -1 -1 -1 -1
       -1 -1 24 -1 -1
       -1 -1 -1 -1 -1
       -1 -1 -1 -1 -1*/
    //set all mask values to -1 (stored through a signed view of the buffer)
    tmp = (signed char *)lpMask.Data;
    for (i = 0; i < maskSize; ++i)
    {
        tmp[i] = -1;
    }
    //set middle value (row Rows/2, column Cols/2) to 24
    tmp[(lpMask.Rows / 2) * lpMask.Cols + (lpMask.Cols / 2)] = 24;

    imageReader(imgName, &imgHeight, &imgWidth, &imgBitDepth, imgHeader, imgColorTable, imgBuffer);
    Convolve(imgHeight, imgWidth, &lpMask, imgBuffer, imgBuffer2);
    imageWriter(newImgName, imgHeader, imgColorTable, imgBuffer2, imgBitDepth);
    free(lpMask.Data);
    printf("Success!\n");
    return 0;
}
//2D Discrete Convolution
/*
 * 2D discrete convolution of a row-major 8-bit image with a signed mask:
 *   out(i, j) = sum over (m, n) of mask(m, n) * in(i - m, j - n),
 * where input samples above or to the left of the image are ignored.
 * The result is clamped to 0..255.
 *
 * imgRows, imgCols: image height and width in pixels.
 * myMask:           kernel; Data holds Rows*Cols signed coefficients.
 * input_buf:        imgRows*imgCols source pixels.
 * output_buf:       imgRows*imgCols destination pixels.
 *
 * Fixes:
 *  - element (r, c) of a row-major buffer lives at r * <row length> + c, and
 *    the row length is the number of COLUMNS; the original multiplied by the
 *    number of rows, which is only correct for square masks and images;
 *  - out-of-bounds taps are now really skipped; the original still added
 *    ms * im using the value of im left over from a previous tap.
 */
void Convolve(int imgRows, int imgCols, struct Mask *myMask, unsigned char *input_buf, unsigned char *output_buf)
{
    long i, j, m, n;

    for (i = 0; i < imgRows; ++i)
    {
        for (j = 0; j < imgCols; ++j)
        {
            int val = 0;

            for (m = 0; m < myMask->Rows; ++m)
            {
                for (n = 0; n < myMask->Cols; ++n)
                {
                    long idx = i - m; /* input row for this tap */
                    long jdx = j - n; /* input column for this tap */
                    int ms, im;

                    if (idx < 0 || jdx < 0)
                        continue; /* sample outside the image: contributes nothing */

                    ms = (signed char)myMask->Data[m * myMask->Cols + n];
                    im = input_buf[idx * imgCols + jdx];
                    val += ms * im;
                }
            }

            /* clamp to the range of an 8-bit pixel */
            if (val > 255) val = 255;
            if (val < 0) val = 0;
            output_buf[i * imgCols + j] = (unsigned char)val;
        }
    }
}
Here in 3 lines, the formula used is similar and most difficult to understand its implementation, if possible please help out with understanding these codes logic or what they are doing exactly:
ms = (signed char)*(myMask->Data + m * myMask->Rows + n);
im = *(input_buf + idx * imgRows + jdx);
tmp = output_buf + i * imgRows + j;
For formula/pseudocode used, check Convolution section on following website:- https://en.wikipedia.org/wiki/Kernel_(image_processing)
OR
$$g(x,y) = \sum_{k=-n_2}^{n_2} \; \sum_{j=-m_2}^{m_2} h(j,k)\, f(x-j,\; y-k),$$
where $m_2$ is half of the mask's width and $n_2$ is half of the mask's height.
OR
The expressions you ask about are simply the computation of a location of particular pixel indexed in 2 dimensions (row, column), stored in a flat memory buffer.
For example, ms = (signed char)*(myMask->Data + m * myMask->Rows + n); start with the mask image data buffer itself, myMask->Data, which is a pointer. The first row of data shows up first, followed by the second row. So to access the pixel at row m, column n, you first have to skip m rows of data, which is the size of a row * m. Then you have to skip n pixels inside the row. Once the location of the pixel is computed, it is dereferenced with *.
The only complaint I have for this example code is the name myMask->Rows. In this case, m represents a row index, and to compute the offset, it is multiplied by the size of a row, which should be the number of columns in the image, not the number of rows. So that reference should instead be myMask->Cols.
I am given a array of lowercase characters (up to 1.5Gb) and a character c. And I want to find how many occurrences are of the character c using AVX instructions.
/*
 * Counts the bytes equal to c in vector[0..size).
 * Each of the 32 byte lanes keeps its own counter spread over con (3) byte
 * planes: ans[0] is the low byte, ans[1] the next, ans[2] the highest.
 * The loop reads 32 bytes per iteration through an __m256i pointer.
 * NOTE(review): that dereference presumably needs vector to be 32-byte
 * aligned and size to be a multiple of 32 - confirm against the caller.
 * NOTE(review): with three byte planes, a lane that sees more than 2^24 - 1
 * matches would wrap - confirm this cannot happen for the expected inputs.
 */
unsigned long long char_count_AVX2(char * vector, int size, char c){
unsigned long long sum =0;
int i, j;
const int con=3;
__m256i ans[con];
for(i=0; i<con; i++)
ans[i]=_mm256_setzero_si256();
__m256i Zer=_mm256_setzero_si256();
__m256i C=_mm256_set1_epi8(c);
__m256i Assos=_mm256_set1_epi8(0x01);
__m256i FF=_mm256_set1_epi8(0xFF);
__m256i shield=_mm256_set1_epi8(0xFF);
__m256i temp;
/* couter is incremented but never read */
int couter=0;
for(i=0; i<size; i+=32){
couter++;
/* shield = 0xFF in lanes whose low counter byte is non-zero BEFORE this step */
shield=_mm256_xor_si256(_mm256_cmpeq_epi8(ans[0], Zer), FF);
/* cmpeq gives 0xFF on a match; xor with FF gives 0x00 on a match;
   adding 1 then yields 0x01 on a match and 0x00 (0xFF + 1 wraps) otherwise */
temp=_mm256_cmpeq_epi8(C, *((__m256i*)(vector+i)));
temp=_mm256_xor_si256(temp, FF);
temp=_mm256_add_epi8(temp, Assos);
ans[0]=_mm256_add_epi8(temp, ans[0]);
for(j=1; j<con; j++){
/* a lane carries into plane j when plane j-1 has just become 0 and the
   shield from the lower planes is still set */
temp=_mm256_cmpeq_epi8(ans[j-1], Zer);
shield=_mm256_and_si256(shield, temp);
/* same trick as above: turn the 0xFF mask into a 0x01 increment */
temp=_mm256_xor_si256(shield, FF);
temp=_mm256_add_epi8(temp, Assos);
ans[j]=_mm256_add_epi8(temp, ans[j]);
}
}
/* combine the planes from most to least significant: sum = sum*256 + (sum of the 32 bytes of plane j) */
for(j=con-1; j>=0; j--){
sum<<=8;
unsigned char *ptr = (unsigned char*)&(ans[j]);
for(i=0; i<32; i++){
sum+=*(ptr+i);
}
}
return sum;
}
I'm intentionally leaving out some parts, which you need to figure out yourself (e.g. handling lengths that aren't a multiple of 4*255*32 bytes), but your most inner loop should look something like the one starting with for(int i...):
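_mm256_cmpeq_epi8 will get you a -1 in each matching byte, which you can use as an integer. If you subtract that from a counter (using _mm256_sub_epi8) you can directly count up to 255 (or 128 if you treat the bytes as signed). The inner loop contains just these two intrinsics. You have to stop after at most 255 iterations and add the byte counters into wider (64-bit) accumulators before they overflow.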
#include <immintrin.h>
#include <stdint.h>
static inline
__m256i hsum_epu8_epu64(__m256i v) {
return _mm256_sad_epu8(v, _mm256_setzero_si256()); // SAD against zero is a handy trick
}
static inline
uint64_t hsum_epu64_scalar(__m256i v) {
__m128i lo = _mm256_castsi256_si128(v);
__m128i hi = _mm256_extracti128_si256(v, 1);
__m128i sum2x64 = _mm_add_epi64(lo, hi); // narrow to 128
hi = _mm_unpackhi_epi64(sum2x64, sum2x64);
__m128i sum = _mm_add_epi64(hi, sum2x64); // narrow to 64
return _mm_cvtsi128_si64(sum);
}
unsigned long long char_count_AVX2(char const* vector, size_t size, char c)
{
__m256i C=_mm256_set1_epi8(c);
// todo: count elements and increment `vector` until it is aligned to 256bits (=32 bytes)
__m256i const * simd_vector = (__m256i const *) vector;
// *simd_vector is an alignment-required load, unlike _mm256_loadu_si256()
__m256i sum64 = _mm256_setzero_si256();
size_t unrolled_size_limit = size - 4*255*32 + 1;
for(size_t k=0; k<unrolled_size_limit ; k+=4*255*32) // outer loop: TODO
{
__m256i counter[4]; // multiple counter registers to hide latencies
for(int j=0; j<4; j++)
counter[j]=_mm256_setzero_si256();
// inner loop: make sure that you don't go beyond the data you can read
for(int i=0; i<255; ++i)
{ // or limit this inner loop to ~22 to avoid branch mispredicts
for(int j=0; j<4; ++j)
{
counter[j]=_mm256_sub_epi8(counter[j], // count -= 0 or -1
_mm256_cmpeq_epi8(*simd_vector, C));
++simd_vector;
}
}
// only need one outer accumulator: OoO exec hides the latency of adding into it
sum64 = _mm256_add_epi64(sum64, hsum_epu8_epu64(counter[0]));
sum64 = _mm256_add_epi64(sum64, hsum_epu8_epu64(counter[1]));
sum64 = _mm256_add_epi64(sum64, hsum_epu8_epu64(counter[2]));
sum64 = _mm256_add_epi64(sum64, hsum_epu8_epu64(counter[3]));
}
uint64_t sum = hsum_epu64_scalar(sum64);
// TODO add up remaining bytes with sum.
// Including a rolled-up vector loop before going scalar
// because we're potentially a *long* way from the end
// Maybe put some logic into the main loop to shorten the 255 inner iterations
// if we're close to the end. A little bit of scalar work there shouldn't hurt every 255 iters.
return sum;
}
Godbolt link: https://godbolt.org/z/do5e3- (clang is slightly better than gcc at unrolling the most inner loop: gcc includes some useless vmovdqa instructions that will bottleneck the front-end if the data is hot in L1d cache, preventing us from running close to 2x 32-byte loads per clock)
If you don't insist on using only SIMD instructions, you can make use
of the VPMOVMSKB instruction in combination with the POPCNT instruction. The former combines the highest bits of each byte into a 32-bit integer mask and the latter counts the 1 bits in this integer (=the count of char matches).
/* Sketch only: the "..." lines stand for code omitted by the author. */
/* couter accumulates the number of bytes equal to C seen so far. */
int couter=0;
for(i=0; i<size; i+=32) {
...
/* cmpeq sets every byte of a matching lane to 0xFF; movemask packs the top
   bit of each of the 32 bytes into one integer; popcnt counts the set bits,
   i.e. the matches in this 32-byte chunk. */
couter +=
_mm_popcnt_u32(
(unsigned int)_mm256_movemask_epi8(
_mm256_cmpeq_epi8( C, *((__m256i*)(vector+i) ))
)
);
...
}
I haven't tested this solution, but you should get the gist.
Probably the fastest: memcount_avx2 and memcount_sse2
size_t memcount_avx2(const void *s, int c, size_t n)
{
__m256i cv = _mm256_set1_epi8(c),
zv = _mm256_setzero_si256(),
sum = zv, acr0,acr1,acr2,acr3;
const char *p,*pe;
for(p = s; p != (char *)s+(n- (n % (252*32)));)
{
for(acr0 = acr1 = acr2 = acr3 = zv, pe = p+252*32; p != pe; p += 128)
{
acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
acr1 = _mm256_sub_epi8(acr1, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p+32))));
acr2 = _mm256_sub_epi8(acr2, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p+64))));
acr3 = _mm256_sub_epi8(acr3, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)(p+96))));
__builtin_prefetch(p+1024);
}
sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr1, zv));
sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr2, zv));
sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr3, zv));
}
for(acr0 = zv; p+32 < (char *)s + n; p += 32)
acr0 = _mm256_sub_epi8(acr0, _mm256_cmpeq_epi8(cv, _mm256_lddqu_si256((const __m256i *)p)));
sum = _mm256_add_epi64(sum, _mm256_sad_epu8(acr0, zv));
size_t count = _mm256_extract_epi64(sum, 0)
+ _mm256_extract_epi64(sum, 1)
+ _mm256_extract_epi64(sum, 2)
+ _mm256_extract_epi64(sum, 3);
while(p != (char *)s + n)
count += *p++ == c;
return count;
}
Benchmark skylake i7-6700 - 3.4GHz - gcc 8.3:
memcount_avx2 : 28 GB/s
memcount_sse: 23 GB/s
char_count_AVX2 : 23 GB/s (from post)
Let ib be the input base and ob the output base. str is the ASCII representation of some arbitrary large integer x. I need to define f such as:
f(str="1234567890", ib=10, ob=16) = {4, 9, 9, 6, 0, 2, 13, 2}
... where the return type of f is an int array containing the base ob digits of this integer. We assume that 2 <= ob <= INT_MAX and 2 <= ib <= 10, and str will always be a valid string (no negative needed).
Something to get OP started, but enough to leave OP to enjoy the coding experience.
// Intended contract (the body is an outline only; nothing is implemented yet):
// form (*d) = (*d)*a + b, where d[0..*width) holds digits in base `ob`.
// The outline below may widen d, so *width is passed by pointer.
static void mult_add(int *d, size_t *width, int ob, int a, int b) {
// set b as the carry
// for *width elements,
// x = (Multiply d[] by `a` (using wider than int math) and add carry)
// d[] = x mod ob
// carry = x/ob
// while (carry <> 0)      i.e. while the carry is non-zero
// widen d                 (append one more digit and increment *width)
// x = carry
// d[] = x mod ob
// carry = x/ob
}
/*
 * Skeleton of the base converter: reads the base-`ib` digit string `src` and
 * feeds it, digit by digit, into mult_add() to build the base-`ob` digits in
 * a freshly allocated array, which is returned (the caller owns it).
 * Appending the -1 end marker is still to be written.
 */
int *ql_f(const char *src, int ib, int ob) {
  // Argument checks
  assert(ib >= 2 && ib <= 10);
  assert(ob >= 2 && ob <= INT_MAX);
  assert(src);

  // Size the digit buffer; the "+ 2 + 4" is deliberate slack, to be right-sized later
  size_t length = strlen(src);
  size_t dsize = (size_t) (log(ib)/log(ob)*length + 2 + 4);
  int *digits = malloc(sizeof *digits * dsize);
  assert(digits);

  // The accumulator starts as the single digit 0
  size_t width = 1;
  digits[0] = 0;

  // Horner scheme: value = value * ib + next digit
  for (const char *p = src; *p != '\0'; p++) {
    mult_add(digits, &width, ob, ib, *p - '0');
  }

  // add -1 to end, TBD code
  return digits;
}
I wrote this with older specifications, so it's not valid any more, but it might be useful as a starting point.
The code can handle long long magnitudes. Going to arbitrary precision numbers in C is a big leap!
Note using -1 as the ending marker instead of 0. Can accept ib from 2 to 36 and any ob.
Includes example main.
Function f is not reentrant as-is. To make it thread-safe, it could allocate the required memory then return a pointer to it. The simplest protocol would be having the caller responsible for freeing the memory afterwards.
#include <stdlib.h>
#include <limits.h>
#include <stdio.h>
/*
 * Converts the base-`ib` number in `str` (parsed with strtoll) to base `ob`.
 *
 * Returns a pointer into a static buffer holding the digits, most significant
 * first, terminated by -1. The buffer is overwritten by the next call, so the
 * function is not reentrant.
 *
 * Fixes:
 *  - zero now yields the single digit 0 instead of an empty digit list;
 *  - negative values are converted by magnitude (the original produced
 *    negative "digits", which look like the end marker);
 *  - ob < 2 returns an empty list instead of dividing by zero or looping
 *    forever.
 */
int *f(const char *str, int ib, int ob) {
    /* 64 binary digits are the worst case (ob == 2), plus one end marker. */
    static int result[CHAR_BIT * sizeof(long long) + 1];
    size_t i = sizeof(result) / sizeof(result[0]) - 1;
    long long l = strtoll(str, NULL, ib);
    /* magnitude computed in unsigned arithmetic so LLONG_MIN does not overflow */
    unsigned long long mag = (l < 0) ? 0ULL - (unsigned long long)l
                                     : (unsigned long long)l;

    result[i] = -1;
    if (ob < 2) {
        return &result[i];
    }

    /* do/while so that zero still emits one digit; digits are produced least
       significant first and stored from the back of the buffer */
    do {
        result[--i] = (int)(mag % (unsigned long long)ob);
        mag /= (unsigned long long)ob;
    } while (mag != 0);

    return &result[i];
}
int main()
{
int *x = f("1234567890", 16, 10);
while (*x > -1) {
printf("%d ", *x);
x++;
}
return 0;
}
suppose I have n1 and n2 I want to multiply them
for example I have array
n1={1,2,3};
and in
n2={5,6}
they are two integers in n1 we have the 123 and in n2 56
123*56=6888
then in result I should have
result = {6,8,8,8}
here is the incomplete algorithm which I thought
for(i in n1 bigger array)
for(j in n2 smaller one)
{
mult=n1[i]*n2[j]
mult+= carry;
if(mult>=10)
{
carry = (mult/10);
mult-= (carry*10);
}
}
}
How can I write it? I don't know the place of store
after finishing the insider loop I should store num in array and then compute again and...
How should I write it? I searched the whole of overflow here but I didn't find about it in c code
The Goal is to Compute the Large numbers Integer Numbers has 8 Bytes,in other words 64 bits so they can store 2pow64-1 which is 19 digits now this will help to compute very larger than 19 digits
It would be slightly easier if your digit-arrays were little-endian. Then your example multiplication would look
3 2 1 * 6 5
---------------
18 12 6
15 10 5
---------------
18 27 16 5 // now propagate carries
8 28 16 5
8 8 18 5
8 8 8 6
============
The product of n1[i] and n2[j] would contribute to result[i+j]. The main loop could roughly look like
/* With little-endian digits, digit i of n1 times digit j of n2 has weight
   10^(i+j), so every partial product is accumulated into result[i+j]. */
for (i = 0; i < l1; ++i) // l1 is length of n1
{
for (j = 0; j < l2; ++j) // l2 is length of n2
{
result[i+j] += n1[i]*n2[j];
}
}
// now carry propagation
You see that the result must be at least (l1-1) + (l2-1) + 1 long, since the product of the most significant digits goes int result[(l1-1) + (l2-1)]. On the other hand, n1 < 10^l1 and n2 < 10^l2, so the product is < 10^(l1+l2) and you need at most l1+l2 digits.
But if you're working with char (signed or unsigned), that will quickly overflow in each digit, since (for k <= min(l1-1,l2-1)) k+1 products of two digits (each can be as large as 81) contribute to digit k of the product.
So it's better to perform the multiplication grouped according to the result digit, accumulating in a larger type, and doing carry propagation on writing the result digit. With little-endian numbers
/*
 * Multiplies two non-negative decimal numbers given as little-endian ASCII
 * digit arrays (n1[0] / n2[0] is the least significant digit).
 *
 * n1, l1: first factor and its number of digits.
 * n2, l2: second factor and its number of digits.
 * rl:     receives the number of digits of the product.
 *
 * Returns a newly allocated, NUL-terminated little-endian digit string that
 * the caller must free, or NULL (with *rl == 0) if allocation fails. If
 * either factor has no digits, an empty string is returned with *rl == 0.
 *
 * Changes: allocation failure and empty operands are handled (the original
 * dereferenced NULL / underflowed l1+l2-1); the carry is kept in a scalar
 * instead of being spread over the char cells; and the second allocation is
 * gone, because when the top digit is absent its cell is still the zero from
 * calloc and already acts as the terminator.
 */
char *mult(char *n1, size_t l1, char *n2, size_t l2, size_t *rl)
{
    size_t total, k, i;
    unsigned long carry = 0;
    char *result;

    if (l1 == 0 || l2 == 0)
    {
        *rl = 0;
        return calloc(1, 1);
    }

    // l1+l2 digits at most, plus the terminator
    total = l1 + l2;
    result = calloc(total + 1, 1);
    if (result == NULL)
    {
        *rl = 0;
        return NULL;
    }

    // digit k collects every product n1[i]*n2[k-i]; the top digit (k == total-1)
    // can only come from the final carry
    for (k = 0; k + 1 < total; ++k)
    {
        unsigned long accum = carry;
        for (i = (k < l2) ? 0 : k-(l2-1); i <= k && i < l1; ++i)
        {
            accum += (n1[i] - '0') * (n2[k-i] - '0');
        }
        result[k] = (char)(accum % 10 + '0');
        carry = accum / 10;
    }

    if (carry != 0)
    {
        result[total-1] = (char)(carry + '0');
        *rl = total;
    }
    else
    {
        *rl = total - 1;
    }
    return result;
}
For big-endian numbers, the indexing has to be modified - you can figure that out yourself, hopefully - but the principle remains the same.
Indeed, the result isn't much different after tracking indices with pencil and paper:
/*
 * Multiplies two non-negative decimal numbers given as big-endian ASCII
 * digit arrays (n1[0] / n2[0] is the most significant digit).
 *
 * n1, l1: first factor and its number of digits.
 * n2, l2: second factor and its number of digits.
 * rl:     receives the number of digits of the product.
 *
 * Returns a newly allocated, NUL-terminated big-endian digit string that the
 * caller must free, or NULL (with *rl == 0) if allocation fails. If either
 * factor has no digits, an empty string is returned with *rl == 0.
 *
 * Changes: allocation failure and empty operands are handled; the carry is
 * kept in a scalar; and when the leading digit is absent the digits are
 * shifted down in place instead of being copied into a second, unchecked
 * allocation.
 */
char *mult(char *n1, size_t l1, char *n2, size_t l2, size_t *rl)
{
    size_t total, k, i;
    unsigned long carry = 0;
    char *result;

    if (l1 == 0 || l2 == 0)
    {
        *rl = 0;
        return calloc(1, 1);
    }

    // we need (l1+l2-1) or (l1+l2) digits for the product and a 0-terminator
    total = l1 + l2;
    result = calloc(total + 1, 1);
    if (result == NULL)
    {
        *rl = 0;
        return NULL;
    }

    // calculate the product from least significant digit (result[total-1])
    // to most significant; result[0] can only be nonzero by carry propagation.
    // Products n1[i]*n2[j] with i+j+1 == k contribute to result[k].
    for (k = total - 1; k > 0; --k)
    {
        unsigned long accum = carry; // start with carry
        for (i = (k < l2) ? 0 : k-l2; i < k && i < l1; ++i)
        {
            accum += (n1[i] - '0') * (n2[k-1-i] - '0');
        }
        result[k] = (char)(accum % 10 + '0');
        carry = accum / 10;
    }

    if (carry != 0)
    {
        result[0] = (char)(carry + '0'); // make it an ASCII digit
        *rl = total;
        return result;
    }

    // no carry in digit 0: drop the unused leading cell by shifting everything
    // (including the terminator at result[total]) one place to the left
    for (i = 0; i < total; ++i)
    {
        result[i] = result[i+1];
    }
    *rl = total - 1;
    return result;
}
Edit: added 0-terminators
Note: these are not NUL-terminated (unsigned) char arrays, so we need to keep length information (that's good to do anyway), hence it would be better to store that info together with the digit array in a struct. Also, as written it only works for positive numbers. Dealing with negative numbers is awkward if you only have raw arrays, so another point for storing additional info.
Keeping the digits as '0' + value doesn't make sense for the computations, it is only convenient for printing, but that only if they were NUL-terminated arrays. You may want to add a slot for the NUL-terminator then. In that case, the parameter rl in which we store the length of the product is not strictly necessary.
Definitely an interesting problem.
Here was my thought:
For the given array, append each value to the end of a string. Thus you construct a string of the numbers in order. {1,2,3} = "123"
Then, you use a "ToInteger" method that you can find in one of the C libraries. Now you have your number to multiply with.
With this logic, you can probably look up how the "ToInteger" or "ToString" methods work with numbers, which would lead to an answer.
Think how you would do it on paper, since you are simulating multiplying two decimal numbers. For starters, I think you'd go from least significant to most significant digit, so you'd be counting down the indexes (2, 1, 0 for the larger array; 1, 0 for the smaller). Also, you'd somehow have to arrange that when you multiply by n2[0] (the 5 in 56), you start adding at the tens place, not the units.
You won't find complete C code for your problem at SO. Your first approach isn't that bad. You could do the following:
Multiply n1 and n2, conversion is done by mulitplication and addition, i. e. a{1,2,3} -> 1*100 + 2*10 + 3*1, easy to implement
Count the digits of your multiplication result (use division inside a loop)
While looping through the digits you can store them back into another array
If you can't or if you don't want to deal with dynamic array allocation, then think about how big your array for storage must be beforehand and perform a static allocation.
Edit
Based on the discussion another approach:
Suppose, that r = n1 * n2
Create a n*m 2D array, where
n = number of digits in n2
m = number of digits in n1 + 1
Within a loop multiply each digit of n1 with one of the elements of n2, store the result in the array, store the result per-digit in the 2D-array, don't forget to add the carry to each digit
Repeat 2 with all other digits of n2
Now the array is filled and you'll have to add each digits like you would do it on paper, store each result within a target array, take care of the carry again
There is one thing left in the algorithm: Determine the size of the target array, based on the informations within the intermediate array, you can think about this by using pencil and paper ;)
This code isn't optimized, nor does it account for generic lengths of arrays/numbers, but it should give you the general idea of how to implement the algorithm:
(This is similar to string-to-int or int-to-string algorithms, just add the ASCII offset to each item of the array and you have it.)
#include <stdio.h>
#include <stdint.h>
#define N1_N 3
#define N2_N 2
#define MAX_N 4 /* maximum array length allowed */
void print_array (const uint8_t* array, size_t size);
uint32_t array_to_ulong (const uint8_t* array, size_t size);
size_t ulong_to_array (uint8_t* array, size_t size, uint32_t val);
/*
 * Demo: multiplies the numbers stored digit-by-digit in n1 and n2 by
 * converting them to integers, and prints the product as a digit array.
 *
 * Fixes: result is a uint32_t, so printing it with "%d" had a mismatched
 * format specifier; it is now cast to unsigned long and printed with "%lu".
 * n3 is zero-initialised so no indeterminate element can ever be printed.
 */
int main()
{
    uint8_t n1[N1_N] = {1,2,3};
    uint8_t n2[N2_N] = {5,6};
    uint8_t n3[MAX_N] = {0};
    size_t n3_size = MAX_N;
    uint32_t n1_int;
    uint32_t n2_int;
    uint32_t result;

    print_array(n1, N1_N);
    printf(" * ");
    print_array(n2, N2_N);

    n1_int = array_to_ulong (n1, N1_N);
    n2_int = array_to_ulong (n2, N2_N);
    result = n1_int * n2_int;
    printf(" = %lu = ", (unsigned long)result);

    n3_size = ulong_to_array (n3, n3_size, result);
    print_array(n3, n3_size);

    getchar(); /* keep the console window open until a key is pressed */
    return 0;
}
/* Prints the `size` elements of `array` as "{a, b, c}" (no trailing newline). */
void print_array (const uint8_t* array, size_t size)
{
    size_t pos;

    printf("{");
    for (pos = 0; pos < size; ++pos)
    {
        /* the separator goes between elements, i.e. before every element but the first */
        if (pos != 0)
        {
            printf(", ");
        }
        printf("%d", array[pos]);
    }
    printf("}");
}
/*
 * Interprets array[0..size) as decimal digits, most significant first, and
 * returns their value. Arithmetic is done in uint32_t, so values that do not
 * fit wrap modulo 2^32.
 */
uint32_t array_to_ulong (const uint8_t* array, size_t size)
{
    uint32_t value = 0;
    const uint8_t *digit;

    /* Horner scheme: shift the running value one decimal place, add the next digit */
    for (digit = array; digit != array + size; ++digit)
    {
        value = value * 10u + *digit;
    }
    return value;
}
/*
 * Writes the decimal digits of val, most significant first, to
 * array[0..count) and returns count.
 *
 * array: destination with room for `size` digits.
 * size:  capacity of array; if val has more digits than that, only the
 *        `size` least significant digits are stored (as before).
 * val:   value to convert.
 *
 * Fixes: the original stored the digits right-aligned at the END of the
 * array but returned only their count, so a caller reading array[0..count)
 * saw the wrong elements whenever val had fewer than `size` digits. The
 * digits now start at array[0]. A value of 0 yields the single digit 0
 * instead of an empty result.
 */
size_t ulong_to_array (uint8_t* array, size_t size, uint32_t val)
{
    size_t count = 0;
    size_t i;
    uint32_t rest = val;

    if (size == 0)
    {
        return 0;
    }

    /* count the digits first so they can be written left-aligned */
    do
    {
        ++count;
        rest /= 10;
    } while (rest != 0);

    if (count > size)
    {
        count = size;
    }

    /* fill from the least significant digit backwards */
    for (i = count; i-- > 0; )
    {
        array[i] = (uint8_t)(val % 10);
        val /= 10;
    }
    return count;
}
12345 * 6789 is:
12345 * 6 * 1000 +
12345 * 7 * 100 +
12345 * 8 * 10 +
12345 * 9 * 1
and that is:
1 * 6*1000 * 10000 + 2 * 6*1000 * 1000 + 3 * 6*1000 * 100 + 4 * 6*1000 * 10 + 5 * 6*1000 * 1 +
1 * 7*100 * 10000 + 2 * 7*100 * 1000 + 3 * 7*100 * 100 + 4 * 7*100 * 10 + 5 * 7*100 * 1 +
1 * 8*10 * 10000 + 2 * 8*10 * 1000 + 3 * 8*10 * 100 + 4 * 8*10 * 10 + 5 * 8*10 * 1 +
1 * 9*1 * 10000 + 2 * 9*1 * 1000 + 3 * 9*1 * 100 + 4 * 9*1 * 10 + 5 * 9*1 * 1
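So the algorithm is: multiply each digit of one number by each digit of the other and add (accumulate) the product into the appropriate result array element (1000 is 10^3, so it goes to array element 3, with the array starting at zero).
Then move through the result array, starting at the least significant element, and for every element of 10 or more keep the value modulo ten and carry the quotient of the division by ten into the next more significant element.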
#include<stdio.h>
#include<math.h>
#include<stdlib.h>
#include<string.h>
#define MAX 10000
char * multiply(char [],char[]);
/*
 * Reads two decimal numbers from standard input and prints their product.
 *
 * Fixes: scanf("%s") had no field width, so a long input overran a[] / b[];
 * the width below is MAX - 1 (keep it in sync with MAX), and a failed read
 * now ends the program instead of multiplying uninitialised buffers. The
 * unused locals la, lb and i are removed.
 */
int main(){
    char a[MAX];
    char b[MAX];
    char *c;

    printf("Enter the first number : ");
    if (scanf("%9999s", a) != 1) {
        return 1;
    }
    printf("Enter the second number : ");
    if (scanf("%9999s", b) != 1) {
        return 1;
    }
    printf("Multiplication of two numbers : ");
    c = multiply(a,b);
    printf("%s",c);
    return 0;
}
/*
 * Multiplies two non-negative decimal numbers given as NUL-terminated ASCII
 * digit strings (most significant digit first).
 *
 * Returns a pointer to a static buffer holding the product as a decimal
 * string without leading zeros; the buffer is overwritten by the next call.
 * If either operand is empty, or the product could need more than MAX - 1
 * digits, an empty string is returned.
 *
 * Fixes relative to the row-by-row version:
 *  - a[] and b[] are no longer modified (the original subtracted '0' in place);
 *  - no intermediate table of partial products: the original stored all of
 *    them in temp[MAX] and overflowed it for long operands;
 *  - leading zeros are stripped (e.g. "2" * "3" used to give "06").
 */
char * multiply(char a[],char b[]){
    static char mul[MAX];
    size_t la = strlen(a);
    size_t lb = strlen(b);
    size_t n, k, i, skip;
    unsigned long carry = 0;

    mul[0] = '\0';
    if (la == 0 || lb == 0 || la + lb + 1 > MAX) {
        return mul;
    }

    /* The product has at most la + lb digits. Position k is counted from the
       least significant digit and receives every product of digit i of a and
       digit k - i of b (both counted from the right), plus the carry. */
    n = la + lb;
    for (k = 0; k < n; k++) {
        unsigned long acc = carry;
        for (i = (k < lb) ? 0 : k - (lb - 1); i <= k && i < la; i++) {
            int da = a[la - 1 - i] - '0';
            int db = b[lb - 1 - (k - i)] - '0';
            acc += (unsigned long)(da * db);
        }
        mul[n - 1 - k] = (char)('0' + acc % 10);
        carry = acc / 10;
    }
    mul[n] = '\0';

    /* drop leading zeros but keep at least one digit */
    skip = 0;
    while (skip + 1 < n && mul[skip] == '0') {
        skip++;
    }
    if (skip != 0) {
        memmove(mul, mul + skip, n - skip + 1);
    }
    return mul;
}
I have an array of unsigned chars in c I am trying to print in base 10, and I am stuck. I think this will be better explained in code, so, given:
/* Base-256 digits, least significant first: n[0] = 1, n[1] = 2, n[2] = 3.
   (The original wrote `char[0] = 1;`, which is not valid C.) */
unsigned char n[3] = { 1, 2, 3 };
I would like to print 197121.
This is trivial with small base 256 arrays. One can simply 1 * 256 ^ 0 + 2 * 256 ^ 1 + 3 * 256 ^ 2.
However, if my array was 100 bytes large, then this quickly becomes a problem. There is no integral type in C that is 100 bytes large, which is why I'm storing numbers in unsigned char arrays to begin with.
How am I supposed to efficiently print this number out in base 10?
I am a bit lost.
There's no easy way to do it using only the standard C library. You'll either have to write the function yourself (not recommended), or use an external library such as GMP.
For example, using GMP, you could do:
unsigned char n[100]; // number to print
mpz_t num;
mpz_init(num); // an mpz_t must be initialised before any other mpz function uses it
// 100 words of 1 byte each, least significant word first (order = -1), no nail bits
mpz_import(num, 100, -1, 1, 0, 0, n); // convert byte array into GMP format
mpz_out_str(stdout, 10, num); // print num to stdout in base 10
mpz_clear(num); // free memory for num
When I saw this question, I purpose to solve it, but at that moment I was very busy.
This last weekend I've could gain some prize hours of free time so I considered my pending challenge.
First of all, I suggest you to considered above response. I never use GMP library but I'm sure that it's better solution than a handmade code.
Also, you could be interest to analyze code of bc calculator; it can works with big numbers and I used to test my own code.
Ok, if you are still interested in a code do it by yourself (only with support C language and Standard C library) may be I can give you something.
Before all, a little bit theory. In basic numeric theory (modular arithmetic level) theres is an algorithm that inspire me to arrive at one solution; Multiply and Power algorithm to solve a^N module m:
Result := 1;
for i := k until i = 0
if n_i = 1 then Result := (Result * a) mod m;
if i != 0 then Result := (Result * Result) mod m;
end for;
Where k is number of digits less one of N in binary representation, and n_i is i binary digit. For instance (N is exponent):
N = 44 -> 1 0 1 1 0 0
k = 5
n_5 = 1
n_4 = 0
n_3 = 1
n_2 = 1
n_1 = 0
n_0 = 0
When we make a module operation, as an integer division, we can lose part of the number, so we only have to modify algorithm to don't miss relevant data.
Here is my code (take care that it is an adhoc code, strong dependency of may computer arch. Basically I play with data length of C language so, be carefully because my data length could not be the same):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
/* SHF:    index of the highest bit examined in an exponent.
   BMASK:  mask with only bit SHF set, used as the starting point when scanning an exponent.
   MODULE: limb base, 10^9 - every array element stores nine decimal digits.
   LIMIT:  main() computes i^i for i = 1 .. LIMIT-1.
   NOTE(review): 0x1 << 31 shifts into the sign bit of an int and the result
   may not be representable as an enum constant - confirm this is acceptable
   for the compilers in use. */
enum { SHF = 31, BMASK = 0x1 << SHF, MODULE = 1000000000UL, LIMIT = 1024 };
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num);
unsigned int pow2BigNum(const unsigned int lim, unsigned int *nsrc, unsigned int *ndst);
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2);
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num);
/*
 * Prints i^i for i = 1 .. LIMIT-1, one "i^i == <decimal value>" line each.
 *
 * Changes: the unused locals np and nplim are removed, and the most
 * significant limb is printed without zero padding (1^1 used to be printed
 * as 000000001); the lower limbs keep their nine-digit padding.
 */
int main(void)
{
    unsigned int *num, lim;
    int i, j;

    for(i = 1; i < LIMIT; ++i)
    {
        lim = bigNum(i, i, &num);
        printf("%i^%i == ", i, i);
        if(lim > 0)
        {
            /* limbs are stored least significant first */
            printf("%u", num[lim - 1]);
            for(j = (int) lim - 2; j > -1; --j)
                printf("%09u", num[j]);
        }
        printf("\n");
        free(num);
    }
    return 0;
}
/*
bigNum: Compute number base^exp and store it in num array
#base: Base number
#exp: Exponent number
#num: Pointer to array where it stores big number
Return: Array length of result number
*/
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num)
{
unsigned int m, lim, mem;
unsigned int *v, *w, *k;
//Note: mem has the exactly amount memory to allocate (dinamic memory version)
mem = ( (unsigned int) (exp * log10( (float) base ) / 9 ) ) + 3;
v = (unsigned int *) malloc( mem * sizeof(unsigned int) );
w = (unsigned int *) malloc( mem * sizeof(unsigned int) );
for(m = BMASK; ( (m & exp) == 0 ) && m; m >>= 1 ) ;
v[0] = (m) ? 1 : 0;
for(lim = 1; m > 1; m >>= 1)
{
if( exp & m )
lim = scaleBigNum(base, lim, v);
lim = pow2BigNum(lim, v, w);
k = v;
v = w;
w = k;
}
if(exp & 0x1)
lim = scaleBigNum(base, lim, v);
free(w);
*num = v;
return lim;
}
/*
    scaleBigNum: Make an (num[] <- scale*num[]) big number operation
    #scale: Scalar that multiply big number
    #lim: Length of source big number
    #num: Source big number (array of base-MODULE limbs, least significant
          first). Updated in place; must have room for lim + 1 limbs.
    Return: Array length of operation result

    Fix: the carry is now added BEFORE reducing modulo MODULE. The original
    computed (scale*num[i]) % MODULE and then added the previous carry, which
    can leave a limb equal to or greater than MODULE (for example scale 3 on
    limbs {666666666, 333333333}).
*/
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num)
{
    unsigned int i;
    unsigned long long int n, carry = 0;

    for(i = 0; i < lim; ++i)
    {
        n = ( (unsigned long long int) scale * num[i] ) + carry;
        num[i] = (unsigned int) (n % MODULE);
        carry = n / MODULE;
    }
    //the final carry becomes a new most significant limb if it is non-zero
    num[i] = (unsigned int) carry;
    return ( (num[i]) ? lim + 1 : lim );
}
/*
pow2BigNum: Make a (dst[] <- src[] * src[]) big number operation
#lim: Length of source big number
#src: Source big number (array of unsigned int)
#dst: Destination big number (array of unsigned int)
Return: Array length of operation result (2*lim, or 2*lim - 1 when the top element is zero)
Warning: This method can write in an incorrect position if we don't previous reallocate num (if it's necessary). bigNum method do it for us
Note: the highest index written is 2*lim - 1, so dst needs at least 2*lim elements.
Note: only pairs (i, j) with j >= i are visited; products with i != j are doubled.
*/
unsigned int pow2BigNum(const unsigned int lim, unsigned int *src, unsigned int *dst)
{
unsigned int i, j;
unsigned long long int n, t;
unsigned int k, c;
// c is the highest index of dst that has been explicitly initialised so far;
// the first explicit write to an index above c starts from 0 instead of the old content.
for(c = 0, dst[0] = 0, i = 0; i < lim; ++i)
{
for(j = i, n = 0; j < lim; ++j)
{
n = ( (unsigned long long int) src[i] * src[j] );
k = i + j;
if(i != j)
{
// off-diagonal product: counted twice, split over three elements
t = 2 * (n % MODULE);
n = 2 * (n / MODULE);
// (i + j)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (t % MODULE);
++k; // (i + j + 1)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + ( (t / MODULE) + (n % MODULE) );
++k; // (i + j + 2)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
else
{
// diagonal product: counted once, split over two elements
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n % MODULE);
++k; // (i + j + 1)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
// carry propagation starting at element i + j
// NOTE(review): this loop reads and writes dst[k + 1] up to index lim + j,
// which can be above c, i.e. before that element has been initialised;
// such elements are reset to 0 on their first explicit write above -
// confirm that no carry can be lost this way.
for(k = i + j; k < (lim + j); ++k)
{
dst[k + 1] += (dst[k] / MODULE);
dst[k] %= MODULE;
}
}
}
i = lim << 1;
return ((dst[i - 1]) ? i : i - 1);
}
/*
    addBigNum: Make a (num2[] <- num1[] + num2[]) big number operation
    #lim1: Length of source num1 big number
    #num1: First source operand big number (array of base-MODULE limbs, least
           significant first). Must not be longer than the second
    #lim2: Length of source num2 big number
    #num2: Second source operand, updated in place; must have room for
           lim2 + 1 limbs
    Return: Array length of operation result or 0 if lim1 > lim2 (doesn't do any op)

    Fix: while propagating the final carry the original added the carry to a
    limb but never reduced that limb modulo MODULE, so a limb of 999999999
    plus a carry was left as 1000000000 while the carry was also passed on.
*/
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2)
{
    unsigned long long int n, carry = 0;
    unsigned int i;

    if(lim1 > lim2)
        return 0;

    //spare limb for a possible carry out of the top
    num2[lim2] = 0;

    for(i = 0; i < lim1; ++i)
    {
        n = (unsigned long long int) num2[i] + num1[i] + carry;
        num2[i] = (unsigned int) (n % MODULE);
        carry = n / MODULE;
    }
    //ripple the remaining carry through the upper limbs of num2
    for(; carry; ++i)
    {
        n = (unsigned long long int) num2[i] + carry;
        num2[i] = (unsigned int) (n % MODULE);
        carry = n / MODULE;
    }
    return (lim2 > i) ? lim2 : i;
}
To compile:
gcc -o bgn <name>.c -Wall -O3 -lm // -lm links the math library; needed only if you want to use the log function
To check the result, feed the program's output directly to bc as input. A simple shell script:
#!/bin/bash
# Result checker built around bc.
# awk is given no file operand, so it reads standard input. It splits every
# line at "==" and re-emits it as "<field 1> == <field 2>", which is piped to bc.
# NOTE(review): presumably each input line is "<expression> == <value>" printed
# by the big number program, and bc answers 1 or 0 per line - confirm.
# The words printed by bc become the item list of the select menu below.
select S in ` awk -F '==' '{print $1 " == " $2 }' | bc`;
do
# NOTE(review): "0" is executed as a command name here; it looks like a
# placeholder body - confirm what was intended.
0;
done;
echo "Test Finished!";
We have an array of unsigned int (4 bytes each) in which every element stores a number of 9 decimal digits ( % 1000000000UL ); hence num[0] holds the first (least significant) 9 digits, num[1] holds digits 10 to 18, num[2] the next 9 digits, and so on.
I use conventional memory, but an improvement would be to use dynamic memory. OK, but how long should the array be (or how much memory do we need to allocate)? Using the bc calculator (bc -l, with the math library) we can determine how many digits a number has:
l(a^N) / l(10) // Natural logarithm converted to a base-10 logarithm
If we know the number of digits, we know how many integers we need:
( l(a^N) / (9 * l(10)) ) + 1 // Truncate result
If you work with a value such as (2^k)^N, you can compute the logarithm with this expression:
( k*N*l(2)/(9*l(10)) ) + 1 // Truncate result
to determine the exact length of the integer array. Example:
256^800 = 2^(8*800) ---> l(2^(8*800))/(9*l(10)) + 1 = 8*800*l(2)/(9*l(10)) + 1
The constant 1000000000UL (10^9) is very important. A constant like 10000000000UL (10^10) doesn't work because it can produce an undetected overflow (try what happens with the number 16^16 and the 10^10 constant), and a smaller constant such as 100000000UL (10^8) is correct but needs more memory and more steps. 10^9 is the key constant for a 32-bit unsigned int and a 64-bit unsigned long long int.
The code has two parts, Multiply (easy) and Power by 2 (harder). Multiply is just a multiplication that scales each element and propagates the integer overflow (the carry). It relies on the distributive property: for k(A + B + C) we want kA + kB + kC, where the number is k*A*10^18 + k*B*10^9 + k*C. Obviously, the k*C operation can generate a number bigger than 999 999 999, but never bigger than 0xFF FF FF FF FF FF FF FF. A number bigger than 64 bits can never occur in a multiplication because C is an unsigned integer of 32 bits and k is an unsigned short of 16 bits. In the worst case, we will have this number:
k = 0x FF FF;
C = 0x 3B 9A C9 FF; // 999999999
n = k*C = 0x 3B 9A | 8E 64 36 01;
n % 1000000000 = 0x 3B 99 CA 01;
n / 1000000000 = 0x FF FE;
After multiplying kB we need to add the carry 0x FF FE from the previous multiplication of C ( B = kB + (kC / module) ), and so on (we have 18 bits of arithmetic headroom, enough to guarantee correct values).
Power is more complex but is, in essence, the same problem (multiplication and addition), so here are some tips about the power code:
Data types are important, very important
If you multiply an unsigned integer by an unsigned integer, you get another unsigned integer. Use an explicit cast to unsigned long long int so that you don't lose data.
Always use the unsigned modifier, don't forget it!
Power by 2 can directly modify up to 2 indices ahead of the current index
gdb is your friend
I've also developed another function that adds big numbers. I haven't tested this last one as much, but I think it works well. Don't be cruel with me if it has a bug.
...and that's all!
PD1: Developed in a
Intel(R) Pentium(R) 4 CPU 1.70GHz
Data length:
unsigned short: 2
unsigned int: 4
unsigned long int: 4
unsigned long long int: 8
Computing a number such as 256^1024 takes:
real 0m0.059s
user 0m0.033s
sys 0m0.000s
A loop that computes i^i for i = 1 ... 1024 takes:
real 0m40.716s
user 0m14.952s
sys 0m0.067s
For numbers such as 65355^65355, the time spent is insane.
PD2: My response is very late, but I hope my code will be useful.
PD3: Sorry, explaining myself in English is one of my worst handicaps!
Last update: I have just had an idea that, with the same algorithm but a different implementation, would improve the response time and reduce the amount of memory used (we could use all the bits of an unsigned int). The secret: n^2 = n * n = n * (n - 1 + 1) = n * (n - 1) + n.
(I will not write this new code now, but if someone is interested, maybe after exams... )
I don't know if you still need a solution, but I wrote an article about this problem. It shows a very simple algorithm which can be used to convert an arbitrarily long number in base X to the corresponding number in base Y. The algorithm is written in Python, but it is really only a few lines long and doesn't use any Python magic. I needed such an algorithm for a C implementation, too, but decided to describe it using Python for two reasons. First, Python is very readable by anyone who understands algorithms written in a pseudo programming language and, second, I am not allowed to post the C version, because I wrote it for my company. Just have a look and you will see how easily this problem can be solved in general. An implementation in C should be straightforward...
Here is a function that does what you want:
#include <math.h>
#include <stddef.h> // for size_t
/*
 * getval: convert a base-256 number stored least significant byte first
 * (arr[0] is the 256^0 digit) into a double.
 * arr: the bytes of the number; not modified
 * len: number of bytes in arr
 * Return: the value as a double (0 when len is 0). Values that need more
 * precision than a double offers are rounded.
 */
double getval(unsigned char *arr, size_t len)
{
    double ret = 0;
    size_t cur;

    /*
     * Horner evaluation from the most significant byte down. This avoids
     * pow(256, cur), which overflows to infinity for large cur and turned
     * any zero byte at such a position into 0 * inf = NaN.
     */
    for (cur = len; cur > 0; cur--)
    {
        ret *= 256;
        ret += arr[cur - 1];
    }
    return ret;
}
That looks perfectly readable to me. Just pass the unsigned char * array you want to convert and the size. Note that it won't be perfect - for arbitrary precision, I suggest looking into the GNU MP BigNum library, as has been suggested already.
As a bonus, I don't like your storing your numbers in little-endian order, so here's a version if you want to store base-256 numbers in big-endian order:
#include <stddef.h> // for size_t
/*
 * getval_big_endian: convert a base-256 number stored most significant byte
 * first (arr[0] is the highest digit) into a double.
 * arr: the bytes of the number
 * len: number of bytes in arr
 * Return: the accumulated value (0 when len is 0)
 */
double getval_big_endian(unsigned char *arr, size_t len)
{
    double total = 0;
    size_t remaining = len;
    const unsigned char *next = arr;

    /* Shift the running value up one base-256 digit, then add the next byte. */
    while (remaining > 0)
    {
        total *= 256;
        total += *next;
        ++next;
        --remaining;
    }
    return total;
}
Just things to consider.
It may be too late or too irrelevant to make this suggestion, but could you store each byte as two base 10 digits (or one base 100) instead of one base 256? If you haven't implemented division yet, then that implies all you have is addition, subtraction, and maybe multiplication; those shouldn't be too hard to convert. Once you've done that, printing it would be trivial.
As I was not satisfied with the other answers provided, I decided to write an alternative solution myself:
#include <stdlib.h>
#define BASE_256 256
/*
 * largenum2str: convert a base-256 number into its decimal string.
 * num:     the number, least significant byte first (num[0] is the 256^0 digit)
 * len_num: number of bytes in num
 * Return:  a newly allocated, NUL-terminated string of decimal digits without
 *          leading zeros ("0" for the value zero, including len_num == 0), or
 *          NULL when memory cannot be allocated. The caller must free() it.
 *
 * The conversion is a Horner scheme: starting from the most significant byte,
 * the decimal accumulator is multiplied by 256 and the next byte is added.
 */
char *largenum2str(unsigned char *num, unsigned int len_num)
{
    unsigned char *digits;  /* decimal accumulator, least significant digit first */
    size_t len_digits = 0;
    size_t i;
    unsigned int pos;
    char *str;

    /* Zero bytes at the most significant end contribute nothing; drop them. */
    while (len_num > 0 && num[len_num - 1] == 0)
        len_num--;

    /* Nothing left: the value is zero. */
    if (len_num == 0)
    {
        str = malloc(2);
        if (str != NULL)
        {
            str[0] = '0';
            str[1] = '\0';
        }
        return str;
    }

    /* 256 < 1000, so every byte adds at most 3 decimal digits. */
    if ((size_t) len_num > ((size_t) -1 - 1) / 3)
        return NULL;
    digits = malloc((size_t) len_num * 3);
    if (digits == NULL)
        return NULL;

    for (pos = len_num; pos-- > 0; )
    {
        /* digits = digits * 256 + num[pos]; the new byte enters as the initial carry. */
        unsigned int carry = num[pos];

        for (i = 0; i < len_digits; i++)
        {
            unsigned int value = digits[i] * BASE_256 + carry;

            digits[i] = (unsigned char) (value % 10);
            carry = value / 10;
        }
        /* Whatever is left of the carry extends the number with new top digits. */
        while (carry > 0)
        {
            digits[len_digits++] = (unsigned char) (carry % 10);
            carry /= 10;
        }
    }

    /* The accumulator is least significant first; the string is most significant first. */
    str = malloc(len_digits + 1);
    if (str != NULL)
    {
        for (i = 0; i < len_digits; i++)
            str[i] = (char) ('0' + digits[len_digits - 1 - i]);
        str[len_digits] = '\0';
    }
    free(digits);
    return str;
}
The idea behind it all derives from simple math. For any base-256 number, its base-10 representation can be calculated as:
num[i]*256^i + num[i-1]*256^(i-1) + (···) + num[2]*256^2 + num[1]*256^1 + num[0]*256^0
which can be rewritten in nested (Horner) form as:
(((((num[i])*256 + num[i-1])*256 + (···))*256 + num[2])*256 + num[1])*256 + num[0]
So all we have to do is to multiply, step-by step, each element of the number array by 256 and add to it the next element, and so on... That way we can get the base-10 number.