Algorithm to scale a raw 24 bit pixelmap to greater than 50% in C - c

Reducing a raw pixelmap to a value of 50% is easy. I simply slide a 2x2 square across the map and average the RGB components of the 4 pixels as follows:
img = XGetImage(d_remote,RootWindow(d_remote,0),0,0,attr.width,attr.height,XAllPlanes(),ZPixmap);
int i;
int j;
for(i=0;i<attr.height;i=i+2){
for(j=0;j<attr.width;j=j+2) {
unsigned long p1 = XGetPixel(img, j, i);
unsigned long p1R = p1 & 0x00ff0000;
unsigned long p1G = p1 & 0x0000ff00;
unsigned long p1B = p1 & 0x000000ff;
unsigned long p2 = XGetPixel(img, j+1, i);
unsigned long p2R = p2 & 0x00ff0000;
unsigned long p2G = p2 & 0x0000ff00;
unsigned long p2B = p2 & 0x000000ff;
unsigned long p3 = XGetPixel(img, j, i+1);
unsigned long p3R = p3 & 0x00ff0000;
unsigned long p3G = p3 & 0x0000ff00;
unsigned long p3B = p3 & 0x000000ff;
unsigned long p4 = XGetPixel(img, j+1, i+1);
unsigned long p4R = p4 & 0x00ff0000;
unsigned long p4G = p4 & 0x0000ff00;
unsigned long p4B = p4 & 0x000000ff;
unsigned long averageR = (p1R+p2R+p3R+p4R)/4 & 0x00ff0000;
unsigned long averageG = (p1G+p2G+p3G+p4G)/4 & 0x0000ff00;
unsigned long averageB = (p1B+p2B+p3B+p4B)/4 & 0x000000ff;
int average = averageR | averageG | averageB;
XPutPixel(newImg, j/2, i/2, average);
}
}
This would make a pixelmap that is 500x500 turn into one that is 250x250. This is a 50% reduction. What if I wanted to scale it by 20%. For example I would like my 500x500 image to turn into 400x400? The smallest square I can slide is a 2x2. I don't see how I can get a reduction that is not a perfect power of 2.
Solution:
How's this for effort?? I modified a script I found that does bi-linear interpolation to work on XImages. It should work for any generic pixelmap. I do find the code ugly though since I see images as 2d arrays. I don't see why all the image code is mapped to a 1d array. It's harder to visualize. This works for any resize.
void resize(XImage* input, XImage* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
int a, b, c, d, x, y, index;
float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
float x_diff, y_diff, blue, red, green ;
int offset = 0 ;
int i=0;
int j=0;
int* inputData = (int*)input->data;
int* outputData = (int*)output->data;
for (i = 0; i < targetHeight; i++)
{
for (j = 0; j < targetWidth; j++)
{
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
a = inputData[index] ;
b = inputData[index + 1] ;
c = inputData[index + sourceWidth] ;
d = inputData[index + sourceWidth + 1] ;
// blue element
blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
(c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
// green element
green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
// red element
red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
outputData[offset++] = (int)red << 16 | (int)green << 8 | (int)blue;
}
}
}

Here is some pseudocode for downscaling. WS,HS is the target image size WB,HB is the source size. WS is less than WB and HS is less than HB.
double row[WB];
double Xratio= WB/WS;
double Yratio= HB/HS;
double curYratio= Yratio;
double remainY= Yratio - floor(Yratio);
double remainX= Xratio - floor(Xratio);
double curXratio;
double rfac, cfac;
int icol,irow, orow, ocol;
zero-out row
orow= 0;
for(irow=0..HB-1)
{
// we find out how much of this row we will add to the current sum
if (curYratio>=1.0) rfac= 1.0; else rfac= curYratio;
// we add it
for(icol=0..WB) row[icol] += rfac * input[irow][icol];
// we reduce the total weight
curYratio -= rfac;
// if the total weight is now zero, we have a complete row,
// otherwise we still need some of the next row
if (curYratio!=0.0) continue;
// we have a complete row, compute the weighted average
for(icol=0..WB-1) row[icol]/= Yratio;
// now we can scale the row in horizontal
curXratio= Xratio;
ocol= 0;
double pixel= 0.0;
for(icol=0..WB-1)
{
if (curXratio>=1.0) cfac= 1.0; else cfac= curXratio;
pixel+= row[icol]*cfac;
curXratio -= cfac;
if (curXratio!=0) continue;
// now we have a complete pixel
out[orow][ocol]= pixel / Xratio;
pixel= remainX * row[icol];
curXratio= Xratio - remainX;
ocol++;
}
orow++;
// let's put the remainder of the last input row into 'row'
for(icol=0..WB-1) row[i]= remainY*input[irow][icol];
curYratio= Yratio - remainY;
}
This took longer than I thought it would, but there it is. Anyway, it's not very wise to run this directly on an input bitmap. You should convert each pixel value to it's sRGB value before doing any arithmetic. The pixel values in a common bitmap are just names for the real values which should be used in computations. Look up sRGB on wikipedia, it has good information.
If you do it without converting to sRGB and back, you will have a darker image when you scale down.

Related

2D Discrete Convolution of Image and Mask using C

I am trying to make a convolution algorithm for grayscale bmp image. The below code is from Image processing course on Udemy, but the explanation about the variables and formula used was little short. The issue is in 2D discrete convolution part, im not able to understand the formula implemented here
struct Mask{
int Rows;
int Cols;
unsigned char *Data;
};
int main()
{
int imgWidth, imgHeight, imgBitDepth;
unsigned char imgHeader[BMP_HEADER_SIZE];
unsigned char imgColorTable[BMP_COLOR_TABLE_SIZE];
unsigned char imgBuffer[CUSTOM_IMG_SIZE];
unsigned char imgBuffer2[CUSTOM_IMG_SIZE];
const char imgName[] = "images/cameraman.bmp";
const char newImgName[] = "images/cameraman_new.bmp";
struct Mask lpMask;
signed char *tmp;
int i;
lpMask.Cols = lpMask.Rows = 5;
lpMask.Data = (unsigned char *)malloc(25);
/* -1 -1 -1 -1 -1
-1 -1 -1 -1 -1
-1 -1 24 -1 -1
-1 -1 -1 -1 -1
-1 -1 -1 -1 -1*/
//set all mask values to -1
tmp = (signed char *)lpMask.Data;
for (i = 0; i < 25; ++i)
{
*tmp = -1;
++tmp;
}
//set middle value to 24
tmp = (signed char *)lpMask.Data + 13;
*tmp = 24;
imageReader(imgName, &imgHeight, &imgWidth, &imgBitDepth, imgHeader, imgColorTable, imgBuffer);
Convolve(imgHeight, imgWidth, &lpMask, imgBuffer, imgBuffer2);
imageWriter(newImgName, imgHeader, imgColorTable, imgBuffer2, imgBitDepth);
printf("Success!\n");
return 0;
}
//2D Discrete Convolution
void Convolve(int imgRows, int imgCols, struct Mask *myMask, unsigned char *input_buf, unsigned char *output_buf)
{
long i, j, m, n, idx, jdx;
int ms, im, val;
unsigned char *tmp;
//outer summation loop - image
for (i = 0; i < imgRows; ++i)
//inner summation loop - image
for (j = 0; j < imgCols; ++j)
{
val = 0;
//outer summation loop - mask
for (m = 0; m < myMask->Rows; ++m)
//inner summation loop - mask
for (n = 0; n < myMask->Cols; ++n)
{
//Issue in understanding below part
ms = (signed char)*(myMask->Data + m * myMask->Rows + n);
// index of input img, used for checking boundary
idx = i - m;
jdx = j - n;
if (idx >= 0 && jdx >= 0) //ignore input samples which are out of bound
im = *(input_buf + idx * imgRows + jdx);
val += ms * im;
}
//truncate values to remain inside 0to255 range
if (val > 255) val = 255;
if (val < 0) val = 0;
tmp = output_buf + i * imgRows + j;
*tmp = (unsigned char)val;
}
}
Here in 3 lines, the formula used is similar and most difficult to understand its implementation, if possible please help out with understanding these codes logic or what they are doing exactly:
ms = (signed char)*(myMask->Data + m * myMask->Rows + n);
im = *(input_buf + idx * imgRows + jdx);
tmp = output_buf + i * imgRows + j;
For formula/pseudocode used, check Convolution section on following website:- https://en.wikipedia.org/wiki/Kernel_(image_processing)
OR
g(x,y) = ∑k= -n2 to n2 ∑j= -m2 to m2 h(j,k) * f(x-j, y-k) ,
where m2 = half of mask's width & n2 = half of mask's height
OR
The expressions you ask about are simply the computation of a location of particular pixel indexed in 2 dimensions (row, column), stored in a flat memory buffer.
For example, ms = (signed char)*(myMask->Data + m * myMask->Rows + n); start with the mask image data buffer itself, myMask->Data, which is a pointer. The first row of data shows up first, followed by the second row. So to access the pixel at row m, column n, you first have to skip m rows of data, which is the size of a row * m. Then you have to skip n pixels inside the row. Once the location of the pixel is computed, it is dereferenced with *.
The only complaint I have for this example code is the name myMask->Rows. In this case, m represents a row index, and to compute the offset, it is multiplied by the size of a row, which should be the number of columns in the image, not the number of rows. So that reference should instead be myMask->Cols.

how to create a simple iir low pass filter with not round errors? (16 bit pcm data)

i have an array of n length fullfilled by 16 bit (int16) pcm raw data,the data is in 44100 sample_rate
and stereo,so i have in my array first 2 bytes left channel then right channel etc...i tried to implement a simple low pass converting my array into floating points -1 1,the low pass works but there are round errors that cause little pops in the sound
now i do simply this :
INT32 left_id = 0;
INT32 right_id = 1;
DOUBLE filtered_l_db = 0.0;
DOUBLE filtered_r_db = 0.0;
DOUBLE last_filtered_left = 0;
DOUBLE last_filtered_right = 0;
DOUBLE l_db = 0.0;
DOUBLE r_db = 0.0;
DOUBLE low_filter = filter_freq(core->audio->low_pass_cut);
for(UINT32 a = 0; a < (buffer_size/2);++a)
{
l_db = ((DOUBLE)input_buffer[left_id]) / (DOUBLE)32768;
r_db = ((DOUBLE)input_buffer[right_id]) / (DOUBLE)32768;
///////////////LOW PASS
filtered_l_db = last_filtered_left +
(low_filter * (l_db -last_filtered_left ));
filtered_r_db = last_filtered_right +
(low_filter * (r_db - last_filtered_right));
last_filtered_left = filtered_l_db;
last_filtered_right = filtered_r_db;
INT16 l = (INT16)(filtered_l_db * (DOUBLE)32768);
INT16 r = (INT16)(filtered_r_db * (DOUBLE)32768);
output_buffer[left_id] = (output_buffer[left_id] + l);
output_buffer[right_id] = (output_buffer[right_id] + r);
left_id +=2;
right_id +=2;
}
PS: the input buffer is an int16 array with the pcm data from -32767 to 32767;
i found this function here
Low Pass filter in C
and was the only one that i could understand xd
DOUBLE filter_freq(DOUBLE cut_freq)
{
DOUBLE a = 1.0/(cut_freq * 2 * PI);
DOUBLE b = 1.0/SAMPLE_RATE;
return b/(a+b);
}
my aim is instead to have absolute precision on the wave,and to directly low pass using only integers
with the cost to lose resolution on the filter(and i'm ok with it)..i saw a lot of examples but i really didnt understand anything...someone of you would be so gentle to explain how this is done like you would explain to a little baby?(in code or pseudo code rapresentation) thank you
Assuming the result of function filter_freq can be written as a fraction m/n your filter calculation basically is
y_new = y_old + (m/n) * (x - y_old);
which can be transformed to
y_new = ((n * y_old) + m * (x - y_old)) / n;
The integer division / n truncates the result towards 0. If you want rounding instead of truncation you can implement it as
y_tmp = ((n * y_old) + m * (x - y_old));
if(y_tmp < 0) y_tmp -= (n / 2);
else y_tmp += (n / 2);
y_new = y_tmp / n
In order to avoid losing precision from dividing the result by n in one step and multiplying it by n in the next step you can save the value y_tmp before the division and use it in the next cycle.
y_tmp = (y_tmp + m * (x - y_old));
if(y_tmp < 0) y_new = y_tmp - (n / 2);
else y_new = y_tmp + (n / 2);
y_new /= n;
If your input data is int16_t I suggest to implement the calculation using int32_t to avoid overflows.
I tried to convert the filter in your code without checking other parts for possible problems.
INT32 left_id = 0;
INT32 right_id = 1;
int32_t filtered_l_out = 0; // output value after division
int32_t filtered_r_out = 0;
int32_t filtered_l_tmp = 0; // used to keep the output value before division
int32_t filtered_r_tmp = 0;
int32_t l_in = 0; // input value
int32_t r_in = 0;
DOUBLE low_filter = filter_freq(core->audio->low_pass_cut);
// define denominator and calculate numerator
// use power of 2 to allow bit-shift instead of division
const uint32_t filter_shift = 16U;
const int32_t filter_n = 1U << filter_shift;
int32_t filter_m = (int32_t)(low_filter * filter_n)
for(UINT32 a = 0; a < (buffer_size/2);++a)
{
l_in = input_buffer[left_id]);
r_in = input_buffer[right_id];
///////////////LOW PASS
filtered_l_tmp = filtered_l_tmp + filter_m * (l_in - filtered_l_out);
if(last_filtered_left < 0) {
filtered_l_out = last_filtered_left - filter_n/2;
} else {
filtered_l_out = last_filtered_left + filter_n/2;
}
//filtered_l_out /= filter_n;
filtered_l_out >>= filter_shift;
/* same calculation for right */
INT16 l = (INT16)(filtered_l_out);
INT16 r = (INT16)(filtered_r_out);
output_buffer[left_id] = (output_buffer[left_id] + l);
output_buffer[right_id] = (output_buffer[right_id] + r);
left_id +=2;
right_id +=2;
}
As your filter is initialized with 0 it may need several samples to follow a possible step to the first input value. Depending on your data it might be better to initialize the filter based on the first input value.

Optimize Bilinear Resize Algorithm in C

Can anyone spot any way to improve the speed in the next Bilinear resizing Algorithm?
I need to improve Speed as this is critical, keeping good image quality. Is expected to be used in mobile devices with low speed CPUs.
The algorithm is used mainly for up-scale resizing. Any other faster Bilinear algorithm also would be appreciated. Thanks
void resize(int* input, int* output, int sourceWidth, int sourceHeight, int targetWidth, int targetHeight)
{
int a, b, c, d, x, y, index;
float x_ratio = ((float)(sourceWidth - 1)) / targetWidth;
float y_ratio = ((float)(sourceHeight - 1)) / targetHeight;
float x_diff, y_diff, blue, red, green ;
int offset = 0 ;
for (int i = 0; i < targetHeight; i++)
{
for (int j = 0; j < targetWidth; j++)
{
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
a = input[index] ;
b = input[index + 1] ;
c = input[index + sourceWidth] ;
d = input[index + sourceWidth + 1] ;
// blue element
blue = (a&0xff)*(1-x_diff)*(1-y_diff) + (b&0xff)*(x_diff)*(1-y_diff) +
(c&0xff)*(y_diff)*(1-x_diff) + (d&0xff)*(x_diff*y_diff);
// green element
green = ((a>>8)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>8)&0xff)*(x_diff)*(1-y_diff) +
((c>>8)&0xff)*(y_diff)*(1-x_diff) + ((d>>8)&0xff)*(x_diff*y_diff);
// red element
red = ((a>>16)&0xff)*(1-x_diff)*(1-y_diff) + ((b>>16)&0xff)*(x_diff)*(1-y_diff) +
((c>>16)&0xff)*(y_diff)*(1-x_diff) + ((d>>16)&0xff)*(x_diff*y_diff);
output [offset++] =
0x000000ff | // alpha
((((int)red) << 24)&0xff0000) |
((((int)green) << 16)&0xff00) |
((((int)blue) << 8)&0xff00);
}
}
}
Off the the top of my head:
Stop using floating-point, unless you're certain your target CPU has it in hardware with good performance.
Make sure memory accesses are cache-optimized, i.e. clumped together.
Use the fastest data types possible. Sometimes this means smallest, sometimes it means "most native, requiring least overhead".
Investigate if signed/unsigned for integer operations have performance costs on your platform.
Investigate if look-up tables rather than computations gain you anything (but these can blow the caches, so be careful).
And, of course, do lots of profiling and measurements.
In-Line Cache and Lookup Tables
Cache your computations in your algorithm.
Avoid duplicate computations (like (1-y_diff) or (x_ratio * j))
Go through all the lines of your algorithm, and try to identify patterns of repetitions. Extract these to local variables. And possibly extract to functions, if they are short enough to be inlined, to make things more readable.
Use a lookup-table
It's quite likely that, if you can spare some memory, you can implement a "store" for your RGB values and simply "fetch" them based on the inputs that produced them. Maybe you don't need to store all of them, but you could experiment and see if some come back often. Alternatively, you could "fudge" your colors and thus end up with less values to store for more lookup inputs.
If you know the boundaries for you inputs, you can calculate the complete domain space and figure out what makes sense to cache. For instance, if you can't cache the whole R, G, B values, maybe you can at least pre-compute the shiftings ((b>>16) and so forth...) that are most likely deterministic in your case).
Use the Right Data Types for Performance
If you can avoid double and float variables, use int. On most architectures, int would be test faster type for computations because of the memory model. You can still achieve decent precision by simply shifting your units (ie use 1026 as int instead of 1.026 as double or float). It's quite likely that this trick would be enough for you.
x = (int)(x_ratio * j) ;
y = (int)(y_ratio * i) ;
x_diff = (x_ratio * j) - x ;
y_diff = (y_ratio * i) - y ;
index = (y * sourceWidth + x) ;
Could surely use some optimization: you were using x_ration * j-1 just a few cycles earlier, so all you really need here is x+=x_ratio
My random guess (use a profiler instead of letting people guess!):
The compiler has to generate that works when input and output overlap which means it has to do generate loads of redundant stores and loads. Add restrict to the input and output parameters to remove that safety feature.
You could also try using a=b; and c=d; instead of loading them again.
here is my version, steal some ideas. My C-fu is quite weak, so some lines are pseudocodes, but you can fix them.
void resize(int* input, int* output,
int sourceWidth, int sourceHeight,
int targetWidth, int targetHeight
) {
// Let's create some lookup tables!
// you can move them into 2-dimensional arrays to
// group together values used at the same time to help processor cache
int sx[0..targetWidth ]; // target->source X lookup
int sy[0..targetHeight]; // target->source Y lookup
int mx[0..targetWidth ]; // left pixel's multiplier
int my[0..targetHeight]; // bottom pixel's multiplier
// we don't have to calc indexes every time, find out when
bool reloadPixels[0..targetWidth ];
bool shiftPixels[0..targetWidth ];
int shiftReloadPixels[0..targetWidth ]; // can be combined if necessary
int v; // temporary value
for (int j = 0; j < targetWidth; j++){
// (8bit + targetBits + sourceBits) should be < max int
v = 256 * j * (sourceWidth-1) / (targetWidth-1);
sx[j] = v / 256;
mx[j] = v % 256;
reloadPixels[j] = j ? ( sx[j-1] != sx[j] ? 1 : 0)
: 1; // always load first pixel
// if no reload -> then no shift too
shiftPixels[j] = j ? ( sx[j-1]+1 = sx[j] ? 2 : 0)
: 0; // nothing to shift at first pixel
shiftReloadPixels[j] = reloadPixels[i] | shiftPixels[j];
}
for (int i = 0; i < targetHeight; i++){
v = 256 * i * (sourceHeight-1) / (targetHeight-1);
sy[i] = v / 256;
my[i] = v % 256;
}
int shiftReload;
int srcIndex;
int srcRowIndex;
int offset = 0;
int lm, rm, tm, bm; // left / right / top / bottom multipliers
int a, b, c, d;
for (int i = 0; i < targetHeight; i++){
srcRowIndex = sy[ i ] * sourceWidth;
tm = my[i];
bm = 255 - tm;
for (int j = 0; j < targetWidth; j++){
// too much ifs can be too slow, measure.
// always true for first pixel in a row
if( shiftReload = shiftReloadPixels[ j ] ){
srcIndex = srcRowIndex + sx[j];
if( shiftReload & 2 ){
a = b;
c = d;
}else{
a = input[ srcIndex ];
c = input[ srcIndex + sourceWidth ];
}
b = input[ srcIndex + 1 ];
d = input[ srcIndex + 1 + sourceWidth ];
}
lm = mx[j];
rm = 255 - lm;
// WTF?
// Input AA RR GG BB
// Output RR GG BB AA
if( j ){
leftOutput = rightOutput ^ 0xFFFFFF00;
}else{
leftOutput =
// blue element
((( ( (a&0xFF)*tm
+ (c&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((a>>8)&0xFF)*tm
+ ((c>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((a>>16)&0xFF)*tm
+ ((c>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
}
rightOutput =
// blue element
((( ( (b&0xFF)*tm
+ (d&0xFF)*bm )*lm
) & 0xFF0000 ) >> 8)
// green element
| ((( ( ((b>>8)&0xFF)*tm
+ ((d>>8)&0xFF)*bm )*lm
) & 0xFF0000 )) // no need to shift
// red element
| ((( ( ((b>>16)&0xFF)*tm
+ ((d>>16)&0xFF)*bm )*lm
) & 0xFF0000 ) << 8 )
;
output[offset++] =
// alpha
0x000000ff
| leftOutput
| rightOutput
;
}
}
}

need to create a webm video from RGB frames

I have an app that generates a bunch of jpgs that I need to turn into a webm video. I'm trying to get my rgb data from the jpegs into the vpxenc sample. I can see the basic shapes from the original jpgs in the output video, but everything is tinted green (even pixels that should be black are about halfway green) and every other scanline has some garbage in it.
I'm trying to feed it VPX_IMG_FMT_YV12 data, which I'm assuming is structured like so:
for each frame
8-bit Y data
8-bit averages of each 2x2 V block
8-bit averages of each 2x2 U block
Here is a source image and a screenshot of the video that is coming out:
Images
It's entirely possible that I'm doing the RGB->YV12 conversion incorrectly, but even if I only encode the 8-bit Y data and set the U and V blocks to 0, the video looks about the same. I'm basically running my RGB data through this equation:
// (R, G, and B are 0-255)
float y = 0.299f*R + 0.587f*G + 0.114f*B;
float v = (R-y)*0.713f;
float u = (B-v)*0.565f;
.. and then to produce the 2x2 filtered values for U and V that I write into vpxenc, I just do (a + b + c + d) / 4, where a,b,c,d are the U or V values of each 2x2 pixel block.
So I'm wondering:
Is there an easier way (in code) to take RGB data and feed it to vpx_codec_encode to get a nice webm video?
Is my RGB->YV12 conversion wrong somewhere?
Any help would be greatly appreciated.
freefallr: Sure. Here is the code. Note that it's converting the RGB->YUV in place as well as putting the YV12 output into pFullYPlane/pDownsampledUPlane/pDownsampledVPlane. This code produced nice looking WebM videos when I modified their vpxenc sample to use this data.
void RGB_To_YV12( unsigned char *pRGBData, int nFrameWidth, int nFrameHeight, void *pFullYPlane, void *pDownsampledUPlane, void *pDownsampledVPlane )
{
int nRGBBytes = nFrameWidth * nFrameHeight * 3;
// Convert RGB -> YV12. We do this in-place to avoid allocating any more memory.
unsigned char *pYPlaneOut = (unsigned char*)pFullYPlane;
int nYPlaneOut = 0;
for ( int i=0; i < nRGBBytes; i += 3 )
{
unsigned char B = pRGBData[i+0];
unsigned char G = pRGBData[i+1];
unsigned char R = pRGBData[i+2];
float y = (float)( R*66 + G*129 + B*25 + 128 ) / 256 + 16;
float u = (float)( R*-38 + G*-74 + B*112 + 128 ) / 256 + 128;
float v = (float)( R*112 + G*-94 + B*-18 + 128 ) / 256 + 128;
// NOTE: We're converting pRGBData to YUV in-place here as well as writing out YUV to pFullYPlane/pDownsampledUPlane/pDownsampledVPlane.
pRGBData[i+0] = (unsigned char)y;
pRGBData[i+1] = (unsigned char)u;
pRGBData[i+2] = (unsigned char)v;
// Write out the Y plane directly here rather than in another loop.
pYPlaneOut[nYPlaneOut++] = pRGBData[i+0];
}
// Downsample to U and V.
int halfHeight = nFrameHeight >> 1;
int halfWidth = nFrameWidth >> 1;
unsigned char *pVPlaneOut = (unsigned char*)pDownsampledVPlane;
unsigned char *pUPlaneOut = (unsigned char*)pDownsampledUPlane;
for ( int yPixel=0; yPixel < halfHeight; yPixel++ )
{
int iBaseSrc = ( (yPixel*2) * nFrameWidth * 3 );
for ( int xPixel=0; xPixel < halfWidth; xPixel++ )
{
pVPlaneOut[yPixel * halfWidth + xPixel] = pRGBData[iBaseSrc + 2];
pUPlaneOut[yPixel * halfWidth + xPixel] = pRGBData[iBaseSrc + 1];
iBaseSrc += 6;
}
}
}
Never mind. The scheme I was using was correct but I had a bug in the U/V downsampling code.

Print large base 256 array in base 10 in c

I have an array of unsigned chars in c I am trying to print in base 10, and I am stuck. I think this will be better explained in code, so, given:
unsigned char n[3];
char[0] = 1;
char[1] = 2;
char[2] = 3;
I would like to print 197121.
This is trivial with small base 256 arrays. One can simply 1 * 256 ^ 0 + 2 * 256 ^ 1 + 3 * 256 ^ 2.
However, if my array was 100 bytes large, then this quickly becomes a problem. There is no integral type in C that is 100 bytes large, which is why I'm storing numbers in unsigned char arrays to begin with.
How am I supposed to efficiently print this number out in base 10?
I am a bit lost.
There's no easy way to do it using only the standard C library. You'll either have to write the function yourself (not recommended), or use an external library such as GMP.
For example, using GMP, you could do:
unsigned char n[100]; // number to print
mpz_t num;
mpz_import(num, 100, -1, 1, 0, 0, n); // convert byte array into GMP format
mpz_out_str(stdout, 10, num); // print num to stdout in base 10
mpz_clear(num); // free memory for num
When I saw this question, I purpose to solve it, but at that moment I was very busy.
This last weekend I've could gain some prize hours of free time so I considered my pending challenge.
First of all, I suggest you to considered above response. I never use GMP library but I'm sure that it's better solution than a handmade code.
Also, you could be interest to analyze code of bc calculator; it can works with big numbers and I used to test my own code.
Ok, if you are still interested in a code do it by yourself (only with support C language and Standard C library) may be I can give you something.
Before all, a little bit theory. In basic numeric theory (modular arithmetic level) theres is an algorithm that inspire me to arrive at one solution; Multiply and Power algorithm to solve a^N module m:
Result := 1;
for i := k until i = 0
if n_i = 1 then Result := (Result * a) mod m;
if i != 0 then Result := (Result * Result) mod m;
end for;
Where k is number of digits less one of N in binary representation, and n_i is i binary digit. For instance (N is exponent):
N = 44 -> 1 0 1 1 0 0
k = 5
n_5 = 1
n_4 = 0
n_3 = 1
n_2 = 1
n_1 = 0
n_0 = 0
When we make a module operation, as an integer division, we can lose part of the number, so we only have to modify algorithm to don't miss relevant data.
Here is my code (take care that it is an adhoc code, strong dependency of may computer arch. Basically I play with data length of C language so, be carefully because my data length could not be the same):
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
enum { SHF = 31, BMASK = 0x1 << SHF, MODULE = 1000000000UL, LIMIT = 1024 };
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num);
unsigned int pow2BigNum(const unsigned int lim, unsigned int *nsrc, unsigned int *ndst);
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2);
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num);
int main(void)
{
unsigned int *num, lim;
unsigned int *np, nplim;
int i, j;
for(i = 1; i < LIMIT; ++i)
{
lim = bigNum(i, i, &num);
printf("%i^%i == ", i, i);
for(j = lim - 1; j > -1; --j)
printf("%09u", num[j]);
printf("\n");
free(num);
}
return 0;
}
/*
bigNum: Compute number base^exp and store it in num array
#base: Base number
#exp: Exponent number
#num: Pointer to array where it stores big number
Return: Array length of result number
*/
unsigned int bigNum(const unsigned short int base, const unsigned int exp, unsigned int **num)
{
unsigned int m, lim, mem;
unsigned int *v, *w, *k;
//Note: mem has the exactly amount memory to allocate (dinamic memory version)
mem = ( (unsigned int) (exp * log10( (float) base ) / 9 ) ) + 3;
v = (unsigned int *) malloc( mem * sizeof(unsigned int) );
w = (unsigned int *) malloc( mem * sizeof(unsigned int) );
for(m = BMASK; ( (m & exp) == 0 ) && m; m >>= 1 ) ;
v[0] = (m) ? 1 : 0;
for(lim = 1; m > 1; m >>= 1)
{
if( exp & m )
lim = scaleBigNum(base, lim, v);
lim = pow2BigNum(lim, v, w);
k = v;
v = w;
w = k;
}
if(exp & 0x1)
lim = scaleBigNum(base, lim, v);
free(w);
*num = v;
return lim;
}
/*
scaleBigNum: Make an (num[] <- scale*num[]) big number operation
#scale: Scalar that multiply big number
#lim: Length of source big number
#num: Source big number (array of unsigned int). Update it with new big number value
Return: Array length of operation result
Warning: This method can write in an incorrect position if we don't previous reallocate num (if it's necessary). bigNum method do it for us
*/
unsigned int scaleBigNum(const unsigned short scale, const unsigned int lim, unsigned int *num)
{
unsigned int i;
unsigned long long int n, t;
for(n = 0, t = 0, i = 0; i < lim; ++i)
{
t = (n / MODULE);
n = ( (unsigned long long int) scale * num[i] );
num[i] = (n % MODULE) + t; // (n % MODULE) + t always will be smaller than MODULE
}
num[i] = (n / MODULE);
return ( (num[i]) ? lim + 1 : lim );
}
/*
pow2BigNum: Make a (dst[] <- src[] * src[]) big number operation
#lim: Length of source big number
#src: Source big number (array of unsigned int)
#dst: Destination big number (array of unsigned int)
Return: Array length of operation result
Warning: This method can write in an incorrect position if we don't previous reallocate num (if it's necessary). bigNum method do it for us
*/
unsigned int pow2BigNum(const unsigned int lim, unsigned int *src, unsigned int *dst)
{
unsigned int i, j;
unsigned long long int n, t;
unsigned int k, c;
for(c = 0, dst[0] = 0, i = 0; i < lim; ++i)
{
for(j = i, n = 0; j < lim; ++j)
{
n = ( (unsigned long long int) src[i] * src[j] );
k = i + j;
if(i != j)
{
t = 2 * (n % MODULE);
n = 2 * (n / MODULE);
// (i + j)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (t % MODULE);
++k; // (i + j + 1)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + ( (t / MODULE) + (n % MODULE) );
++k; // (i + j + 2)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
else
{
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n % MODULE);
++k; // (i + j)
dst[k] = ( (k > c) ? ((c = k), 0) : dst[k] ) + (n / MODULE);
}
for(k = i + j; k < (lim + j); ++k)
{
dst[k + 1] += (dst[k] / MODULE);
dst[k] %= MODULE;
}
}
}
i = lim << 1;
return ((dst[i - 1]) ? i : i - 1);
}
/*
addBigNum: Make a (num2[] <- num1[] + num2[]) big number operation
#lim1: Length of source num1 big number
#num1: First source operand big number (array of unsigned int). Should be smaller than second
#lim2: Length of source num2 big number
#num2: Second source operand big number (array of unsigned int). Should be equal or greater than first
Return: Array length of operation result or 0 if num1[] > num2[] (dosen't do any op)
Warning: This method can write in an incorrect position if we don't previous reallocate num2
*/
unsigned int addBigNum(const unsigned int lim1, unsigned int *num1, const unsigned int lim2, unsigned int *num2)
{
unsigned long long int n;
unsigned int i;
if(lim1 > lim2)
return 0;
for(num2[lim2] = 0, n = 0, i = 0; i < lim1; ++i)
{
n = num2[i] + num1[i] + (n / MODULE);
num2[i] = n % MODULE;
}
for(n /= MODULE; n; ++i)
{
num2[i] += n;
n = (num2[i] / MODULE);
}
return (lim2 > i) ? lim2 : i;
}
To compile:
gcc -o bgn <name>.c -Wall -O3 -lm //Math library if you wants to use log func
To check result, use direct output as and input to bc. Easy shell script:
#!/bin/bash
select S in ` awk -F '==' '{print $1 " == " $2 }' | bc`;
do
0;
done;
echo "Test Finished!";
We have and array of unsigned int (4 bytes) where we store at each int of array a number of 9 digits ( % 1000000000UL ); hence num[0] we will have the first 9 digits, num[1] we will have digit 10 to 18, num[2]...
I use convencional memory to work but an improvement can do it with dinamic memory. Ok, but how length It could be the array? (or how many memory we need to allocate?). Using bc calculator (bc -l with mathlib) we can determine how many digits has a number:
l(a^N) / l(10) // Natural logarith to Logarithm base 10
If we know digits, we know amount integers we needed:
( l(a^N) / (9 * l(10)) ) + 1 // Truncate result
If you work with value such as (2^k)^N you can resolve it logarithm with this expression:
( k*N*l(2)/(9*l(10)) ) + 1 // Truncate result
to determine the exactly length of integer array. Example:
256^800 = 2^(8*800) ---> l(2^(8*800))/(9*l(10)) + 1 = 8*800*l(2)/(9*l(10)) + 1
The value 1000000000UL (10^9) constant is very important. A constant like 10000000000UL (10^10) dosen't work because can produce and indetected overflow (try what's happens with number 16^16 and 10^10 constant) and a constant more little such as 1000000000UL (10^8) are correct but we need to reserve more memory and do more steps. 10^9 is key constant for unsigned int of 32 bits and unsigned long long int of 64 bits.
The code has two parts, Multiply (easy) and Power by 2 (more hard). Multiply is just multiplication and scale and propagate the integer overflow. It take the principle of associative property in math to do exactly the inverse principle, so if k(A + B + C) we want kA + kB + kC where number will be k*A*10^18 + k*B*10^9 + kC. Obiously, kC operation can generate a number bigger than 999 999 999, but never more bigger than 0xFF FF FF FF FF FF FF FF. A number bigger than 64 bits can never occur in a multiplication because C is an unsigned integer of 32 bits and k is a unsigned short of 16 bits. In worts case, we will have this number:
k = 0x FF FF;
C = 0x 3B 9A C9 FF; // 999999999
n = k*C = 0x 3B 9A | 8E 64 36 01;
n % 1000000000 = 0x 3B 99 CA 01;
n / 1000000000 = 0x FF FE;
After Mul kB we need to add 0x FF FE from last multiplication of C ( B = kB + (C / module) ), and so on (we have 18 bits arithmetic offset, enough to guarantee correct values).
Power is more complex but is in essencial, the same problem (multiplication and add), so I give some tricks about code power:
Data types are important, very important
If you try to multiplication an unsigned integer with unsigned integer, you get another unsigned integer. Use explicit cast to get unsigned long long int and don't lose data.
Always use unsigned modifier, dont forget it!
Power by 2 can directly modify 2 index ahead of current index
gdb is your friend
I've developed another method that add big numbers. These last I don't prove so much but I think it works well. Don't be cruels with me if it has a bug.
...and that's all!
PD1: Developed in a
Intel(R) Pentium(R) 4 CPU 1.70GHz
Data length:
unsigned short: 2
unsigned int: 4
unsigned long int: 4
unsigned long long int: 8
Numbers such as 256^1024 it spend:
real 0m0.059s
user 0m0.033s
sys 0m0.000s
A bucle that's compute i^i where i goes to i = 1 ... 1024:
real 0m40.716s
user 0m14.952s
sys 0m0.067s
For numbers such as 65355^65355, spent time is insane.
PD2: My response is so late but I hope my code it will be usefull.
PD3: Sorry, explain me in english is one of my worst handicaps!
Last update: I just have had an idea that with same algorithm but other implementation, improve response and reduce amount memory to use (we can use the completely bits of unsigned int). The secret: n^2 = n * n = n * (n - 1 + 1) = n * (n - 1) + n.
(I will not do this new code, but if someone are interested, may be after exams... )
I don't know if you still need a solution, but I wrote an article about this problem. It shows a very simple algorithm which can be used to convert an arbitrary long number with base X to a corresponding number of base Y. The algorithm is written in Python, but it is really only a few lines long and doesn't use any Python magic. I needed such an algorithm for a C implementation, too, but decided to describe it using Python for two reasons. First, Python is very readable by anyone who understands algorithms written in a pseudo programming language and, second, I am not allowed to post the C version, because it I did it for my company. Just have a look and you will see how easy this problem can be solved in general. An implementation in C should be straight forward...
Here is a function that does what you want:
#include <math.h>
#include <stddef.h> // for size_t
double getval(unsigned char *arr, size_t len)
{
double ret = 0;
size_t cur;
for(cur = 0; cur < len; cur++)
ret += arr[cur] * pow(256, cur);
return ret;
}
That looks perfectly readable to me. Just pass the unsigned char * array you want to convert and the size. Note that it won't be perfect - for arbitrary precision, I suggest looking into the GNU MP BigNum library, as has been suggested already.
As a bonus, I don't like your storing your numbers in little-endian order, so here's a version if you want to store base-256 numbers in big-endian order:
#include <stddef.h> // for size_t
double getval_big_endian(unsigned char *arr, size_t len)
{
double ret = 0;
size_t cur;
for(cur = 0; cur < len; cur++)
{
ret *= 256;
ret += arr[cur];
}
return ret;
}
Just things to consider.
It may be too late or too irrelevant to make this suggestion, but could you store each byte as two base 10 digits (or one base 100) instead of one base 256? If you haven't implemented division yet, then that implies all you have is addition, subtraction, and maybe multiplication; those shouldn't be too hard to convert. Once you've done that, printing it would be trivial.
As I was not satisfied with the other answers provided, I decided to write an alternative solution myself:
#include <stdlib.h>
#define BASE_256 256
char *largenum2str(unsigned char *num, unsigned int len_num)
{
int temp;
char *str, *b_256 = NULL, *cur_num = NULL, *prod = NULL, *prod_term = NULL;
unsigned int i, j, carry = 0, len_str = 1, len_b_256, len_cur_num, len_prod, len_prod_term;
//Get 256 as an array of base-10 chars we'll use later as our second operand of the product
for ((len_b_256 = 0, temp = BASE_256); temp > 0; len_b_256++)
{
b_256 = realloc(b_256, sizeof(char) * (len_b_256 + 1));
b_256[len_b_256] = temp % 10;
temp = temp / 10;
}
//Our first operand (prod) is the last element of our num array, which we'll convert to a base-10 array
for ((len_prod = 0, temp = num[len_num - 1]); temp > 0; len_prod++)
{
prod = realloc(prod, sizeof(*prod) * (len_prod + 1));
prod[len_prod] = temp % 10;
temp = temp / 10;
}
while (len_num > 1) //We'll stay in this loop as long as we still have elements in num to read
{
len_num--; //Decrease the length of num to keep track of the current element
//Convert this element to a base-10 unsigned char array
for ((len_cur_num = 0, temp = num[len_num - 1]); temp > 0; len_cur_num++)
{
cur_num = (char *)realloc(cur_num, sizeof(char) * (len_cur_num + 1));
cur_num[len_cur_num] = temp % 10;
temp = temp / 10;
}
//Multiply prod by 256 and save that as prod_term
len_prod_term = 0;
prod_term = NULL;
for (i = 0; i < len_b_256; i++)
{ //Repeat this loop 3 times, one for each element in {6,5,2} (256 as a reversed base-10 unsigned char array)
carry = 0; //Set the carry to 0
prod_term = realloc(prod_term, sizeof(*prod_term) * (len_prod + i)); //Allocate memory to save prod_term
for (j = i; j < (len_prod_term); j++) //If we have digits from the last partial product of the multiplication, add it here
{
prod_term[j] = prod_term[j] + prod[j - i] * b_256[i] + carry;
if (prod_term[j] > 9)
{
carry = prod_term[j] / 10;
prod_term[j] = prod_term[j] % 10;
}
else
{
carry = 0;
}
}
while (j < (len_prod + i)) //No remaining elements of the former prod_term, so take only into account the results of multiplying mult * b_256
{
prod_term[j] = prod[j - i] * b_256[i] + carry;
if (prod_term[j] > 9)
{
carry = prod_term[j] / 10;
prod_term[j] = prod_term[j] % 10;
}
else
{
carry = 0;
}
j++;
}
if (carry) //A carry may be present in the last term. If so, allocate memory to save it and increase the length of prod_term
{
len_prod_term = j + 1;
prod_term = realloc(prod_term, sizeof(*prod_term) * (len_prod_term));
prod_term[j] = carry;
}
else
{
len_prod_term = j;
}
}
free(prod); //We don't need prod anymore, prod will now be prod_term
prod = prod_term;
len_prod = len_prod_term;
//Add prod (formerly prod_term) to our current number of the num array, expressed in a b-10 array
carry = 0;
for (i = 0; i < len_cur_num; i++)
{
prod[i] = prod[i] + cur_num[i] + carry;
if (prod[i] > 9)
{
carry = prod[i] / 10;
prod[i] -= 10;
}
else
{
carry = 0;
}
}
while (carry && (i < len_prod))
{
prod[i] = prod[i] + carry;
if (prod[i] > 9)
{
carry = prod[i] / 10;
prod[i] -= 10;
}
else
{
carry = 0;
}
i++;
}
if (carry)
{
len_prod++;
prod = realloc(prod, sizeof(*prod) * len_prod);
prod[len_prod - 1] = carry;
carry = 0;
}
}
str = malloc(sizeof(char) * (len_prod + 1)); //Allocate memory for the return string
for (i = 0; i < len_prod; i++) //Convert the numeric result to its representation as characters
{
str[len_prod - 1 - i] = prod[i] + '0';
}
str[i] = '\0'; //Terminate our string
free(b_256); //Free memory
free(prod);
free(cur_num);
return str;
}
The idea behind it all derives from simple math. For any base-256 number, its base-10 representation can be calculated as:
num[i]*256^i + num[i-1]*256^(i-1) + (···) + num[2]*256^2 + num[1]*256^1 + num[0]*256^0
which expands to:
(((((num[i])*256 + num[i-1])*256 + (···))*256 + num[2])*256 + num[1])*256 + num[0]
So all we have to do is to multiply, step-by step, each element of the number array by 256 and add to it the next element, and so on... That way we can get the base-10 number.

Resources