Transform Lebesgue curve to Hilbert curve - c

I map spatial data onto a one dimensional interval. First I use a space filling Lebesgue curve (or Z curve) in order to connect my points. This works and I get the following plot if I use gnuplot:
Then I want to transform the Lebesgue curve to a Hilbert curve. But it does not work. This is the output:
So it seems to work at the beginning. But after a while strange things happen and I don't know why.
Here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <limits.h>
#include <stdint.h>
#define DIM 2
// direction of hilbert curve
const unsigned char DirTable[4][4] =
{{1,2,0,0},{0,1,3,1},{2,0,2,3},{3,3,1,2}};
// Map z-order to hilbert curve
const unsigned char HilbertTable[4][4] =
{{0,3,1,2},{0,1,3,2},{2,3,1,0},{2,1,3,0}};
unsigned int Lebesgue2Hilbert(unsigned int lebesgue){
unsigned int hilbert = 1;
int level = 0;
int dir = 0;
for(unsigned int tmp=lebesgue; tmp>1; tmp>>=DIM, level++);
for(; level>0; level--){
int cell = (lebesgue >> ((level-1)*DIM)) & ((1<<DIM)-1);
hilbert = (hilbert<<DIM) + HilbertTable[dir][cell];
dir = DirTable[dir][cell];
}
return hilbert;
}
unsigned int Part1By1(unsigned int x)
{
x &= 0x0000ffff;
x = (x ^ (x << 8)) & 0x00ff00ff;
x = (x ^ (x << 4)) & 0x0f0f0f0f;
x = (x ^ (x << 2)) & 0x33333333;
x = (x ^ (x << 1)) & 0x55555555;
return x;
}
unsigned int EncodeMorton2(unsigned int x, unsigned int y)
{
return (Part1By1(y) << 1) + Part1By1(x);
}
struct sortKey{
int idx;
int key;
};
int compare (const void * a, const void * b)
{
return ( (*(struct sortKey*)b).key - (*(struct sortKey*)a).key );
}
int uniform_distribution(int rangeLow, int rangeHigh) {
double myRand = rand()/(1.0 + RAND_MAX);
int range = rangeHigh - rangeLow + 1;
int myRand_scaled = (myRand * range) + rangeLow;
return myRand_scaled;
}
int main (int argc, char **argv){
srand(time(NULL));
int n = 20;
int N = n*n;
// Uniform distribution
unsigned int pospar[DIM*N];
int k = 0;
for(unsigned int i=0; i<n; i++){
for(unsigned int j=0; j<n; j++){
pospar[DIM*k] = i;
pospar[DIM*k+1] = j;
k++;
}
}
// Lebesgue curve
unsigned int lkey[N];
for(int i=0; i<N; i++){
lkey[i] = EncodeMorton2(pospar[i*DIM+0],pospar[i*DIM+1]);
// printf("Lkey: %d\n", lkey[i]);
}
// Hilbert curve
unsigned int hkey[N];
for(int i=0; i<N; i++){
hkey[i] = Lebesgue2Hilbert(lkey[i]);
}
struct sortKey sk[N];
for(int i=0;i<N; i++){
sk[i].idx = i;
sk[i].key = hkey[i]; // "lkey[i]" or "hkey[i]"
}
qsort (sk, N, sizeof(struct sortKey), compare);
for (int i=0; i<N; i++){
printf ("%d %d\n", pospar[DIM*sk[i].idx], pospar[DIM*sk[i].idx+1]);
}
return 0;
}
Does anybody see what I have missed?

Related

Intel Intrinsics code optimization

So i'm trying to multiply a constant with short int a[101] with intel intrinsics. I have done it with addition but i can't seem to figure why it wont work with multiplication. Also before we used ints of 32 bits and now we use 16 bit short so we can have double as many values in the intrinsics to fill the 128 bit as far as i understand?
naive example of what im trying to do:
int main(int argc, char **argv){
short int a[101];
int len = sizeof(a)/sizeof(short);
/*Populating array a with values 1 to 101*/
mult(len, a);
return 0;
}
int mult(int len, short int *a){
int result = 0;
for(int i=0; i<len; i++){
result += a[i]*20;
}
return result;
}
And my code trying to do the same in intrinsics
/*Same main as before with a short int a[101] containing values 1 to 101*/
int SIMD(int len, short int *a){
int res;
int val[4];
/*Setting constant value to mulitply with*/
__m128i sum = _mm_set1_epi16(20);
__m128i s = _mm_setzero_si128( );
for(int i=0; i<len/4*4; i += 4){
__m128i vec = _mm_loadu_si128((__m128i *)(a+i));
s += _mm_mul_epu32(vec,sum);
}
_mm_storeu_si128((__m128i*) val, s);
res += val[0] + val[1] + val[2] + val[3];
/*Haldeling tail*/
for(int i=len/4*4; i<len; i++){
res += a[i];
}
return res;
}
So i do get a number out as result, but the number does not match the naive method, i have tried other intrinsics and changing numbers to see if it makes any noticable difference but nothing comes close to the output i expect. The computation time is almost the same as the naive at the moment aswell.
There are 8 short in one __m128i. So:
for(int i=0; i<len/4*4; i += 4)
should be
for(int i=0; i<len/8*8; i += 8)`
and:
res += val[0] + val[1] + val[2] + val[3];
should be:
res += val[0] + val[1] + val[2] + val[3] + val[4] + val[5] + val[6] + val[7];
and:
for(int i=len/4*4; i<len; i++)
should be:
for(int i=len/8*8; i<len; i++)
In:
s += _mm_mul_epu32(vec,sum);
_mm_mul_epu32 operates on 32-bit elements. It should be:
s += _mm_mullo_epi16(vec, sum);
The object res is not initialized; it should be:
int res = 0;
Here is working code:
#include <stdio.h>
#include <stdlib.h>
#include <immintrin.h>
// Number of elements in an array.
#define NumberOf(x) (sizeof (x) / sizeof *(x))
// Compute the result with scalar arithmetic.
static int mult(int len, short int *a)
{
int result = 0;
for (size_t i=0; i<len; i++)
{
result += a[i]*20;
}
return result;
}
// Compute the result with SIMD arithmetic.
static int SIMD(int len, short int *a)
{
// Initialize the multiplier and the sum.
__m128i multiplier = _mm_set1_epi16(20);
__m128i s = _mm_setzero_si128( );
// Process blocks of 8 short.
for (int i=0; i<len/8*8; i += 8)
{
__m128i vec = _mm_loadu_si128((__m128i *)(a+i));
// Multtiply by multiplier and add to sum.
s = _mm_add_epi16(s, _mm_mullo_epi16(vec, multiplier));
}
// Store the sum so far so its individual elements can be manipulated.
short val[8];
_mm_storeu_si128((__m128i*) val, s);
// Add the individual elements.
int res = 0;
for (size_t i = 0; i < 8; ++i)
res += val[i];
// Add the elements in the tail.
for (size_t i = len/8*8; i < len; ++i)
{
res += a[i];
}
return res;
}
int main(int argc, char **argv)
{
short int a[96];
int len = NumberOf(a);
// Initiailize a.
for (size_t i = 0; i < len; ++i)
a[i] = i+1;
printf("sum by scalar arithmetic is %d.\n", mult(len, a));
printf("sum by SIMD arithmetic is %d.\n", SIMD(len, a));
return 0;
}

How do I find distance between couple of points (x, y) from origin, and then sort the points, who is closest to (0, 0)?

i need to enter number of points(x,y), and then sort the points,from the closest one to (0,0) to the one that is far.. for example:
Enter number of points: 3
Enter point: 1 6
Enter point: 2 5
Enter point: 4 4
Sorted points:(2,5) (4,4) (1,6)
now i did a function that will find the distance,and i did an array and put the distance between two coordinate x and y,and i want to use merge sort to sort the array, my problem is how to go back and print the actual coordinate x y ... (i hope you would understand the problem),what can i do? i thought of putting the cordinate an array and sort them but that won't work :\
(and i didn't learn struct so i can't use unless if there is no other way ...)
plz anyone can help me i really have no idea have to continue:\
#include <stdio.h>
#include <stdlib.h>
void Enter_numbers(int x,int *z,int *first_coordinate,int *second_coordinate);
int distance(int a,int b);
void merge(int a[], int na, int b[], int nb, int c[]);
int merge_sort(int ar[], int n);
int main()
{
int x;
int *z;
int *first_coordinate;
int *second_coordinate;
printf("Enter number of points: ");
scanf("%d",&x);
z=(int*)malloc(x*sizeof(int));
first_coordinate=(int*)malloc(x*sizeof(int));
second_coordinate=(int*)malloc(x*sizeof(int));
Enter_numbers(x,z,first_coordinate,second_coordinate);
free(z);
free(first_coordinate);
free(second_coordinate);
return 0;
}
int distance(int a,int b)
{
int dis;
dis=((a*a)+(b*b));
return dis;
}
void Enter_numbers(int x,int *z,int *first_coordinate,int *second_coordinate)
{
int a=0,b=0;
int i=0;
int diss=0;
while(x>0)
{
printf("Enter points: ");
scanf("%d %d",&a,&b);
diss=distance(a,b);
z[i]=diss;
first_coordinate[i]=a;
second_coordinate[i]=b;
++i;
x--;
}
}
and the merge sort function i will use after i figure what to do :
int merge_sort(int ar[], int n)
{
int len;
int *temp_array, *base;
temp_array = (int*)malloc(sizeof(int)*n);
if(temp_array == NULL) {
printf("Dynamic Allocation Error in merge_sort");
return FAILURE;
}
for (len = 1; len < n; len *= 2) {
for (base = ar; base < ar + n; base += 2 * len) {
merge(base, len, base + len, len, temp_array);
memcpy(base, temp_array, 2*len*sizeof(int));
}
}
free(temp_array);
return SUCCESS;
}
and here is merge ...
void merge(int a[], int na, int b[], int nb, int c[])
{
int ia, ib, ic;
for(ia = ib = ic = 0; (ia < na) && (ib < nb); ic++)
{
if(a[ia] < b[ib]) {
c[ic] = a[ia];
ia++;
}
else {
c[ic] = b[ib];
ib++;
}
}
for(;ia < na; ia++, ic++) c[ic] = a[ia];
for(;ib < nb; ib++, ic++) c[ic] = b[ib];
}
I would use a struct for solving this task.
If you haven't learned struct yet, this seems to be a good time to learn it.
Note: If you really can't use stuct, see the last part of the answer.
With struct it could be something like:
#include <stdio.h>
#include <stdlib.h>
typedef struct
{
int x;
int y;
int squared_distance;
} dpoint;
int squared_dst(int x, int y)
{
return (x*x + y*y);
}
// Compare function used for sorting
int compare_dpoint_dst(const void * e1, const void * e2)
{
dpoint* p1 = (dpoint*)e1;
dpoint* p2 = (dpoint*)e2;
if (p1->squared_distance > p2->squared_distance) return 1;
if (p1->squared_distance < p2->squared_distance) return -1;
return 0;
}
void print_dpoint(dpoint dp)
{
printf("(%d, %d) : sd = %d\n", dp.x, dp.y, dp.squared_distance);
}
#define N 5
int main(void) {
// Array of points (fixed size for simplicity)
dpoint ps[N];
// Dummy input (for simplicity)
int x[N] = {1,5,2,3,4};
int y[N] = {9,3,7,1,3};
for (int i = 0; i < N; ++i)
{
ps[i].x = x[i];
ps[i].y = y[i];
}
// Calculate squared distance for all points
for (int i = 0; i < N; ++i)
{
ps[i].squared_distance = squared_dst(ps[i].x, ps[i].y);
}
printf("unsorted:\n");
for (int i = 0; i < N; ++i)
{
print_dpoint(ps[i]);
}
// Sort the points
qsort (ps, sizeof(ps)/sizeof(*ps), sizeof(*ps), compare_dpoint_dst);
printf("sorted:\n");
for (int i = 0; i < N; ++i)
{
print_dpoint(ps[i]);
}
return 0;
}
Notice that you can do the sorting on the squared distance so that you don't need square root in the program.
The program above will generate:
unsorted:
(1, 9) : sd = 82
(5, 3) : sd = 34
(2, 7) : sd = 53
(3, 1) : sd = 10
(4, 3) : sd = 25
sorted:
(3, 1) : sd = 10
(4, 3) : sd = 25
(5, 3) : sd = 34
(2, 7) : sd = 53
(1, 9) : sd = 82
No use of struct
If you for some reason can't use struct, you can use a shadow array to track the sorting but you'll have to write your own sorting. I don't recommend this approach - learn about structinstead. Anyway, it could be something like:
int x[N];
int y[N];
int sd[N]; // squared distance
int sw[N]; // swap order
// read input and calculate distance
// ...
// Fill sw with 0, 1, 2, ....
for (int i=0; i < N; ++i) sw[i] = i;
mySort(sd, sw, N);
// Now you can use sw for printing
for (int i=0; i < N; ++i)
{
// print element sw[i]
printf("(%d,%d)\n", x[sw[i]], y[sw[i]]);
}
}
void mySort(int sd[], int sw[], int N)
{
// .... code for sorting
// ....
// Assume that you need to swap element i and j here
temp = sd[i];
sd[i] = sd[j];
sd[j] = temp;
// Then do exactly the same for sw
temp = sw[i];
sw[i] = sw[j];
sw[j] = temp;
// ....
// ....
}

How to call Hilbert curve encode C routines

I was trying to run a Hilbert curve code written in C, which I found here
http://www.tddft.org/svn/octopus/trunk/src/grid/hilbert.c
The code runs, but the results I get form the output are not correct. I made a simple driver routine, that takes 3 values as arguments from the command line and passes them to a Hilbert curve encode, decode routines.
More precisely, I can't decode back the original coordinates (x,y,z).
One of my problems was to understand what the variable nbits is doing. I assume it is the size of the encoded Hilbert value. To check this I tried to modify the original definition of one of the functions from
void InttoTranspose(const int dim, const long long int h, int * x)
to
void InttoTranspose(const int dim, const long long int h, int * x, int* size)
Where I assign to *size the bit count variable ifbit. Anyway, this all didn't work. Therefore I would like to ask for your help.
The modified code is here:
#include<stdio.h>
//#include <config.h>
#include <assert.h>
/* This code is based on the implementation of the Hilbert curves
presented in:
J. Skilling, Programming the Hilbert curve, AIP Conf. Proc. 707, 381 (2004); http://dx.doi.org/10.1063/1.1751381
*/
*/ The int* size was included later in an attempt to find the proper size /*
/* void InttoTranspose(const int dim, const long long int h, int * x)*/
void InttoTranspose(const int dim, const long long int h, int * x, int* size){
/* the code uses some funny way of storing the bits */
int idir, ibit, ifbit;
for(idir = 0; idir < dim; idir++) x[idir] = 0;
ifbit = 0;
for(ibit = 0; ibit < 21; ibit++){
for(idir = dim - 1; idir >= 0; idir--){
x[idir] += (((h>>ifbit)&1)<<ibit);
ifbit++;
}
}
*size=ifbit; // I think that this should be nbits
}
void TransposetoAxes(int* X, int b, int n ){ /* position, #bits, dimension */
int N = 2 << (b-1), P, Q, t;
int i;
/* Gray decode by H ^ (H/2) */
t = X[n-1] >> 1;
for(i = n - 1; i > 0; i--) X[i] ^= X[i-1];
X[0] ^= t;
/* Undo excess work */
for( Q = 2; Q != N; Q <<= 1 ) {
P = Q - 1;
for( i = n-1; i >= 0 ; i-- ){
if( X[i] & Q ) {
X[0] ^= P; /* invert */
} else{
t = (X[0]^X[i]) & P; X[0] ^= t; X[i] ^= t; /* exchange */
}
}
}
}
//void FC_FUNC_(hilbert_index_to_point, HILBERT_INDEX_TO_POINT)(const int * dim, const int * nbits, const long long int * index, int * point){
// InttoTranspose(*dim, *index, point);
// TransposetoAxes(point, *nbits, *dim);
//}
int main(int argc,char* argv[]){
long long int hilbert;
int i=0,x[2],m;
while(argc--){
if(m=atoi(*argv++)) x[i++]=m, printf("--> %5d %5d\n",m,argc);
}
printf("x= %5d y= %5d z= %5d \n",x[0],x[1],x[2]);
InttoTranspose(3, hilbert, x,&m);
printf("hilbert encoded --> %llu size -->%d \n",hilbert,m);
TransposetoAxes(x,m, 3 );
printf("x= %5d y= %5d z= %5d \n",x[0],x[1],x[2]);
return 0;
}

Search an ordered array in a CUDA kernel

I'm writing a CUDA kernel and each thread has to complete the following task: suppose I have an ordered array a of n unsigned integers (the first one is always 0) stored in shared memory, each thread has to find the array index i such that a[i] ≤ threadIdx.x and a[i + 1] > threadIdx.x.
A naive solution could be:
for (i = 0; i < n - 1; i++)
if (a[i + 1] > threadIdx.x) break;
but I suppose this is not the optimal way to do it... can anyone suggest anything better?
Like Robert, I was thinking that a binary search has got to be faster that a naïve loop -- the upper bound of operation count for a binary search is O(log(n)), compared to O(N) for the loop.
My extremely simple implementation:
#include <iostream>
#include <climits>
#include <assert.h>
__device__ __host__
int midpoint(int a, int b)
{
return a + (b-a)/2;
}
__device__ __host__
int eval(int A[], int i, int val, int imin, int imax)
{
int low = (A[i] <= val);
int high = (A[i+1] > val);
if (low && high) {
return 0;
} else if (low) {
return -1;
} else {
return 1;
}
}
__device__ __host__
int binary_search(int A[], int val, int imin, int imax)
{
while (imax >= imin) {
int imid = midpoint(imin, imax);
int e = eval(A, imid, val, imin, imax);
if(e == 0) {
return imid;
} else if (e < 0) {
imin = imid;
} else {
imax = imid;
}
}
return -1;
}
__device__ __host__
int linear_search(int A[], int val, int imin, int imax)
{
int res = -1;
for(int i=imin; i<(imax-1); i++) {
if (A[i+1] > val) {
res = i;
break;
}
}
return res;
}
template<int version>
__global__
void search(int * source, int * result, int Nin, int Nout)
{
extern __shared__ int buff[];
int tid = threadIdx.x + blockIdx.x*blockDim.x;
int val = INT_MAX;
if (tid < Nin) val = source[threadIdx.x];
buff[threadIdx.x] = val;
__syncthreads();
int res;
switch(version) {
case 0:
res = binary_search(buff, threadIdx.x, 0, blockDim.x);
break;
case 1:
res = linear_search(buff, threadIdx.x, 0, blockDim.x);
break;
}
if (tid < Nout) result[tid] = res;
}
int main(void)
{
const int inputLength = 128000;
const int isize = inputLength * sizeof(int);
const int outputLength = 256;
const int osize = outputLength * sizeof(int);
int * hostInput = new int[inputLength];
int * hostOutput = new int[outputLength];
int * deviceInput;
int * deviceOutput;
for(int i=0; i<inputLength; i++) {
hostInput[i] = -200 + 5*i;
}
cudaMalloc((void**)&deviceInput, isize);
cudaMalloc((void**)&deviceOutput, osize);
cudaMemcpy(deviceInput, hostInput, isize, cudaMemcpyHostToDevice);
dim3 DimBlock(256, 1, 1);
dim3 DimGrid(1, 1, 1);
DimGrid.x = (outputLength / DimBlock.x) +
((outputLength % DimBlock.x > 0) ? 1 : 0);
size_t shmsz = DimBlock.x * sizeof(int);
for(int i=0; i<5; i++) {
search<1><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
for(int i=0; i<5; i++) {
search<0><<<DimGrid, DimBlock, shmsz>>>(deviceInput, deviceOutput,
inputLength, outputLength);
}
cudaMemcpy(hostOutput, deviceOutput, osize, cudaMemcpyDeviceToHost);
for(int i=0; i<outputLength; i++) {
int idx = hostOutput[i];
int tidx = i % DimBlock.x;
assert( (hostInput[idx] <= tidx) && (tidx < hostInput[idx+1]) );
}
cudaDeviceReset();
return 0;
}
gave about a five times speed up compared to the loop:
>nvprof a.exe
======== NVPROF is profiling a.exe...
======== Command: a.exe
======== Profiling result:
Time(%) Time Calls Avg Min Max Name
60.11 157.85us 1 157.85us 157.85us 157.85us [CUDA memcpy HtoD]
32.58 85.55us 5 17.11us 16.63us 19.04us void search<int=1>(int*, int*, int, int)
6.52 17.13us 5 3.42us 3.35us 3.73us void search<int=0>(int*, int*, int, int)
0.79 2.08us 1 2.08us 2.08us 2.08us [CUDA memcpy DtoH]
I'm sure that someoneclever could do a lot better than that. But perhaps this gives you at least a few ideas.
can anyone suggest anything better?
A brute force approach would be to have each thread do a binary search (on threadIdx.x + 1).
// sets idx to the index of the first element in a that is
// equal to or larger than key
__device__ void bsearch_range(const int *a, const int key, const unsigned len_a, unsigned *idx){
unsigned lower = 0;
unsigned upper = len_a;
unsigned midpt;
while (lower < upper){
midpt = (lower + upper)>>1;
if (a[midpt] < key) lower = midpt +1;
else upper = midpt;
}
*idx = lower;
return;
}
__global__ void find_my_idx(const int *a, const unsigned len_a, int *my_idx){
unsigned idx = (blockDim.x * blockIdx.x) + threadIdx.x;
unsigned sp_a;
int val = idx+1;
bsearch_range(a, val, len_a, &sp_a);
my_idx[idx] = ((val-1) < a[sp_a]) ? sp_a:-1;
}
This is coded in browser, not tested. It's hacked from a piece of working code, however. If you have trouble making it work, I can revisit it. I don't recommend this approach on a device without caches (cc 1.x device).
This is actually searching on the full unique 1D thread index (blockDim.x * blockIdx.x + threadIdx.x + 1) You can change val to be anything you like.
You could also add an appropriate thread check, if the number of threads you intend to launch is greater than the length of your my_idx result vector.
I imagine there is a more clever approach that may use something akin to prefix sums.
This is the best algorithm so far. It's called: LPW Indexed Search
__global__ void find_position_lpw(int *a, int n)
{
int idx = threadIdx.x;
__shared__ int aux[ MAX_THREADS_PER_BLOCK /*1024*/ ];
aux[idx] = 0;
if (idx < n)
atomicAdd( &aux[a[idx]], 1); // atomics in case there are duplicates
__syncthreads();
int tmp;
for (int j = 1; j <= MAX_THREADS_PER_BLOCK / 2; j <<= 1)
{
if( idx >= j ) tmp = aux[idx - j];
__syncthreads();
if( idx >= j ) aux[idx] += tmp;
__syncthreads();
}
// result in "i"
int i = aux[idx] - 1;
// use "i" here...
// ...
}

Decompression stops inbetween and output file filled with zeros(BLACK PIXELS)?

I am trying to apply DCT(discrete cosine transformation) compression on a bmp(bitmap) file. I have a c file which i am running in Turbo C++. This is not actually compressing but i was trying to implement the DCT and IDCT. The code is as follows:
/*
the image to be compressed is a bmp with 24 bpp and
with name "college4.bmp" of dimensions 200*160 ie 25*20- 8*8 blocks
o/p is college2.dat
format: 8 bit signed integers starting rowwise from 0,0 to 8,8
the coefficients order is blue,green,red
for the block no 1 then 2 and soon
*/
#include<stdlib.h>
#include<stdio.h>
#include<math.h>
#define WIDTH 25
#define HEIGHT 20
typedef struct {
unsigned int type;
unsigned long int filesize;
unsigned int reserved1,reserved2;
unsigned long int offset;
} BMPHEAD;
typedef struct {
unsigned long int infosize;
unsigned long int width,height;
unsigned int planes,bitsperpixel;
unsigned long int compression;
unsigned long int sizeimage;
long int xpelspermeter,ypelspermeter;
unsigned long int colorused,colorimportant;
} INFOHEAD;
typedef struct {
char rgbquad[4];
} colortable;
BMPHEAD bmphead;
INFOHEAD infohead;
FILE *bmp_fp1,*bmp_fp2;
int buf[WIDTH][8][8][3],buf1[WIDTH][8][8][3];
float pi=3.14159265,DCTcoeff[8][8][8][8];
void generatedctcoeff() {
int y, i, j, x;
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
for (x = 0; x < 8; x++) {
for (y = 0; y < 8; y++) {
DCTcoeff[i][j][x][y] = cos(((2 * y + 1) * pi * j) / 16)
* cos(((2 * x + 1) * i * pi) / 16);
}
}
}
}
}
void outputtofile1() { // Write into college2.dat
int i, j, x, y, blockno; // One block at a time, buf contains pixel
int redcoef, greencoef, bluecoef; // data of one row of blocks
float gijred, gijgreen, gijblue, c, ci, cj;
c = 1 / (sqrt(2));
for (blockno = 0; blockno < WIDTH; blockno++) {
for (i = 0; i < 8; i++) {
for (j = 0; j < 8; j++) {
gijred = 0;
gijgreen = 0;
gijblue = 0;
for (x = 0; x < 8; x++) {
for (y = 0; y < 8; y++) {
gijblue = gijblue + DCTcoeff[i][j][x][y]
* buf[blockno][x][y][0];
gijgreen = gijgreen + DCTcoeff[i][j][x][y]
* buf[blockno][x][y][1];
gijred = gijred + DCTcoeff[i][j][x][y]
* buf[blockno][x][y][2];
}
}
ci = cj = 1.0;
if (i == 0)
ci = c;
if (j == 0)
cj = c;
gijblue = ci * cj * gijblue / 4;
gijgreen = ci * cj * gijgreen / 4;
gijred = ci * cj * gijred / 4;
bluecoef = (int) gijblue;
greencoef = (int) gijgreen;
redcoef = (int) gijred;
fprintf(bmp_fp2, "%d %d %d ", bluecoef, greencoef, redcoef);
}
}
} /* end of one block processing */
}
void compressimage() {
int rowcount,x,y;
bmp_fp1=fopen("college4.bmp","r");
bmp_fp2=fopen("college2.dat","w");
printf("generating coefficients...\n");
generatedctcoeff();
if(bmp_fp1==NULL) {
printf("can't open");
return;
}
printf("compressing....\n");
fread(&bmphead,1,sizeof(bmphead),bmp_fp1);
fread(&infohead,1,sizeof(infohead),bmp_fp1);
fseek(bmp_fp1,bmphead.offset,SEEK_SET);
for(rowcount=0;rowcount<HEIGHT;rowcount++) {
for(y=0;y<8;y++) {
for(x=0;x<infohead.width;x++) {
buf[x/8][x%8][y][0]=(int)fgetc(bmp_fp1);
buf[x/8][x%8][y][1]=(int)fgetc(bmp_fp1);
buf[x/8][x%8][y][2]=(int)fgetc(bmp_fp1);
}
}
outputtofile1(); //output contents of buf after dct to file
}
fclose(bmp_fp1);
fclose(bmp_fp2);
}
void outputtofile2() { //output buf to college3.bmp
int i, j, x, y, blockno; // buf now contains coefficients
float pxyred, pxygreen, pxyblue, c, ci, cj; // a temp buffer buf1 used to
c = 1 / (sqrt(2)); // store one row of block of
for (blockno = 0; blockno < WIDTH; blockno++) { // decoded pixel values
for (x = 0; x < 8; x++)
for (y = 0; y < 8; y++) {
pxyred = 0;
pxygreen = 0;
pxyblue = 0;
for (j = 0; j < 8; j++) {
cj = 1.0;
if (j == 0)
cj = c;
for (i = 0; i < 8; i++) {
ci = 1.0;
if (i == 0)
ci = c;
pxyblue = pxyblue + ci * cj * DCTcoeff[i][j][y][x] * buf[blockno][i][j][0];
pxygreen = pxygreen + ci * cj
* DCTcoeff[i][j][y][x] * buf[blockno][i][j][1];
pxyred = pxyred + ci * cj * DCTcoeff[i][j][y][x] * buf[blockno][i][j][2];
}
}
pxyblue /= 4;
pxygreen /= 4;
pxyred /= 4;
buf1[blockno][y][x][0] = pxyblue;
buf1[blockno][y][x][1] = pxygreen;
buf1[blockno][y][x][2] = pxyred;
}
}
for (y = 0; y < 8; y++) {
for (blockno = 0; blockno < WIDTH; blockno++)
for (x = 0; x < 8; x++) {
fprintf(bmp_fp2, "%c%c%c", (char) buf1[blockno][x][y][0],
(char) buf1[blockno][x][y][1],
(char) buf1[blockno][x][y][2]);
}
}
}
void uncompressimage() {
int blue,green,red,rowcount,colcount,i,j;
bmp_fp1=fopen("college2.dat","r");
bmp_fp2=fopen("college3.bmp","w");
printf("generating coefficients...\n");
generatedctcoeff();
if (bmp_fp1==NULL) {
printf("open failed");
return;
}
printf("uncompressing....\n");
bmphead.type=0x4d42;
bmphead.filesize=30518;
bmphead.reserved1=0;
bmphead.reserved2=0;
bmphead.offset=sizeof(bmphead)+sizeof(infohead);
infohead.infosize=sizeof(infohead);
infohead.width=200;
infohead.height=160;
infohead.planes=1;
infohead.bitsperpixel=24;
infohead.compression=0;
infohead.sizeimage=0;
infohead.xpelspermeter=3780;
infohead.ypelspermeter=3780;
infohead.colorused=0;
infohead.colorimportant=0;
fwrite(&bmphead,sizeof(BMPHEAD),1,bmp_fp2);
fwrite(&infohead,sizeof(INFOHEAD),1,bmp_fp2);
for(rowcount=0;rowcount<HEIGHT;rowcount++) {
for(colcount=0;colcount<WIDTH;colcount++) {
for(i=0;i<8;i++) {
for(j=0;j<8;j++) {
fscanf(bmp_fp1,"%d",&blue);
fscanf(bmp_fp1,"%d",&green);
fscanf(bmp_fp1,"%d",&red);
buf[colcount][i][j][0]=blue;
buf[colcount][i][j][1]=green;
buf[colcount][i][j][2]=red;
}
}
}
outputtofile2();
}
fclose(bmp_fp1);
fclose(bmp_fp2);
}
int main() {
printf("opening files...\n");
compressimage();
printf("opening files...again\n");
uncompressimage();
printf("successful decompression\nenter any key\n");
return 0;
}
Here is the image i am using as input
(im srry the site converted the bmp into png. You may convert it back to bmp to use it)
Here is the image that is generated:
The file college3.bmp that gets created is of size 200x160 and of 93.8 kB but till quarter of the image it has decoded the coefficients correctly but later the file is filled with black pixels. I have taken a screenshot of the o/p as it was saying not a valid bmp while uploading. I am sitting on this problem since feb,2004. If anyone can say me where there is a bug i would be very thankful. I have analysed the output file and found an EOF right at the place where the pixels are starting to be black. I read some other questions on the topic and found that the conversion factors ci,cj have been used improperly. While coding i had also got confused with the indices x,y,i and j. So i hope this problem i will solve in a few days.
Apparently, the problem in the above code is in how you open your files.
This is what should be in your code (note the explicitly specified open modes, binary and text):
void compressimage() {
...
bmp_fp1=fopen("college4.bmp","rb");
bmp_fp2=fopen("college2.dat","wt");
...
}
void uncompressimage() {
...
bmp_fp1=fopen("college2.dat","rt");
bmp_fp2=fopen("college3.bmp","wb");
...
}
With that and slightly altered structure definitions:
#pragma pack(push,1)
typedef struct {
unsigned short int type;
unsigned long int filesize;
unsigned short int reserved1,reserved2;
unsigned long int offset;
} BMPHEAD;
typedef struct {
unsigned long int infosize;
unsigned long int width,height;
unsigned short int planes,bitsperpixel;
unsigned long int compression;
unsigned long int sizeimage;
long int xpelspermeter,ypelspermeter;
unsigned long int colorused,colorimportant;
} INFOHEAD;
typedef struct {
char rgbquad[4];
} colortable;
#pragma pack(pop)
I'm able to compile your program successfully using 3 different compilers (Turbo C++, Open Watcom, gcc) and get the desired output picture.

Resources