How to check whether a file is in tar format?

How to check whether a file is in tar format? - archive

I want to implement a check of a tar file.
I am not interested only to check the file extension, but I need surer way to check.
For example for zip format I could check some lead bytes. But what is the way for tar files?
Regards,
B

You can check file's header.
Here's the specification: http://www.gnu.org/software/tar/manual/html_node/Standard.html

Check the magic bytes at offset 257. If they match "ustar" including the null terminator, the file is probably a tar.
See: http://www.gnu.org/software/tar/manual/html_node/Standard.html
/* tar Header Block, from POSIX 1003.1-1990. */
/* POSIX header. */
struct posix_header
{ /* byte offset */
char name[100]; /* 0 */
char mode[8]; /* 100 */
char uid[8]; /* 108 */
char gid[8]; /* 116 */
char size[12]; /* 124 */
char mtime[12]; /* 136 */
char chksum[8]; /* 148 */
char typeflag; /* 156 */
char linkname[100]; /* 157 */
char magic[6]; /* 257 */
char version[2]; /* 263 */
char uname[32]; /* 265 */
char gname[32]; /* 297 */
char devmajor[8]; /* 329 */
char devminor[8]; /* 337 */
char prefix[155]; /* 345 */
/* 500 */
};
#define TMAGIC "ustar" /* ustar and a null */
#define TMAGLEN 6

In shell you can use the file command
file AFile.tar
AFile.tar: POSIX tar archive (GNU)

A C++ code that checks the first magic string in a file in order to determine if it is a tar file. If unfortunately "star" appears exactly on the 257 location in a non-TAR file, it will produce a false positive, which is very unlikely:
#include <fstream>
#include <stdexcept>
bool isTarFile(const std::string& filePath) {
std::ifstream file(filePath, std::ios::binary);
if (!file) {
throw std::runtime_error("Could not open file: " + filePath);
}
constexpr char magic[] = "ustar";
constexpr size_t magicLength = sizeof(magic) - 1;
constexpr size_t bufferSize = sizeof(magic);
char buffer[bufferSize];
// Seek to the magic string location in the header
file.seekg(257);
if (!file) {
throw std::runtime_error("Could not seek to position 257 of file: " + filePath);
}
// Read the magic number
file.read(buffer, bufferSize);
if (!file) {
throw std::runtime_error("Could not read from file: " + filePath);
}
// Check the file signature
return (std::strncmp(buffer, magic, magicLength) == 0);
}

Related

How can I resolve the collision in the hashing in this code I did? Currently cannot search for NG CHEA YEAT's ID only

I have the following text file
1171203258:HOSSAIN, MARUF
1181202660:KUHAN RAJ A/L TAMIL CHEL WAM
1181203465:PONG KAI SUN
1191102443:FAIZA OSAMA ABDALLA HASHIM
1201302289:LEE JIA WEI
1201302368:SHEIKH, AHNAF AZMAIN
1201100584:HI CHIA LING
1201101509:NG CHEA YEAT
1191103201:PHUAH CHEE HAOU
1201100879:MOSTAFA ARABY MADBOULY AHMED
1191103215:TONG JUN YANG
1191103119:ANG QIZHENG
1171302286:DARWIN KUMAR A/L MUNIAN
1181101192:HAIZUN NAJWA BINTI MOHD RIFIN
1201100926:NG XUE NIE
1191302417:ALMARHOON, ALI HUSSAIN A
1201100225:HEMAN RAO A/L SUBRAMANIAM
1181100823:LIM ZHEN BANG
1161202587:SOHEIL PRAKASAN SUPPAN
1201100603:AVINASH MURALI
1181101858:CHEAH KOK YEW
1191103071:GAN WEI TONG
1201100301:KEVIN THAM ZHENG YIT
1201100648:LIM CHER AIK
1201302222:SHIVAA RUTRAN A/L NAGATHEESAN
1201100779:TAN WEI XIANG
1191100919:WONG HONG WEI
The code I have for now, work well but have collision in the hashing I think
Here is what I have so far:
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#define MDIR 27 //size of list
#define MBUFF 256
#define MHASH 109 //hash function is %109
#define MNAME 40
struct List{
char name[40];
int studID;
};
//function prototype
int comparator(const void* p, const void* q){
return strcmp(((struct List*)p)->name,((struct List*)q)->name);
}
int readData(struct List dir[]);
int hashfunc(char *name);
void hash(struct List dir[], int ndir,
int hashtable[]);
int search(char *key,
struct List s[], int hashtable[]);
//main function
int main(){
int ndir, result, hashtable[MHASH];
int count;
int i;
int j;
struct List s[27];
char temp[27];
char query[40];
FILE *fptr;
fptr = fopen("rec.txt", "r+");
if (fptr != NULL) {
printf("File created successfully!\n");
}
else {
printf("Failed to create the file.\n");
// exit status for OS that an error occurred
return -1;
}
for(count = 0; count < 27; count++){
fscanf(fptr,"%d", &s[count].studID);
fgets(s[count].name,40,fptr);
}
qsort
qsort(s,27,sizeof(struct List),comparator);
printing the sorted name then continue the hashing of searching
//printing sorted name
printf("Sorted Names\n");
for(i=0;i<27;i++){
printf("%d%s\n", i+1, s[i].name);
}
fclose(fptr);
hashing of searching part
ndir=readData(s);
hash(s,ndir,hashtable);
puts("\nName to search>>");
fgets(query,MNAME-1,stdin);
query[strlen(query)-1]='\0';
result=search(query,s,hashtable);
if(result==-1)
printf("Not Found");
else
printf("%s's ID is %d\n",
s[result].name, s[result].studID);
return 0;
}
read function
int readData(struct List dir[]){
FILE *fdir=fopen("rec.txt","r");
char buff[MBUFF];
int i=0;
while(i<MDIR && fgets(buff,MBUFF-1,fdir)){
dir[i].studID=atol(strtok(buff,":"));
strcpy(dir[i].name,strtok(NULL, "\n"));
i++;
}
return(i);
}
hash function
int hashfunc(char *name){
long sum=0;
int k=0;
while(name[k]){
sum+=name[k];
k++;
}
return( (int) (sum % MHASH) );
}
hash function
void hash(struct List dir[], int ndir,
int hashtable[]){
int k;
int index;
for(k=0;k<ndir;k++){
index = hashfunc(dir[k].name);
hashtable[index]=k;
}
}
search function
int search(char *key, struct List dir[],
int hashtable[]){
int index=hashfunc(key);
int k=hashtable[index];
if(strcmp(key,dir[k].name)==0)
return(k);
else
return(-1);
}
I am not sure for the hashing of searching part

Whenever faced with a need to separate fields in a line of data, the normal approach is to read an entire line of data as a string into a buffer (character array). Then you separate what you need from the buffer using whatever method fits the data the best. Either using a pair of pointers to bracket the text you need and then copying the characters between the pointers. You can automate the process using string functions like strchr() to locate the ':' in the buffer. You can also use string functions like strtok() to split the buffer into tokens on any given set of delimiters.
However here there is an even simpler method. Since you have a fixed format for the studID and name in the line, you can simply use sscanf(), e.g.
#include <stdio.h>
#include <stdlib.h>
#define MXSTUD 30 /* if you need a constant, #define one (or more) */
#define MXNAME 40
typedef struct list { /* adding typedef for convenience */
char name[MXNAME];
unsigned studID;
} list;
...
int main (int argc, char **argv) {
int count = 0; /* count of students */
char buf[MXNAME * 2]; /* temprorary storage for line */
list s[MXSTUD] = {{ .name = "" }}; /* list array initialized all 0 */
/* open filename given as 1st argument or "rec.text" if none given */
FILE *fptr = fopen (argc > 1 ? argv[1] : "rec.text", "r");
if (!fptr) { /* validate file open for reading */
fputs ("error: file open failed\n", stderr);
return 1;
}
while (fgets (buf, sizeof buf, fptr)) { /* read each line into buf */
/* separate studID and name using sscanf() */
if (sscanf (buf, "%u:%39[^\n]", &s[count].studID, s[count].name) == 2) {
count += 1; /* increment count on success */
}
}
...
That's all that is needed to read each line of data and separate the line into studID and name storing each in an element of the list array of struct.
Use qsort() For Sorting
Regardless of whether you have an array or allocated block of memory containing objects, qsort() provides a simple and efficient way to sort it. All you need to do is write a compare() function telling qsort() how to compare the elements. The declaration for the qsort() compare function is:
int compare (const void *a, const void *b);`
Where a and b are simple pointers-to elements of your array to be compared. So when writing the function, all you need to do is cast a and b to the proper type and write the logic to compare whatever you like in the two elements. A negative return means a sorts before b and a positive return means b sorts before a. A zero return means the elements are equal.
Casting the a and b to type const list * (you include const since the data isn't modified which allows the compiler freedom to optimize more fully), you simply loop over each name comparing characters and returning when two characters differ or the end of file is reached. Here, to sort your s[] array by name you can do:
/* qsort compare function lexagraphically sorts words */
int compare (const void *a, const void *b)
{
/* a & b are pointers to adjacent list elements, (pointers to list) */
const list *sa = (const list *)a,
*sb = (const list *)b;
const char *na = sa->name, /* pointers to name in each element */
*nb = sb->name;
/* loop advancing a character in each word per-iteration */
for (;; na++, nb++) {
/* if characters differ or at end of either */
if (*na != *nb || !*na)
break;
}
return (*na > *nb) - (*na < *nb); /* return sort order */
}
Then to sort your array of list (your s[] array) with qsort(), all that is needed is:
qsort (s, count, sizeof *s, compare); /* sort array by name */
Putting it all together in a short program that reads from the filename given as the first argument to the program (or from "rec.text" by default if no argument is given), you can do:
#include <stdio.h>
#include <stdlib.h>
#define MXSTUD 30 /* if you need a constant, #define one (or more) */
#define MXNAME 40
typedef struct list { /* adding typedef for convenience */
char name[MXNAME];
unsigned studID;
} list;
/* qsort compare function lexagraphically sorts words */
int compare (const void *a, const void *b)
{
/* a & b are pointers to adjacent list elements, (pointers to list) */
const list *sa = (const list *)a,
*sb = (const list *)b;
const char *na = sa->name, /* pointers to name in each element */
*nb = sb->name;
/* loop advancing a character in each word per-iteration */
for (;; na++, nb++) {
/* if characters differ or at end of either */
if (*na != *nb || !*na)
break;
}
return (*na > *nb) - (*na < *nb); /* return sort order */
}
int main (int argc, char **argv) {
int count = 0; /* count of students */
char buf[MXNAME * 2]; /* temprorary storage for line */
list s[MXSTUD] = {{ .name = "" }}; /* list array initialized all 0 */
/* open filename given as 1st argument or "rec.text" if none given */
FILE *fptr = fopen (argc > 1 ? argv[1] : "rec.text", "r");
if (!fptr) { /* validate file open for reading */
fputs ("error: file open failed\n", stderr);
return 1;
}
while (fgets (buf, sizeof buf, fptr)) { /* read each line into buf */
/* separate studID and name using sscanf() */
if (sscanf (buf, "%u:%39[^\n]", &s[count].studID, s[count].name) == 2) {
count += 1; /* increment count on success */
}
}
qsort (s, count, sizeof *s, compare); /* sort array by name */
for (int i = 0; i < count; i++) { /* output results */
printf ("%2d %10u %s\n", i + 1, s[i].studID, s[i].name);
}
}
(note: you simply need to open the file in read mode "r")
Example Use/Output
With your data in a file named dat/studIDlist.txt, for the 27 students in your data you would get:
$ ./bin/studIDlist dat/studIDlist.txt
1 1191302417 ALMARHOON, ALI HUSSAIN A
2 1191103119 ANG QIZHENG
3 1201100603 AVINASH MURALI
4 1181101858 CHEAH KOK YEW
5 1171302286 DARWIN KUMAR A/L MUNIAN
6 1191102443 FAIZA OSAMA ABDALLA HASHIM
7 1191103071 GAN WEI TONG
8 1181101192 HAIZUN NAJWA BINTI MOHD RIFIN
9 1201100225 HEMAN RAO A/L SUBRAMANIAM
10 1201100584 HI CHIA LING
11 1171203258 HOSSAIN, MARUF
12 1201100301 KEVIN THAM ZHENG YIT
13 1181202660 KUHAN RAJ A/L TAMIL CHEL WAM
14 1201302289 LEE JIA WEI
15 1201100648 LIM CHER AIK
16 1181100823 LIM ZHEN BANG
17 1201100879 MOSTAFA ARABY MADBOULY AHMED
18 1201101509 NG CHEA YEAT
19 1201100926 NG XUE NIE
20 1191103201 PHUAH CHEE HAOU
21 1181203465 PONG KAI SUN
22 1201302368 SHEIKH, AHNAF AZMAIN
23 1201302222 SHIVAA RUTRAN A/L NAGATHEESAN
24 1161202587 SOHEIL PRAKASAN SUPPAN
25 1201100779 TAN WEI XIANG
26 1191103215 TONG JUN YANG
27 1191100919 WONG HONG WEI

You will have to get line by line your file and store it in an array.
FILE *fp = fopen("lorem.txt", "r");
if(fp == NULL) {
perror("Unable to open file!");
exit(1);
}
char chunk[128];
while(fgets(chunk, sizeof(chunk), fp) != NULL) {
fputs(chunk, stdout);
fputs("|*\n", stdout); // marker string used to show where the content the chunk array has ended
}
fclose(fp);
To split each line use strtok() function:
char *token = strtok(line, ":"); // To separate the first block from the second like seen on your image.
char *token[1] = strtok(token, ","); // To separate the other part

C check if archive file is truncated with struct ar_hdr (ar.h)

I'm using the ar.h's structure : struct ar_hdr to retrieve informations inside my archive file (lib.a) using read to iterate over it, and i'm running into a little problem if the file is truncated.
In using the C language and when the file was truncated it currently makes me get a segmentation fault.
Is there any way to check if the file is truncated beforehand ? like by using stat or stuff like that ?
Thanks in advance
PS: be free to tell me if my question wasn't really understandable and clear
#define SIZE atoi(ar->ar_size)
struct ar_hdr *get_header(int fd)
{
struct ar_hdr *ar = (struct ar_hdr *)malloc(sizeof(struct ar_hdr));
if (read(fd, ar, sizeof(struct ar_hdr)) != sizeof(struct ar_hdr)) {
free(ar);
return NULL;
}
return ar;
}
int handle_ar_files(int fd, char *names[2], int ret)
{
struct ar_hdr *ar = NULL;
void *buf = NULL;
int index = 0;
while ((ar = get_header(fd)) != NULL) {
index = 0;
buf = malloc(SIZE);
if (ar->ar_name[0] == '/') {
my_free(ar, buf, (int [2]){fd, SIZE}, 1);
continue;
}
for (; ar->ar_name[index] && ar->ar_name[index] != '/'; index++);
ar->ar_name[index] = 0;
if ((read(fd, buf, SIZE)) != SIZE)
return my_free(ar, buf, (int [2]){fd, SIZE}, 0);
if ((parse_ar(buf, SIZE, (char *[2]){names[1], ar->ar_name})) == 84)
return my_free(ar, buf, (int [2]){fd, SIZE}, 0);
else
my_free(ar, buf, (int [2]){fd, SIZE}, 0);
}
return ret;
}
void *buf32 = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
if (strncmp((char *)buf32, ARMAG, SARMAG) == 0) {
read(fd, tmp, SARMAG);
return handle_ar_files(fd, names, 0);
}
#ifndef _AR_H
#define _AR_H 1
#include <sys/cdefs.h>
/* Archive files start with the ARMAG identifying string. Then follows a
`struct ar_hdr', and as many bytes of member file data as its `ar_size'
member indicates, for each member file. */
#define ARMAG "!<arch>\n" /* String that begins an archive file. */
#define SARMAG 8 /* Size of that string. */
#define ARFMAG "`\n" /* String in ar_fmag at end of each header. */
__BEGIN_DECLS
struct ar_hdr
{
char ar_name[16]; /* Member file name, sometimes / terminated. */
char ar_date[12]; /* File date, decimal seconds since Epoch. */
char ar_uid[6], ar_gid[6]; /* User and group IDs, in ASCII decimal. */
char ar_mode[8]; /* File mode, in ASCII octal. */
char ar_size[10]; /* File size, in ASCII decimal. */
char ar_fmag[2]; /* Always contains ARFMAG. */
};
__END_DECLS
#endif /* ar.h */

Duplicated declarations. error: conflicting types for ‘functionname’

I'm trying to include these files to my main C code:
who.c:
/* who3.c - who with buffered reads
* - surpresses empty records
* - formats time nicely
* - buffers input (using utmplib)
*/
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <utmp.h>
#include <fcntl.h>
#include <time.h>
#include "utmplib.h"
#define SHOWHOST
void show_info(struct utmp *);
void showtime(time_t);
int Who()
{
struct utmp *utbufp, /* holds pointer to next rec */
*utmp_next(); /* returns pointer to next */
if ( utmp_open( UTMP_FILE ) == -1 ){
perror(UTMP_FILE);
exit(1);
}
while ( ( utbufp = utmp_next() ) != ((struct utmp *) NULL) )
show_info( utbufp );
utmp_close( );
return 0;
}
/*
* show info()
* displays the contents of the utmp struct
* in human readable form
* * displays nothing if record has no user name
*/
void show_info( struct utmp *utbufp )
{
printf("%-8.8s", utbufp->ut_name); /* the logname */
printf(" "); /* a space */
printf("%-8.8s", utbufp->ut_line); /* the tty */
printf(" "); /* a space */
showtime( utbufp->ut_time ); /* display time */
#ifdef SHOWHOST
if ( utbufp->ut_host[0] != '\0' )
printf(" (%s)", utbufp->ut_host); /* the host */
#endif
printf("\n"); /* newline */
}
void showtime( time_t timeval )
/*
* displays time in a format fit for human consumption
* uses ctime to build a string then picks parts out of it
* Note: %12.12s prints a string 12 chars wide and LIMITS
* it to 12chars.
*/
{
char *ctime(); /* convert long to ascii */
char *cp; /* to hold address of time */
cp = ctime( &timeval ); /* convert time to string */
/* string looks like */
/* Mon Feb 4 00:46:40 EST 1991 */
/* 0123456789012345. */
printf("%12.12s", cp+4 ); /* pick 12 chars from pos 4 */
}
who.h:
#ifndef WHO_H
#define WHO_H
/* This file was automatically generated. Do not edit! */
int Who();
void showtime(time_t);
void showtime(time_t timeval);
void show_info(struct utmp *);
void show_info(struct utmp *utbufp);
#endif
utmplib.c:
/* utmplib.c - functions to buffer reads from utmp file
*
* functions are
* utmp_open( filename ) - open file
* returns -1 on error
* utmp_next( ) - return pointer to next struct
* returns NULL on eof
* utmp_close() - close file
*
* reads NRECS per read and then doles them out from the buffer
*/
#include <stdio.h>
#include <fcntl.h>
#include <sys/types.h>
#include <utmp.h>
#define NRECS 1
#define NULLUT ((struct utmp *)NULL)
#define UTSIZE (sizeof(struct utmp))
static char utmpbuf[NRECS * UTSIZE]; /* storage */
static int num_recs; /* num stored */
static int cur_rec; /* next to go */
static int fd_utmp = -1; /* read from */
utmp_open( char *filename )
{
fd_utmp = open( filename, O_RDONLY ); /* open it */
cur_rec = num_recs = 0; /* no recs yet */
return fd_utmp; /* report */
}
struct utmp *utmp_next()
{
struct utmp *recp;
//struct utmp *nextRecp;
int match = 0;
if(recp->ut_type == USER_PROCESS)
{
recp = ( struct utmp *) &utmpbuf[cur_rec * UTSIZE];
}
while(recp->ut_type!= USER_PROCESS)
{
if ( fd_utmp == -1 ) /* error ? */
return NULLUT;
if ( cur_rec==num_recs && utmp_reload()==0 ) /* any more ? */
return NULLUT;
/* get address of next record */
recp = ( struct utmp *) &utmpbuf[cur_rec * UTSIZE];
cur_rec++;
}
return recp;
}
int utmp_reload()
/*
* read next bunch of records into buffer
*/
{
int amt_read;
/* read them in */
amt_read = read( fd_utmp , utmpbuf, NRECS * UTSIZE );
/* how many did we get? */
num_recs = amt_read/UTSIZE;
/* reset pointer */
cur_rec = 0;
return num_recs;
}
utmp_close()
{
if ( fd_utmp != -1 ) /* don't close if not */
close( fd_utmp ); /* open */
}
utmplib.h:
#ifndef UTMPLIB_H
#define UTMPLIB_H
/* This file was automatically generated. Do not edit! */
utmp_close();
int utmp_reload();
struct utmp *utmp_next();
utmp_open(char *filename);
#endif
But I can not compile these files, I'm getting this error:
who.h:9:6: error: conflicting types for ‘show_info’
void show_info(struct utmp *utbufp);
I didn't write these two file. First of all why there are duplicated declarations? And how can I fix it?
I generated header files using makeheaders.
Actually these files are working example for who command in Linux or basic version of who.c in GNU core utils. Taken from here
who.c compiling without any error and working seamlessly(if you change Who function name with main):
cc who.c utmplib.c -o who
who

You can override flags when using automake, e.g., 'make CFLAGS=-O0 file.o' to disable optimization for file.c

searching an lzw encoded file

I am looking to alter the LZW compressor to enable it to search for a word in an LZW encoded file and finds the number of matches for that search term.
For example if my file is used as
Prompt:>lzw "searchterm" encoded_file.lzw
32
Any suggestions on how to achive this?
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define BITS 12 /* Setting the number of bits to 12, 13*/
#define HASHING_SHIFT (BITS-8) /* or 14 affects several constants. */
#define MAX_VALUE (1 << BITS) - 1 /* Note that MS-DOS machines need to */
#define MAX_CODE MAX_VALUE - 1 /* compile their code in large model if*/
/* 14 bits are selected. */
#if BITS == 16
#define TABLE_SIZE 99991
#endif
#if BITS == 14
#define TABLE_SIZE 18041 /* The string table size needs to be a */
#endif /* prime number that is somewhat larger*/
#if BITS == 13 /* than 2**BITS. */
#define TABLE_SIZE 9029
#endif
#if BITS <= 12
#define TABLE_SIZE 5021
#endif
void *malloc();
int *code_value; /* This is the code value array */
unsigned int *prefix_code; /* This array holds the prefix codes */
unsigned char *append_character; /* This array holds the appended chars */
unsigned char decode_stack[4000]; /* This array holds the decoded string */
/*
* Forward declarations
*/
void compress(FILE *input,FILE *output);
void expand(FILE *input,FILE *output);
int find_match(int hash_prefix,unsigned int hash_character);
void output_code(FILE *output,unsigned int code);
unsigned int input_code(FILE *input);
unsigned char *decode_string(unsigned char *buffer,unsigned int code);
/********************************************************************
**
** This program gets a file name from the command line. It compresses the
** file, placing its output in a file named test.lzw. It then expands
** test.lzw into test.out. Test.out should then be an exact duplicate of
** the input file.
**
*************************************************************************/
main(int argc, char *argv[])
{
FILE *input_file;
FILE *output_file;
FILE *lzw_file;
char input_file_name[81];
char command;
command=(argv==3);
/*
** The three buffers are needed for the compression phase.
*/
code_value=(int*)malloc(TABLE_SIZE*sizeof(int));
prefix_code=(unsigned int *)malloc(TABLE_SIZE*sizeof(unsigned int));
append_character=(unsigned char *)malloc(TABLE_SIZE*sizeof(unsigned char));
if (code_value==NULL || prefix_code==NULL || append_character==NULL)
{
printf("Fatal error allocating table space!\n");
exit(-1);
}
/*
** Get the file name, open it up, and open up the lzw output file.
*/
if (argc>1)
strcpy(input_file_name,argv[1]);
else
{
printf("Input file name? ");
scanf("%s",input_file_name);
}
input_file=fopen(input_file_name,"rb");
lzw_file=fopen("test.lzw","wb");
if (input_file==NULL || lzw_file==NULL)
{
printf("Fatal error opening files.\n");
exit(-1);
};
/*
** Compress the file.
*/
if(command=='r')
{
compress(input_file,lzw_file);
}
fclose(input_file);
fclose(lzw_file);
free(c-ode_value);
/*
** Now open the files for the expansion.
*/
lzw_file=fopen("test.lzw","rb");
output_file=fopen("test.out","wb");
if (lzw_file==NULL || output_file==NULL)
{
printf("Fatal error opening files.\n");
exit(-2);
};
/*
** Expand the file.
*/
expand(lzw_file,output_file);
fclose(lzw_file);
fclose(output_file);
free(prefix_code);
free(append_character);
}
/*
** This is the compression routine. The code should be a fairly close
** match to the algorithm accompanying the article.
**
*/
void compress(FILE *input,FILE *output)
{
unsigned int next_code;
unsigned int character;
unsigned int string_code;
unsigned int index;
int i;
next_code=256; /* Next code is the next available string code*/
for (i=0;i<TABLE_SIZE;i++) /* Clear out the string table before starting */
code_value[i]=-1;
i=0;
printf("Compressing...\n");
string_code=getc(input); /* Get the first code */
/*
** This is the main loop where it all happens. This loop runs util all of
** the input has been exhausted. Note that it stops adding codes to the
** table after all of the possible codes have been defined.
*/
while ((character=getc(input)) != (unsigned)EOF)
{
if (++i==1000) /* Print a * every 1000 */
{ /* input characters. This */
i=0; /* is just a pacifier. */
printf("*");
}
index=find_match(string_code,character);/* See if the string is in */
if (code_value[index] != -1) /* the table. If it is, */
string_code=code_value[index]; /* get the code value. If */
else /* the string is not in the*/
{ /* table, try to add it. */
if (next_code <= MAX_CODE)
{
code_value[index]=next_code++;
prefix_code[index]=string_code;
append_character[index]=character;
}
output_code(output,string_code); /* When a string is found */
string_code=character; /* that is not in the table*/
} /* I output the last string*/
} /* after adding the new one*/
/*
** End of the main loop.
*/
output_code(output,string_code); /* Output the last code */
output_code(output,MAX_VALUE); /* Output the end of buffer code */
output_code(output,0); /* This code flushes the output buffer*/
printf("\n");
}
/*
** This is the hashing routine. It tries to find a match for the prefix+char
** string in the string table. If it finds it, the index is returned. If
** the string is not found, the first available index in the string table is
** returned instead.
*/
int find_match(int hash_prefix,unsigned int hash_character)
{
int index;
int offset;
index = (hash_character << HASHING_SHIFT) ^ hash_prefix;
if (index == 0)
offset = 1;
else
offset = TABLE_SIZE - index;
while (1)
{
if (code_value[index] == -1)
return(index);
if (prefix_code[index] == hash_prefix &&
append_character[index] == hash_character)
return(index);
index -= offset;
if (index < 0)
index += TABLE_SIZE;
}
}
/*
** This is the expansion routine. It takes an LZW format file, and expands
** it to an output file. The code here should be a fairly close match to
** the algorithm in the accompanying article.
*/
void expand(FILE *input,FILE *output)
{
unsigned int next_code;
unsigned int new_code;
unsigned int old_code;
int character;
int counter;
unsigned char *string;
next_code=256; /* This is the next available code to define */
counter=0; /* Counter is used as a pacifier. */
printf("Expanding...\n");
old_code=input_code(input); /* Read in the first code, initialize the */
character=old_code; /* character variable, and send the first */
putc(old_code,output); /* code to the output file */
/*
** This is the main expansion loop. It reads in characters from the LZW file
** until it sees the special code used to inidicate the end of the data.
*/
while ((new_code=input_code(input)) != (MAX_VALUE))
{
if (++counter==1000) /* This section of code prints out */
{ /* an asterisk every 1000 characters */
counter=0; /* It is just a pacifier. */
printf("*");
}
/*
** This code checks for the special STRING+CHARACTER+STRING+CHARACTER+STRING
** case which generates an undefined code. It handles it by decoding
** the last code, and adding a single character to the end of the decode string.
*/
if (new_code>=next_code)
{
*decode_stack=character;
string=decode_string(decode_stack+1,old_code);
}
/*
** Otherwise we do a straight decode of the new code.
*/
else
string=decode_string(decode_stack,new_code);
/*
** Now we output the decoded string in reverse order.
*/
character=*string;
while (string >= decode_stack)
putc(*string--,output);
/*
** Finally, if possible, add a new code to the string table.
*/
if (next_code <= MAX_CODE)
{
prefix_code[next_code]=old_code;
append_character[next_code]=character;
next_code++;
}
old_code=new_code;
}
printf("\n");
}
/*
** This routine simply decodes a string from the string table, storing
** it in a buffer. The buffer can then be output in reverse order by
** the expansion program.
*/
unsigned char *decode_string(unsigned char *buffer,unsigned int code)
{
int i;
i=0;
while (code > 255)
{
*buffer++ = append_character[code];
code=prefix_code[code];
if (i++>=MAX_CODE)
{
printf("Fatal error during code expansion.\n");
exit(-3);
}
}
*buffer=code;
return(buffer);
}
/*
** The following two routines are used to output variable length
** codes. They are written strictly for clarity, and are not
** particularyl efficient.
*/
unsigned int input_code(FILE *input)
{
unsigned int return_value;
static int input_bit_count=0;
static unsigned long input_bit_buffer=0L;
while (input_bit_count <= 24)
{
input_bit_buffer |=
(unsigned long) getc(input) << (24-input_bit_count);
input_bit_count += 8;
}
return_value=input_bit_buffer >> (32-BITS);
input_bit_buffer <<= BITS;
input_bit_count -= BITS;
return(return_value);
}
void output_code(FILE *output,unsigned int code)
{
static int output_bit_count=0;
static unsigned long output_bit_buffer=0L;
output_bit_buffer |= (unsigned long) code << (32-BITS-output_bit_count);
output_bit_count += BITS;
while (output_bit_count >= 8)
{
putc(output_bit_buffer >> 24,output);
output_bit_buffer <<= 8;
output_bit_count -= 8;
}
}

Here's a document on an algorithm to do regex searching directly in LZW compressed bytes :
http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.9.1434&rep=rep1&type=pdf
It contains references to efficient algorithms to search for exact strings as well.

libjpeg decompress to RAW not working

I am on a RHEL 6.0 x86_64 box with the following version of libjpeg.
[mehoggan#hogganz400 jpeg_to_raw.c]$ rpm -qa libjpeg
libjpeg-6b-46.el6.x86_64
I have the following code which takes as its input a .jpeg file, and writes out a .raw file. When I run the program the size of the file expands, which leads me to believe the program is working:
[mehoggan#hogganz400 jpeg_to_raw.c]$ ls -l
total 600
-rwxrwxr-x 1 mehoggan mehoggan 10113 Dec 1 10:32 jpeg_to_raw
-rw-rw-r-- 1 mehoggan mehoggan 3311 Dec 1 10:32 jpeg_to_raw.c
-rw-rw-r-- 1 mehoggan mehoggan 75 Dec 1 10:27 Makefile
-rw-rw-r-- 1 mehoggan mehoggan 215205 Dec 1 09:19 test.jpg
-rw-rw-r-- 1 mehoggan mehoggan 374850 Dec 1 10:32 test_out.raw
However when I open up the file using Irfanview (and associated plugins) only a small portion of my image opens up. The code can be found below:
#include <stdio.h>
#include <jpeglib.h>
#include <stdlib.h>
#include <unistd.h>
/* we will be using this uninitialized pointer later to store raw, uncompressd image */
unsigned char *raw_image = NULL;
unsigned int size;
/**
* print the information for what was stored in the JPEG File
**/
void print_jpeg_info(struct jpeg_decompress_struct cinfo)
{
printf("JPEG File Information: \n");
printf("Image width and height: %d pixels and %d pixels.\n", cinfo.image_width, cinfo.image_height);
printf("Color components per pixel: %d.\n", cinfo.num_components);
printf("Color space: %d.\n", cinfo.jpeg_color_space);
printf("Raw flag is: %d.\n", cinfo.raw_data_out);
}
/**
* read_jpeg_file Reads from a jpeg file on disk specified by filename and saves into the
* raw_image buffer in an uncompressed format.
*
* \returns positive integer if successful, -1 otherwise
* \param *filename char string specifying the file name to read from
**/
int read_jpeg_file(char *filename)
{
/* these are standard libjpeg structures for reading(decompression) */
struct jpeg_decompress_struct cinfo;
struct jpeg_error_mgr jerr;
/* libjpeg data structure for storing one row, that is, scanline of an image */
JSAMPROW row_pointer[1];
FILE *infile = fopen(filename, "rb");
unsigned long location = 0;
int i = 0;
if (!infile) {
printf("Error opening jpeg file %s\n!", filename);
return -1;
}
/* here we set up the standard libjpeg error handler */
cinfo.err = jpeg_std_error(&jerr);
/* setup decompression process and source, then read JPEG header */
jpeg_create_decompress(&cinfo);
/* this makes the library read from infile */
jpeg_stdio_src(&cinfo, infile);
/* reading the image header which contains image information */
jpeg_read_header(&cinfo, TRUE);
print_jpeg_info(cinfo);
jpeg_start_decompress(&cinfo);
/* allocate memory to hold the uncompressed image */
size = cinfo.output_width*cinfo.output_height*cinfo.num_components;
raw_image = (unsigned char*)malloc(size);
/* now actually read the jpeg into the raw buffer */
row_pointer[0] = (unsigned char *)malloc(cinfo.output_width*cinfo.num_components);
/* read one scan line at a time */
while (cinfo.output_scanline < cinfo.image_height) {
jpeg_read_scanlines( &cinfo, row_pointer, 1 );
for (i=0; i<cinfo.image_width*cinfo.num_components;i++) {
raw_image[location++] = row_pointer[0][i];
}
}
/* wrap up decompression, destroy objects, free pointers and close open files */
jpeg_finish_decompress(&cinfo);
jpeg_destroy_decompress(&cinfo);
free(row_pointer[0]);
fclose(infile);
/* yup, we succeeded! */
return 1;
}
int main(int argc, char *argv[])
{
char *infilename = "test.jpg";
if (read_jpeg_file(infilename) > 0) {
size_t count = size / sizeof(unsigned char*);
fprintf(stdout, "The number of unsigned chars in raw_image = %d\n", (int)count);
FILE *ofile = fopen("test_out.raw", "w+");
ssize_t data_out = fwrite(raw_image, count, sizeof(unsigned char), ofile);
fprintf(stdout, "%d", (int)data_out);
fclose(ofile);
}
else
return -1;
return 0;
}
What is your take on why the program is not writing out all the data? Or why is it possibly corrupting the data?
The makefile used to build this simple app is:
jpeg_to_raw : jpeg_to_raw.c
gcc jpeg_to_raw.c -Wall -o jpeg_to_raw -ljpeg

size_t count = size / sizeof(unsigned char*);
gets you only 1/4 of the raw-data written (actually, in 64 bit mode only 1/8th).
count should be same as size (counting chars, not pointers)