How to get LZO to work with a file stream? - c

I am trying to compress a file stream with LZO and not getting very far. Specifically, I get a segmentation fault when extracting the archive file created by my compressFileWithLzo1x function.
My main function and prototype declarations are:
#include <stdio.h>
#include <stdlib.h>
#include "lzo/include/lzo/lzo1x.h"
#define LZO_IN_CHUNK (128*1024L)
#define LZO_OUT_CHUNK (LZO_IN_CHUNK + LZO_IN_CHUNK/16 + 64 + 3)
int compressFileWithLzo1x(const char *inFn, const char *outFn);
int extractFileWithLzo1x(const char *inFn);
/* Round-trip driver: compress test.txt, then extract the resulting archive.
 * The process terminates with EXIT_FAILURE as soon as either step fails. */
int main(int argc, char **argv) {
    const char *plainName = "test.txt";
    const char *packedName = "test.txt.lzo1x";
    /* || short-circuits, so extraction is attempted only after a successful compression. */
    int failed = compressFileWithLzo1x(plainName, packedName) != 0
              || extractFileWithLzo1x(packedName) != 0;

    if (failed)
        exit(EXIT_FAILURE);
    return 0;
}
Here is the implementation of my compression function:
int compressFileWithLzo1x(const char *inFn, const char *outFn) {
FILE *inFnPtr = fopen(outFn, "r");
FILE *outFnPtr = fopen(outFn, "wb");
int compressionResult;
lzo_bytep in;
lzo_bytep out;
lzo_voidp wrkmem;
lzo_uint out_len;
size_t inResult;
if (lzo_init() != LZO_E_OK)
return -1;
in = (lzo_bytep)malloc(LZO_IN_CHUNK);
out = (lzo_bytep)malloc(LZO_OUT_CHUNK);
wrkmem = (lzo_voidp)malloc(LZO1X_1_MEM_COMPRESS);
do {
inResult = fread(in, sizeof(lzo_byte), LZO_IN_CHUNK, inFnPtr);
if (inResult == 0)
break;
compressionResult = lzo1x_1_compress(in, LZO_IN_CHUNK, out, &out_len, wrkmem);
if ((out_len >= LZO_IN_CHUNK) || (compressionResult != LZO_E_OK))
return -1;
if (fwrite(out, sizeof(lzo_byte), (size_t)out_len, outFnPtr) != (size_t)out_len || ferror(outFnPtr))
return -1;
fflush(outFnPtr);
} while (!feof(inFnPtr) && !ferror(inFnPtr));
free(wrkmem);
free(out);
free(in);
fclose(inFnPtr);
fclose(outFnPtr);
return 0;
}
Here is the implementation of my decompression function:
int extractFileWithLzo1x(const char *inFn) {
FILE *inFnPtr = fopen(inFn, "rb");
lzo_bytep in = (lzo_bytep)malloc(LZO_IN_CHUNK);
lzo_bytep out = (lzo_bytep)malloc(LZO_OUT_CHUNK);
int extractionResult;
size_t inResult;
lzo_uint new_length;
if (lzo_init() != LZO_E_OK)
return -1;
do {
new_length = LZO_IN_CHUNK;
inResult = fread(in, sizeof(lzo_byte), LZO_IN_CHUNK, inFnPtr);
extractionResult = lzo1x_decompress(out, LZO_OUT_CHUNK, in, &new_length, NULL);
if ((extractionResult != LZO_E_OK) || (new_length != LZO_IN_CHUNK))
return -1;
fprintf(stderr, "out: [%s]\n", (unsigned char *)out);
} while (!feof(inFnPtr) && (!ferror(inFnPtr));
free(in);
free(out);
fclose(inFnPtr);
return 0;
}
The segmentation fault occurs here:
extractionResult = lzo1x_decompress(out, LZO_OUT_CHUNK, in, &new_length, NULL);
What is wrong with this approach that is causing the segmentation fault?
I hope I haven't left any code out this time. Feel free to let me know if I need to add more information. Thanks in advance for your advice.

You're compressing independent blocks. The LZO decompressor needs the byte length of the compressed data because when it decodes EOF it checks whether it has consumed all the input bytes (and returns an error if it hasn't) so you need to store the length of each compressed chunk as well. Thus you need a more complex file format. For example:
# compressing, in python-like pseudocode
ifile = open("data", "rb")
ofile = open("data.mylzo", "wb")
input, input_len = ifile.read(65536)
while input_len > 0:
compressed, compressed_len = lzo1x(input, input_len)
compressed_len -= 1 # store len-1 of next block
if compressed_len < 65536 - 1:
ofile.write(compressed_len & 255) # be sure of endianness in file formats!
ofile.write(compressed_len >> 8)
ofile.write(compressed)
else:
ofile.write(255) # incompressible block stored it as-is (saves space & time).
ofile.write(255)
ofile.write(input)
input, input_len = ifile.read(65536)
ofile.close()
ifile.close()
# decompressing, in python-like pseudocode
ifile = open("data.mylzo", "rb")
ofile = open("data", "wb")
compressed_len_s = ifile.read(2)
while len(compressed_len_s) == 2:
compressed_len = (compressed_len_s[0] | (compressed_len_s[1] << 8)) + 1
if compressed_len == 65536:
ofile.write(ifile.read(65536)) # this can be done without copying
else:
compressed = ifile.read(compressed_len)
decompressed = lzo1x_decompress(compressed, compressed_len)
ofile.write(decompressed)
compressed_len_s = ifile.read(2)
ofile.close()
ifile.close()
If you want to be able to decompress the chunks without skipping (either for decompression in parallel or random access) you should place the lengths of compressed chunks at the beginning, before the first chunk. Precede them with the number of chunks.
The last chunk can be shorter than 64k, and it can be incompressible but we'll still store the compressed form, even though it's longer than the non-compressed form, because only full 64k blocks are stored as-is. If entire file is shorter than 64k, it will grow.

The code you've given won't compile (spurious = in the #defines; inFilePtr instead of inFnPtr in various places, etc.). But:
When compressing, you are not taking account of the actual amount of data returned by the fread(), which might well be less than LZO_IN_CHUNK.
compressionResult = lzo1x_1_compress(in, LZO_IN_CHUNK, out, &out_len, wrkmem);
should probably be
compressionResult = lzo1x_1_compress(in, inResult, out, &out_len, wrkmem);
(This is unlikely to be the problem, but will add bogus junk at the end of the file.)
When decompressing, you have a similar problem, and the in / out arguments are the wrong way round, which is likely to be the cause of your segfault.
extractionResult = lzo1x_decompress(out, LZO_OUT_CHUNK, in, &new_length, NULL);
should probably be
extractionResult = lzo1x_decompress(in, inResult, out, &new_length, NULL);

I think you are opening the wrong file in int compressFileWithLzo1x:
FILE *inFnPtr = fopen(outFn, "r");
it should be
FILE *inFnPtr = fopen(inFn, "r");

Related

LZW Compression

The LZW compression algorithm is increasing the size in bits after compression:
Here is the code for Compression function:
// compression
/*
 * LZW-compress inputFile into outputFile.
 *
 * The dictionary starts with the 256 single-byte codes; new (prefix, character)
 * strings receive codes from 256 upwards while nextCode < dictionarySize.
 * Codes are emitted through writeBinary(), and a trailing half-written code is
 * flushed at the end. An empty input produces no output.
 */
void compress(FILE *inputFile, FILE *outputFile) {
    int prefix;
    int character;
    int nextCode = 256; /* next available string code */
    int index;

    /* Seed the prefix with the first byte; it must never be used uninitialized. */
    prefix = getc(inputFile);
    if (prefix == EOF)
        return;

    dictionaryInit();

    while ((character = getc(inputFile)) != EOF) {
        index = dictionaryLookup(prefix, character);
        if (index != -1) {
            /* prefix+character is already known: keep extending the match */
            prefix = index;
        } else {
            /* emit the code for the longest known string ... */
            writeBinary(outputFile, prefix);
            /* ... remember prefix+character while there is room ... */
            if (nextCode < dictionarySize)
                dictionaryAdd(prefix, character, nextCode++);
            /* ... and restart matching from the current character */
            prefix = character;
        }
    }

    /* emit the code of the final pending string */
    writeBinary(outputFile, prefix);
    if (leftover > 0) {
        fputc(leftoverBits << 4, outputFile);
        leftover = 0; /* leave the bit buffer clean for a later call */
    }

    dictionaryDestroy();
}
Where the writeBinary (It acts like a buffer in the program) function is as follows:
void writeBinary(FILE * output, int code);
int leftover = 0;
int leftoverBits;
/*
 * Pack codes into the stream two at a time (three bytes per pair).
 *
 * First code of a pair:  its upper bits (code >> 4) are written immediately and
 *                        its low four bits are parked in leftoverBits.
 * Second code of a pair: the parked nibble is combined with the upper bits of
 *                        this code (code >> 8) into one byte, followed by the
 *                        code's low byte.
 */
void writeBinary(FILE * output, int code) {
    if (leftover <= 0) {
        /* nothing parked: start a new pair */
        leftoverBits = code & 0xF;
        leftover = 1;
        fputc(code >> 4, output);
        return;
    }

    /* finish the pair started by the previous call */
    fputc((leftoverBits << 4) + (code >> 8), output);
    fputc(code, output);
    leftover = 0;
}
Can you spot the error, please? I'll be grateful!
chux already pointed you to the solution: You need to start out with 9-bit codes, and increase the code size up to 12 whenever the available codes for the current bit size are exhausted. If you're writing 12-bit codes from the beginning, there's no compression effect, of course.

Get secondary storage details using C on Linux

I need a simple way to get the secondary storage details (like total size, used and free space) in (daemon) C code for Linux.
These are the things I tried:
statvfs - don't know how to get disk details instead of files
Using system (df -h --total | grep total > disk.stat) in the C code and then read the file.
But the above involves file write and read which is not efficient cause this C code is a daemon which will be polling the system details continuously as input to a graph generation.
If there is no other way, please suggest a simple and fast IPC mechanism, with an example, for communication between this bash command and the C code.
/*
 * @brief Returns the total percentage of secondary storage used.
 *
 * - runs `df -h --total` through the shell and stores its "total" row in disk.stat
 * - reads the fifth whitespace-separated field of that row as an integer
 *
 * @return the parsed percentage, or -1 if the command could not be started,
 *         disk.stat could not be opened, or the field could not be parsed.
 */
int calculate_storage_size( )
{
    int storage_size_percent = -1;
    FILE *fp;

    if ( system("df -h --total | grep total > disk.stat") < 0 )
        return -1;

    fp = fopen("disk.stat", "r");
    if (fp == NULL)
        return -1;

    /* %*s skips a field without storing it, so no scratch buffer can overflow. */
    if (fscanf(fp, "%*s %*s %*s %*s %d", &storage_size_percent) != 1)
        storage_size_percent = -1;

    fclose(fp);
    return storage_size_percent;
}
I would suggest it would be better to let the user specify which mounts should be considered in the total, or use a heuristic to omit system and temporary mounts.
Consider the following example, info.c:
#define _POSIX_C_SOURCE 200809L
#define _GNU_SOURCE
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <sys/statvfs.h>
#include <mntent.h>
#include <string.h>
#include <errno.h>
#include <stdio.h>
/* Release a NULL-terminated array of heap-allocated strings, then the array
 * itself. Passing NULL is allowed and does nothing. */
static void free_array(char **array)
{
    char **cursor;

    if (array == NULL)
        return;

    for (cursor = array; *cursor != NULL; cursor++) {
        free(*cursor);
        *cursor = NULL;
    }
    free(array);
}
/*
 * Build a NULL-terminated, heap-allocated list of "normal" mount points read
 * from /proc/mounts. Entries whose fsname is "tmpfs" or "swap", and those
 * mounted at or under /proc, /boot, /sys, /run, /dev, /mnt, /media and
 * /var/run, are skipped.
 *
 * Returns the list (release it with free_array()) with errno set to 0.
 * Returns NULL with errno == 0 when no mount point qualified, and NULL with
 * errno set to ENOMEM or EIO on failure. If /proc/mounts cannot be opened,
 * NULL is returned and errno is left as fopen() set it.
 *
 * Invariant: whenever list is non-NULL, list[used] is NULL, so free_array()
 * is safe on every error path.
 */
static char **normal_mounts(void)
{
    char **list = NULL, **temp;
    size_t size = 0;    /* slots allocated in list */
    size_t used = 0;    /* mount points stored so far */
    char buffer[4096];
    struct mntent entry;
    FILE *mounts;

    mounts = fopen("/proc/mounts", "r");
    if (!mounts)
        return NULL;

    while (getmntent_r(mounts, &entry, buffer, sizeof buffer) == &entry)
        if (strcmp(entry.mnt_fsname, "tmpfs") &&
            strcmp(entry.mnt_fsname, "swap") &&
            strcmp(entry.mnt_dir, "/proc") && strncmp(entry.mnt_dir, "/proc/", 6) &&
            strcmp(entry.mnt_dir, "/boot") && strncmp(entry.mnt_dir, "/boot/", 6) &&
            strcmp(entry.mnt_dir, "/sys") && strncmp(entry.mnt_dir, "/sys/", 5) &&
            strcmp(entry.mnt_dir, "/run") && strncmp(entry.mnt_dir, "/run/", 5) &&
            strcmp(entry.mnt_dir, "/dev") && strncmp(entry.mnt_dir, "/dev/", 5) &&
            strcmp(entry.mnt_dir, "/mnt") && strncmp(entry.mnt_dir, "/mnt/", 5) &&
            strcmp(entry.mnt_dir, "/media") && strncmp(entry.mnt_dir, "/media/", 7) &&
            strcmp(entry.mnt_dir, "/var/run") && strncmp(entry.mnt_dir, "/var/run/", 9)) {

            /* Keep room for the new entry AND the terminator after it. */
            if (used + 1 >= size) {
                size = (used | 15) + 17;
                temp = realloc(list, size * sizeof list[0]);
                if (!temp) {
                    endmntent(mounts);
                    free_array(list);
                    errno = ENOMEM;
                    return NULL;
                }
                list = temp;
                list[used] = NULL;
            }

            list[used] = strdup(entry.mnt_dir);
            if (!list[used]) {
                /* list[used] is NULL here, so the list is still terminated. */
                endmntent(mounts);
                free_array(list);
                errno = ENOMEM;
                return NULL;
            }
            list[++used] = NULL;
        }

    if (ferror(mounts) || !feof(mounts)) {
        endmntent(mounts);
        free_array(list);
        errno = EIO;
        return NULL;
    } else
        endmntent(mounts);

    if (!used) {
        free_array(list);
        errno = 0;
        return NULL;
    }

    /* Trim unused slots; if shrinking fails the larger list is still valid. */
    if (size != used + 1) {
        temp = realloc(list, (used + 1) * sizeof list[0]);
        if (temp) {
            list = temp;
            size = used + 1;
        }
    }

    list[used] = NULL;
    errno = 0;
    return list;
}
/*
 * Sum the total and available bytes over a NULL-terminated array of mount points.
 *
 * mountpoint   NULL-terminated array of paths; must not be NULL.
 * bytes_total  if non-NULL, receives the summed file system sizes in bytes.
 * bytes_free   if non-NULL, receives the summed bytes available to
 *              unprivileged users (f_bavail).
 *
 * Returns 0 on success. Returns EINVAL (also stored in errno) when mountpoint
 * is NULL, or the errno left by statvfs() for the first path that fails; the
 * output arguments are not written on failure.
 */
static int statistics(const char **mountpoint, uint64_t *bytes_total, uint64_t *bytes_free)
{
    struct statvfs info;
    uint64_t btotal = 0;
    uint64_t bfree = 0;
    size_t i;

    if (!mountpoint)
        return errno = EINVAL;

    for (i = 0; mountpoint[i] != NULL; i++) {
        if (statvfs(mountpoint[i], &info) == -1)
            return errno;
        /* f_blocks and f_bavail are both counted in f_frsize units, not f_bsize. */
        btotal += (uint64_t)info.f_frsize * (uint64_t)info.f_blocks;
        bfree += (uint64_t)info.f_frsize * (uint64_t)info.f_bavail;
    }

    if (bytes_total)
        *bytes_total = btotal;
    if (bytes_free)
        *bytes_free = bfree;
    return 0;
}
/*
 * Print the total and free bytes of the selected file systems.
 *
 * With command line arguments, those paths are used as the mount points;
 * otherwise the list from normal_mounts() is used and echoed to standard error.
 */
int main(int argc, char *argv[])
{
    uint64_t total = 0;
    uint64_t nfree = 0;
    char **discovered = NULL;   /* owned list; stays NULL when argv is used */
    const char **points;

    if (argc > 1) {
        points = (const char **)argv + 1;
    } else {
        char **p;

        discovered = normal_mounts();
        if (!discovered) {
            if (errno)
                fprintf(stderr, "Error determining file systems: %s.\n", strerror(errno));
            else
                fprintf(stderr, "No normal file systems found.\n");
            return EXIT_FAILURE;
        }

        fprintf(stderr, "Considering mount points");
        for (p = discovered; *p != NULL; p++)
            fprintf(stderr, " %s", *p);
        fprintf(stderr, "\n");

        points = (const char **)discovered;
    }

    if (statistics(points, &total, &nfree)) {
        fprintf(stderr, "%s.\n", strerror(errno));
        return EXIT_FAILURE;
    }

    /* free_array(NULL) is a no-op, so this is harmless on the argv path. */
    free_array(discovered);

    printf("%20" PRIu64 " bytes total\n", total);
    printf("%20" PRIu64 " bytes free\n", nfree);
    return EXIT_SUCCESS;
}
The statistics() function takes a NULL-terminated array of mount points, and two pointers to unsigned 64-bit integers. The function returns 0 if successful, and a nonzero errno code otherwise. If successful, the function will set the total number of bytes in the filesystems to the first integer, and the number of free bytes in the second.
If you supply one or more mounts points as command line arguments, only those are considered. (POSIX says argv[argc] == NULL, so this usage is safe.)
Otherwise, the normal_mounts() function is used to parse /proc/mounts to obtain a list of "normal" mount points. The function uses getmntent() to read each entry (line) from the kernel-provided pseudo-file. All tmpfs (ramdisks) and swap filesystems are excluded, as are those mounted at or under /proc, /boot, /sys, /run, /dev, /mnt, /media, and /var/run. This is just a crude heuristic, not a known good approach.
In a daemon, or even in a graphical application, you call only (your equivalent of) the statistics() function, with the same array of mount points. You could even consider tracking each mount point separately, and let the user filter and combine the information they are interested in. In fact, I would recommend that: I personally might be interested in seeing the fluctuations in my temporary file usage (on machines where /tmp and /var/tmp are tmpfs mounts), as well as track my long-term usage of /home.
In a daemon, you can use HUP or USR1 or USR2 signals to indicate when the user wants you to reload the configuration -- the mount point list, here. I do not believe it would be that interesting to integrate it to DBUS for detecting removable media mounts/unmounts, but of course you can if you think it useful.
If you compile the above program using e.g.
gcc -Wall -O2 info.c -o info
and run
./info
it will output something like
Considering mount points / /home
119989497856 bytes total
26786156544 bytes free
where the first line is output to standard error, and the bytes lines to standard output. You can also specifically name the mount points -- make sure they are different, as the code does not check for duplicate mounts --:
./info /home /tmp
If you are wondering how you could determine whether two directories are on the same mount or not: call stat(path1, &info1) on one, and stat(path2, &info2) on the other. If and only if (info1.st_dev == info2.st_dev), the two paths are on the same mount. (One device may be mounted multiple times at different points, using e.g. bind mounts, but usually the above check suffices.)
If you find all the above code annoying, you can always rely on the df utility. To ensure the output is in the C/POSIX locale (and not, say, in French or Finnish), use
handle = popen("LANG=C LC_ALL=C df -Pl", "r");
or similar, and read the output using len = getline(&line, &size, handle).
You can use popen() instead of system()/fopen(): The system will give you a readable file without using hard-drive.
There's no portable ANSI C mechanism except the system and file kludge, and even that is a bit of an illusion as it depends on df being present. However the Posix function popen() does essentially the same thing, but gives you the output as a FILE *.
#include <stdio.h>
#include <stdlib.h>
#include <sys/statvfs.h>
/* Print the available and total size, in units of 10^9 bytes, of the file
 * system mounted at /media/hp. Exits with EXIT_FAILURE if statvfs() fails. */
int main( )
{
    struct statvfs info;
    unsigned long long avail_bytes;
    unsigned long long total_bytes;

    if (statvfs("/media/hp", &info) != 0) {
        perror("statvfs");
        return EXIT_FAILURE;
    }

    /* f_bavail and f_blocks are both counted in f_frsize units; widen before
     * multiplying so large volumes do not overflow. */
    avail_bytes = (unsigned long long)info.f_frsize * info.f_bavail;
    total_bytes = (unsigned long long)info.f_frsize * info.f_blocks;

    printf("\n\navail size --%llu GB\n\n", avail_bytes / 1000000000ULL);
    printf("\n\nblocks size --%llu GB\n\n", total_bytes / 1000000000ULL);
    return EXIT_SUCCESS;
}
I finally did it with statvfs itself; it works fine.

zlib inflateReset causes memory leak (not)

I am currently working on the below requirement.
Here is the requirement: On the server side a large file is divided into 4000-byte blocks (frames). Each block is in turn compressed (using zlib) and sent to client process. For instance, if a file is 12000 bytes in size then it is divided into 3 blocks.
Above file will have 3 blocks => Block-0, Block-1, Block-2
On receipt, the client decompresses each block (or frame) and writes it to a buffer allocated on the heap. When all the blocks corresponding to the entire file have been received by the client, the uncompressed version of the resultant file is written to the disk.
I have written a routine inflateData that does the following based on the block # received:
When the first block is received,
- inflateInit
- inflate
- inflateReset
When the intermediate blocks are received,
- inflate
- inflateReset
When the last block is received,
- inflate
- inflateEnd
With the above routine, Decompression of blocks happens as expected. But the issue that I face is it consumes lots of memory and at some point entire system slows down. When checked with valgrind, memory leak is reported with inflateInit2_. This causes the system resources to be exhausted.
==30359== 57,312 bytes in 6 blocks are possibly lost in loss record 64 of 67
==30359== at 0x4A069EE: malloc (vg_replace_malloc.c:270)
==30359== by 0x3E57808F1E: inflateInit2_ (in /lib64/libz.so.1.2.3)
==30359== by 0x40C220: inflateData (productMaker.c:1668)
Below is the routine inflateData.
/*
 * Inflate one compressed block (inBuf, inLen bytes) into outBuf and store the
 * number of bytes produced in *outLen.
 *
 * The z_stream is a function-level static, so it is shared by every call:
 *   - isFirstBlk != 0: the stream is zeroed and inflateInit() is called first.
 *   - isLastBlk  != 0: inflateEnd() is called after the block is processed.
 * inflateReset() is called after each pass of the outer loop.
 *
 * Returns 0 on success, -1 if inflateInit() fails, otherwise the zlib error
 * code from inflate(), inflateReset() or inflateEnd().
 *
 * NOTE(review): have, readsz, bsize, flush, nwrite, idx, firstCall and num are
 * never used for anything in this function.
 * NOTE(review): outBuf is declared const char* but is written through dstBuf.
 * NOTE(review): no capacity for outBuf is passed in, so the copies into dstBuf
 * are unbounded from this function's point of view.
 */
int inflateData(
char* const inBuf,
unsigned long inLen,
unsigned int isFirstBlk,
unsigned int isLastBlk,
const char* outBuf,
unsigned long* outLen)
{
int have;
int readsz;
int bsize;
/* static: the inflate state persists between calls */
static z_stream zstrm;
int zerr;
int flush;
char out[CHUNK_SZ];
char in[CHUNK_SZ];
int ret,nwrite,idx = -1;
int savedByteCntr=0;
unsigned char *dstBuf;
int firstCall = 1;
int totalBytesIn=0;
int inflatedBytes=0;
int decompByteCounter = 0;
int num=0;
ret = Z_OK;
readsz = 0;
bsize = CHUNK_SZ;
dstBuf = (unsigned char *) outBuf;
/* Only the first block initializes the stream. A second "first block" call
 * without an intervening inflateEnd() overwrites the previous state pointers. */
if(isFirstBlk){
memset(&zstrm, '\0', sizeof(z_stream));
zstrm.zalloc = Z_NULL;
zstrm.zfree = Z_NULL;
zstrm.opaque = Z_NULL;
if ((zerr = inflateInit(&zstrm)) != Z_OK) {
uerror("ERROR %d inflateInit (%s)",
zerr, decode_zlib_err(zerr));
return -1;
}
}
while(totalBytesIn < inLen ) {
/* Copy at most 5120 bytes of the remaining input into the local buffer.
 * NOTE(review): `in` holds CHUNK_SZ bytes; CHUNK_SZ is defined elsewhere, so
 * confirm it is at least 5120. */
int compChunkSize = ((inLen - totalBytesIn) > 5120) ? 5120 :
(inLen - totalBytesIn);
memcpy(in, inBuf + totalBytesIn, compChunkSize);
/* NOTE(review): avail_in is set to ALL remaining input, not to compChunkSize,
 * although only compChunkSize bytes were copied into `in` -- confirm intent. */
zstrm.avail_in = inLen - totalBytesIn;
zstrm.next_in = in ;
zstrm.avail_out = CHUNK_SZ;
zstrm.next_out = out;
inflatedBytes = 0;
while(ret != Z_STREAM_END) {
ret = inflate(&zstrm, Z_NO_FLUSH);
if(ret < 0) {
uerror(" Error %d inflate (%s)", ret, decode_zlib_err(ret));
(void)inflateEnd(&zstrm);
return ret;
}
/* NOTE(review): avail_out/next_out are not reset inside this loop, so this is
 * the amount written to `out` since the outer loop set them, not just by the
 * latest inflate() call. */
inflatedBytes = CHUNK_SZ - zstrm.avail_out;
if(inflatedBytes == 0) {
unotice("\n Unable to decompress data - truncated");
break;
}
/* NOTE(review): total_in is presumably zlib's running input count since the
 * last init/reset; adding it on every iteration may over-count -- verify. */
totalBytesIn += zstrm.total_in;
decompByteCounter += inflatedBytes;
memcpy(dstBuf + savedByteCntr, out, inflatedBytes);
savedByteCntr = decompByteCounter;
}
// Reset inflater for additional input
ret = inflateReset(&zstrm);
if(ret == Z_STREAM_ERROR){
uerror(" Error %d inflateReset (%s)", ret, decode_zlib_err(ret));
(void)inflateEnd(&zstrm);
return ret;
}
}
/* Only the last block releases the zlib state allocated by inflateInit(). */
if(isLastBlk){
ret = inflateEnd(&zstrm);
if(ret < 0) {
uerror("Fail inflateEnd %d [%s] ", ret, decode_zlib_err(ret));
return (ret);
}
}
*outLen = decompByteCounter;
return 0;
}
Thanks in advance for the support.
Thanks,
Sathya.
You are making an error in your use of your inflateData() routine.
First off, using a static variable in this way is a horrible idea. If you call your inflateData() twice with isFirstBlk true without an intermediate call with isLastBlk true, then you will wipe out the reference to the first set of allocations, resulting in a memory leak.
To avoid this sort of error, you should keep track of whether zstrm is initialized or not, and reject any attempt to initialize an already initialized stream. Better still would be to not even have an isFirstBlk, and simply initialize zstrm on the first call and on any call that immediately follows a call with isLastBlk true.
So you are either doing the above, calling twice with isFirstBlk true, or failing to call with isLastBlk true.

Initializing an infinite number of char **

I'm making a raytracing engine in C using the minilibX library.
I want to be able to read in a .conf file the configuration for the scene to display:
For example:
(Az#Az 117)cat universe.conf
#randomcomment
obj:eye:x:y:z
light:sun:100
light:moon:test
The number of objects can vary between 1 and the infinite.
From now on, I'm reading the file, copying each line 1 by 1 in a char **tab, and mallocing by the number of objects found, like this:
/*
 * Read the configuration from fd line by line and sort the lines into m:
 *   - lines starting with 'l' are stored in m->lights,
 *   - lines starting with 'o' are stored in m->objs,
 *   - lines starting with '#' are ignored,
 *   - any other line is reported with show_error() and stop_parsing(m) is called.
 *
 * NOTE(review): no bound on the number of entries written to m->lights and
 * m->objs is checked here; the arrays must already be large enough.
 * NOTE(review): ignored and rejected lines are not released here; presumably
 * get_next_line() returns allocated memory -- confirm ownership.
 * NOTE(review): the snippet ends before the function's closing brace.
 */
void open_file(int fd, struct s_img *m)
{
/* i counts the lines read so far; it is passed to show_error() */
int i;
char *s;
/* next free index in m->objs and m->lights respectively */
int curs_obj;
int curs_light;
i = 0;
curs_light = 0;
curs_obj = 0;
/* loop ends when get_next_line() returns a null pointer */
while (s = get_next_line(fd))
{
i = i + 1;
if (s[0] == 'l')
{
m->lights[curs_light] = s;
curs_light = curs_light + 1;
}
else if (s[0] == 'o')
{
m->objs[curs_obj] = s;
curs_obj = curs_obj + 1;
}
else if (s[0] != '#')
{
show_error(i, s);
stop_parsing(m);
}
}
Now, I want to be able to store each information of each tab[i] in a new char **tab, 1 for each object, using the ':' as a separation.
So I need to initialize and malloc an undetermined number of char **tab. How can I do that?
(Ps: I hope my code and my english are good enough for you to understand. And I'm using only the very basic function, like read, write, open, malloc... and I'm re-building everything else, like printf, get_line, and so on)
You can't allocate an indeterminate amount of memory; malloc doesn't support it. What you can do is to allocate enough memory for now and revise that later:
size_t buffer = 10;                         /* capacity in elements, not bytes */
char **tab = malloc(buffer * sizeof *tab);  /* room for `buffer` pointers */
//...
if (indexOfObjectToCreate >= buffer) {      /* valid indices are 0 .. buffer - 1 */
    /* Grow through a temporary so the old block is not lost if realloc fails. */
    char **grown = realloc(tab, 2 * buffer * sizeof *tab);
    if (grown != NULL) {
        tab = grown;
        buffer *= 2;
    } else {
        /* out of memory: tab is still valid with its old capacity */
    }
}
I'd use an alternative approach (as this is c, not c++) and allocate simply large buffers as we go by:
#ifndef BIG_N
#define BIG_N (64 * 1024) /* default chunk size; define BIG_N earlier to override */
#endif

/*
 * Bump allocator: hands out consecutive slices of a BIG_N-byte chunk and
 * starts a fresh chunk when the current one cannot satisfy the request.
 *
 * Requests larger than BIG_N get their own malloc() block.
 * Returns NULL when the underlying malloc() fails.
 *
 * Nothing is ever freed here, and the returned pointers carry no alignment
 * guarantee beyond that of char.
 */
char *my_malloc(size_t n) {
    static size_t space_left = 0;
    static char *base = NULL;
    char *result;

    if (n > BIG_N)
        return malloc(n);

    if (base == NULL || space_left < n) {
        char *chunk = malloc(BIG_N);
        if (chunk == NULL)
            return NULL;        /* keep the current chunk usable */
        base = chunk;
        space_left = BIG_N;
    }

    result = base;
    base += n;
    space_left -= n;            /* account for the slice just handed out */
    return result;
}
Disclaimer: I've omitted the garbage collection stuff and testing return values and all safety measures to keep the routine short.
Another way to think this is to read the file in to a large enough mallocated array (you can check it with ftell), scan the buffer, replace delimiters, line feeds etc. with ascii zero characters and remember the starting locations of keywords.

How can I check if file is text (ASCII) or binary in C

I need to write C code that checks to see if a file is text(ASCII) or Binary
Could someone help?
Thanks
Typical method is to read the first several hundred bytes and look for ASCII NUL.
If the file contains NUL, it is definitely a binary file. Most binary files do contain NUL bytes, but text files should never contain NUL bytes.
#include <stdbool.h>
#include <string.h>
/*
 * Heuristic binary check: returns true if the first len bytes of data contain
 * a NUL byte, false otherwise. An empty buffer (len == 0) is reported as not
 * binary, and data is not touched in that case, so it may be NULL.
 */
bool is_binary(const void *data, size_t len)
{
    if (len == 0)
        return false;
    return memchr(data, '\0', len) != NULL;
}
Be warned that this is a heuristic. In other words, it will be wrong sometimes.
Read all characters and see if all of them are ASCII, that is, with codes from 0 to 127 inclusive.
Some tools determine whether a file is a text file or a binary file by just checking whether or not it has any byte with code 0.
Clearly, if you apply both of these methods, you will get different results for some files, so, you have to define what it is exactly that you're looking for.
You can use libmagic. The code below will show you roughly the way the "file" command does it. (The code below is quick and dirty -- it probably needs to be cleaned up.)
#include <string.h>
#include <magic.h>
#include <stdio.h>
//------------------------------------------------------------------------------
/*
 * Create a libmagic handle with the given flags and load the database whose
 * path magic_getpath(NULL, 0) reports.
 *
 * Returns the handle, or NULL (after printing a message to stdout) when the
 * handle cannot be created or the database cannot be loaded. The handle is
 * closed before NULL is returned for a load failure.
 */
struct magic_set * prep_magic(int flags)
{
    struct magic_set *cookie = magic_open(flags);
    const char *dbPath = magic_getpath(NULL, 0);
    const char *pending;

    if (cookie == NULL)
    {
        printf("Can't create magic");
        return NULL;
    }

    if (magic_load(cookie, dbPath) == -1)
    {
        printf("%s", magic_error(cookie));
        magic_close(cookie);
        return NULL;
    }

    /* Report an error message left on the handle even though the load succeeded. */
    pending = magic_error(cookie);
    if (pending != NULL)
        printf("%s\n", pending);

    return cookie;
}
//------------------------------------------------------------------------------
/*
 * Print libmagic's description of /etc/motd.
 *
 * Returns 0 on success and 1 when the handle cannot be prepared or the file
 * cannot be classified. The handle is closed before exit.
 */
int main(int argc, char **argv)
{
    int flags = 0;
    int status = 0;
    const char *testfile = "/etc/motd";
    const char *typer;
    struct magic_set *msetptr;

    (void)argc;
    (void)argv;

    msetptr = prep_magic(flags);
    if (msetptr == NULL)
    {
        /* Nothing can be queried without a handle. */
        printf("no mset ptr\n");
        return 1;
    }

    typer = magic_file(msetptr, testfile);
    if (typer != NULL)
    {
        printf("typer = %s\n", typer);
    }
    else
    {
        /* Never hand a NULL pointer to %s. */
        const char *why = magic_error(msetptr);
        printf("magic_file failed: %s\n", why != NULL ? why : "unknown error");
        status = 1;
    }

    /* typer points into the handle, so close only after it has been printed. */
    magic_close(msetptr);
    return status;
}

Resources