realloc reports incorrect checksum - c

I have this C program attempting a text line reading function that should be able to deal with lines of arbitrary length. It works by maintaining a buffer, whose size is doubled whenever there is need for more room.
The actual method is here:
/*******************************************************************************
* Attempts to expand the line buffer. If succeeded, TRUE is returned. *
*******************************************************************************/
static char* try_expand(char* buffer, int* p_buffer_length)
{
*p_buffer_length *= 2;
puts("Before realloc");
char* s = realloc(buffer, *p_buffer_length);
puts("After realloc");
if (s)
{
return s;
}
// Once here, realloc failed.
char* s2 = malloc(*p_buffer_length);
if (!s2)
{
return NULL;
}
strncpy(s2, buffer, *p_buffer_length / 2);
free(buffer);
return s2;
}
I am working on Mac OS X, and whenever the buffer expansion takes place, the program crashes and the system reports:
malloc: *** error for object 0x100105568: incorrect checksum for freed object - object was probably modified after being freed.
Everything else is here:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define HELP_FLAG "-h"
#define VERSION_FLAG "-v"
#define FLAG_DESC "%-5s"
#define INITIAL_BUFFER_SIZE 8
#define FALSE 0
#define TRUE (~FALSE)
/*******************************************************************************
* This routine removes all leading and trailing whitespace from a string, *
* doing that in-place. (Total of two passes.) *
*******************************************************************************/
static char* trim_inplace(char* start)
{
return start;
/*
for (char* end = &start[strlen(start) - 1];
isspace(*end) && end >= start; --end)
{
*end = '\0';
}
while (isspace(*start))
{
++start;
}
return start;*/
}
/*******************************************************************************
* Processes a single line and handles everything needed for dealing with lines *
* of arbitrary length. *
*******************************************************************************/
static int process_line(char** p_buffer, int* p_buffer_length, FILE* file)
{
size_t current_index = 0;
for (;;)
{
char* ret = fgets(*p_buffer + current_index, *p_buffer_length, file);
if (!ret)
{
//puts("!ret is true.");
return FALSE;
}
// Find out whether we have a newline character, which would imply that
// we have an entire line read.
for (size_t i = 0; i < *p_buffer_length; ++i)
{
if ((*p_buffer)[i] == '\n')
{
//(*p_buffer)[i + 1] = '\0';
puts(trim_inplace(*p_buffer));
return TRUE;
}
}
// -1 for skipping the NULL-terminator.
current_index += *p_buffer_length - 1;
char* new_buffer;
// Once here, the current line does not fit in 'p_buffer'. Expand the
// array by doubling its capacity.
if (!(new_buffer = try_expand(*p_buffer, p_buffer_length)))
{
perror("Could not expand the line buffer");
free(*p_buffer);
exit(EXIT_FAILURE);
}
else
{
*p_buffer = new_buffer;
}
}
}
/*******************************************************************************
* Processes a file. *
*******************************************************************************/
static void process_file(char** p_buffer, int* p_buffer_length, FILE* file)
{
while (!feof(file))
{
process_line(p_buffer, p_buffer_length, file);
}
}
/*******************************************************************************
* Prints the help message and exits. *
*******************************************************************************/
static void print_help()
{
printf("Usage: trim [" HELP_FLAG "] [" VERSION_FLAG "] " \
"[FILE1, [FILE2, [...]]]\n" \
" " FLAG_DESC " Print the help message and exit.\n" \
" " FLAG_DESC " Print the version message and exit.\n" \
" If no files specified, reads from standard input.\n",
HELP_FLAG,
VERSION_FLAG);
}
/*******************************************************************************
* Prints the version string. *
*******************************************************************************/
static void print_version()
{
printf("trim 1.618\n" \
"By Rodion \"rodde\" Efremov 08.04.2015 Helsinki\n");
}
/*******************************************************************************
* Prints the erroneous flag. *
*******************************************************************************/
static void print_bad_flag(const char* flag)
{
printf("Unknown flag \"%s\"\n", flag);
}
/*******************************************************************************
* Checks the flags. *
*******************************************************************************/
static void check_flags(int argc, char** argv)
{
for (size_t i = 1; i < argc; ++i)
{
if (strcmp(argv[i], HELP_FLAG) == 0)
{
print_help();
exit(EXIT_SUCCESS);
}
else if (strcmp(argv[i], VERSION_FLAG) == 0)
{
print_version();
exit(EXIT_SUCCESS);
}
else if (argv[i][0] == '-')
{
print_bad_flag(argv[i]);
exit(EXIT_FAILURE);
}
}
}
/*******************************************************************************
* The entry point for a trivial line trimmer. *
*******************************************************************************/
int main(int argc, char** argv)
{
check_flags(argc, argv);
int buffer_length = INITIAL_BUFFER_SIZE;
char* buffer = malloc(buffer_length);
if (argc < 2)
{
// If realloc changes the location of memory, we need to know this.
process_file(&buffer, &buffer_length, stdin);
fclose(stdin);
return EXIT_SUCCESS;
}
for (size_t i = 1; i < argc; ++i)
{
FILE* file = fopen(argv[i], "r");
if (!file)
{
perror("Error opening a file");
return (EXIT_FAILURE);
}
process_file(&buffer, &buffer_length, file);
fclose(file);
}
}
The only observation I have made, is that if the input line requires only one expansion of the line buffer, everything is O.K. However, if the input line is large enough to require at least two expansions, the program crashes. What am I doing wrong here?

When you read further chunks in process_line(), you pass the wrong size:
fgets(*p_buffer + current_index, *p_buffer_length, file);
Should be
fgets(*p_buffer + current_index, *p_buffer_length - current_index, file);

Related

Cant find memory leak in C

I've been sitting for hours checking this code after I found out there's a memory leak/error somewhere
Where is that leak? How can it be fixed?
here is Dr.Memory report:
Dr. Memory version 2.3.0
Running "C:\Users\Beni\source\repos\Magshimim_EX8\Debug\Magshimim_EX8.exe"
Using system call file C:\Users\Beni\AppData\Roaming\Dr. Memory\symcache\syscalls_wow64.txt
Error #1: UNADDRESSABLE ACCESS: reading 1 byte(s)
replace_strlen
d:\drmemory_package\drmemory\replace.c(412):
Magshimim_EX8.exe!?
??:0
Magshimim_EX8.exe!?
??:0
Magshimim_EX8.exe!?
??:0
Magshimim_EX8.exe!?
??:0
Magshimim_EX8.exe!?
??:0
KERNEL32.dll!BaseThreadInitThunk
??:0
ERRORS FOUND:
1 unique, 1 total unaddressable access(es)
0 unique, 0 total uninitialized access(es)
0 unique, 0 total invalid heap argument(s)
0 unique, 0 total GDI usage error(s)
0 unique, 0 total handle leak(s)
0 unique, 0 total warning(s)
0 unique, 0 total, 0 byte(s) of leak(s)
0 unique, 0 total, 0 byte(s) of possible leak(s)
Details: C:\Users\Beni\AppData\Roaming\Dr. Memory\DrMemory-Magshimim_EX8.exe.5208.000\results.txt
WARNING: application exited with abnormal code 0xc0000005
#include <stdio.h>
#include <string.h>
#include <dirent.h>
#include <stdlib.h>
#define FALSE 0
#define TRUE !FALSE
#define FIRST_TWO_FILES 2
#define FIRST_TWENTY_PRECENTS 1
#define MIDDLE_SIXTY_PRECENTS 2
#define LAST_TWENTY_PRECENTS 3
long findLenOfFile(FILE * file);
char* readFile(FILE* f, char* dest, long len);
char menu(char* scanFolder, char* virusSignature);
char** writeFilesFromFolder(char* scanFolder, char ** filesList, int* len);
char* writePart(char* src, char* dest, int length, int* newLen, int part);
int findSignature(char* virusSignature, char* buffer, int sigLen, int bufferLen);
void scanFiles(char* scanFolder, char** filesList, int amountOfFiles, char* virusSignature, long virusLength, char option);
int main(int argc, char* argv[])
{
char* log = malloc(sizeof(char)*strlen(argv[1]) + sizeof(char)*strlen("\\Log.txt") + 4);
FILE* virusSignatureFile = fopen(argv[2], "rb");
long virusLength = 0;
char** filesList = (char**)malloc(sizeof(char) * 0);
char* virusSignature = 0;
int amountOfFiles = 0;
char option = 0;
int i = 0;
virusLength = findLenOfFile(virusSignatureFile);
// get the virusSignature as a string and write the files to check into the filesList
virusSignature = readFile(virusSignatureFile, virusSignature, virusLength);
filesList = writeFilesFromFolder(argv[1], filesList, &amountOfFiles);
// create log file
strcpy(log, "");
strcat(log, argv[1]);
strcat(log, "\\Log.txt");
FILE * logFile = fopen(log, "w");
fprintf(logFile, "Anti-virus began! Welcome!\n\nFolder to scan:\n%s\nVirus signature:\n%s\n\nScanning option:\n", argv[1], argv[2]);
// get scanning option (normal or quick) and continue accordingly
option = menu(argv[1], argv[2]);
if (option == '0') {
fprintf(logFile, "Normal Scan\n\n");
}
else {
fprintf(logFile, "Quick Scan\n\n");
}
fprintf(logFile, "Results:\n");
fclose(logFile);
// initiate scan
scanFiles(argv[1], filesList, amountOfFiles, virusSignature, virusLength, option);
fclose(virusSignatureFile);
free(log);
free(filesList);
free(virusSignature);
getchar();
return 0;
}
/*
This function will print the scanning folder path and signature path, also will print the option menu to the user of quick or normal scan,
after that function will return user option(0, or other key)
input: scanFolder path (string), virus signature path (also string)
output: user option (char: '0', or other key)
*/
char menu(char * scanFolder, char * virusSignature)
{
char userOption = '\0';
printf("Welcome to my Virus Scan!\n\nFolder to scan: %s\nVirus signature: %s\n\nPress 0 for a norman scan or any other key for a quick scan: ", scanFolder, virusSignature);
userOption = getchar();
printf("Scanning began...\nThis process may take several minutes...\n\n");
return userOption;
}
/*
This function writes all files name from folder to the filesList
input: the scanning folder path
output: amount of files
*/
char** writeFilesFromFolder(char * scanFolder, char ** filesList, int* len)
{
DIR *d = 0;
struct dirent *dir;
d = opendir(scanFolder);
int i = 0;
if (d)
{
while ((dir = readdir(d)) != NULL)
{
if (i > 1)
{
filesList = (char**)realloc(filesList, sizeof(filesList) + sizeof(char*) + 4);
*(filesList + (i - FIRST_TWO_FILES)) = (char*)malloc(sizeof(char) * strlen(dir->d_name) + 1);
strcpy(*(filesList + (i - FIRST_TWO_FILES)), (dir->d_name));
}
i++;
}
closedir(d);
}
*len = i - FIRST_TWO_FILES; //first two names is "." and ".."
return filesList;
}
/*
This function will read the contents of a file into a string
input: a file (FILE *) to read from
output: char* with the contents of the file
*/
char* readFile(FILE* f, char * dest, long len)
{
dest = (char*)malloc(sizeof(char) * len);
fread(dest, 1, len, f);
return dest;
}
void scanFiles(char * scanFolder, char ** filesList, int amountOfFiles, char * virusSignature, long virusLength, char option)
{
char* log = malloc(sizeof(char)*strlen(scanFolder) + sizeof(char)*strlen("\\Log.txt") + 1);
char * buffer = (char*)malloc(sizeof(char) * 0);
char* subBuffer = 0;
char* slash = "\\";
long length = 0;
char* name = 0;
int subLen = 0;
int i = 0;
FILE * f;
// reopen log file and append to it
strcpy(log, "");
strcat(log, scanFolder);
strcat(log, "\\Log.txt");
FILE * logFile = fopen(log, "a");
// iterate over each file
for (i = 0; i < amountOfFiles; i++)
{
name = (char*)malloc(sizeof(char) * strlen(scanFolder) + 1 + sizeof(char) * strlen(slash) + sizeof(char) * strlen(*(filesList + i)) + 20);
// open current file
strcpy(name, "");
strcat(name, scanFolder);
strcat(name, slash);
strcat(name, *(filesList + i));
f = fopen(name, "rb");
length = findLenOfFile(f);
if (f != NULL) // if file can be accessed
{
buffer = readFile(f, buffer, length);
if (option == '0') { // Normal Mode
if (findSignature(virusSignature, buffer, virusLength, length))
{
printf("%s - Infected!\n", name);
fprintf(logFile, "%s - Infected!\n", name);
}
else
{
printf("%s - Clean\n", name);
fprintf(logFile, "%s - Clean\n", name);
}
}
else { // Quick Mode
subBuffer = writePart(buffer, subBuffer, length, &subLen, FIRST_TWENTY_PRECENTS); // get first 20%
if (findSignature(virusSignature, subBuffer, virusLength, subLen))
{
printf("%s - infected! (first 20%%)\n", name);
fprintf(logFile, "%s - infected! (first 20%%)\n", name);
}
else {
free(subBuffer);
subBuffer = writePart(buffer, subBuffer, length, &subLen, LAST_TWENTY_PRECENTS); // get last 20%
if (findSignature(virusSignature, subBuffer, virusLength, subLen))
{
printf("%s - Infected! (last 20%%)\n", name);
fprintf(logFile, "%s - Infected! (last 20%%)\n", name);
}
else {
subBuffer = writePart(buffer, subBuffer, length, &subLen, MIDDLE_SIXTY_PRECENTS); // get the 60% left in the middle
if (findSignature(virusSignature, subBuffer, virusLength, subLen))
{
printf("%s - Infected!\n", name);
fprintf(logFile, "%s - Infected!\n", name);
}
else {
printf("%s - clean\n", name);
fprintf(logFile, "%s - Clean\n", name);
}
}
}
free(subBuffer);
}
fclose(f);
}
else
{
printf("No file found\n");
}
free(*(filesList + i));
free(name);
}
fclose(logFile);
free(log);
free(buffer);
getchar();
}
/*
This function will write part of the file (beginning, middle or end) to a string
input: source (string) to take the information from, destination (string) to write a part of the source to it,
length (int) of the source string, a pointer (int*) to store the new length of the destination string and
part of the file to write from (int) 1,2 or 3: first 20%, 60% in the middle and last 20% accordingly
output: string containing the desired part of the source string
*/
char* writePart(char *src, char *dest, int length, int *newLen, int part) {
int i = 0;
int percentedLength = 0;
int count = 0;
percentedLength = (int)(length / 5); // this len is 20% of the entire file's length
if (part == FIRST_TWENTY_PRECENTS) // return beginning
{
dest = (char*)malloc(sizeof(char) * percentedLength);
*newLen = percentedLength;
for (i = 0; i < percentedLength; i++)
{
*(dest + i) = *(src + i);
}
}
else if (part == MIDDLE_SIXTY_PRECENTS) // return middle
{
// allocate space for the middle: The entire file size minus 20% from the start and 20% from the end
dest = (char*)malloc(sizeof(char) * (length - 2 * percentedLength));
*newLen = length - 2 * percentedLength;
for (i = percentedLength; i < length - percentedLength; i++) {
*(dest + count) = *(src + i);
count++;
}
}
else if (part == LAST_TWENTY_PRECENTS) // return end
{
dest = (char*)malloc(sizeof(char) * percentedLength);
*newLen = percentedLength;
for (i = length - percentedLength; i < length; i++)
{
*(dest + count) = *(src + i);
count++;
}
}
return dest;
}
/*
function that finds the length of a file
input: file (FILE *)
output: the file's length (long)
*/
long findLenOfFile(FILE * file)
{
long length = 0;
fseek(file, 0, SEEK_END);
length = ftell(file);
fseek(file, 0, SEEK_SET);
return length;
}
/*
function checks whether a file contains the virusSignature. It iterates over each letter of the file and checks
if it is the same as the first letter in the virusSignature. If it is, it checks the rest of the characters and
returns True if a match is found. if not it continues the same process until the end of the file is reached.
input: The virusSignature (string), a buffer with the content of a file (string), the signature's length (int)
and the buffer's length (int)
output: True if signature is in file, False otherwise
*/
int findSignature(char* virusSignature, char* buffer, int sigLen, int bufferLen)
{
int found = 0;
int i = 0;
int j = 0;
for (i = 0; i < bufferLen - (sigLen - 1); i++) {
if (*(buffer + i) == *virusSignature) // check if a letter is the same as first letter in virusSignature
{
found = TRUE;
// check if the rest of the letters match the signature and stop if one doesn't
for (j = 1; (j < sigLen) && found; j++) {
if (*(buffer + (i + j)) != *(virusSignature + j)) {
found = FALSE;
}
}
if (found) {
return TRUE; // if we got a match, return true!
}
}
}
return FALSE;
}
The realloc call in writeFilesFromFolder has a bug.
It is:
filesList = (char **) realloc(filesList, sizeof(filesList) + sizeof(char *) + 4);
Notice that the space allocated for filesList is constant. It does not grow as new elements are added, so you have undefined behavior.
This is not a memory leak as the tool detected. A memory leak means that you fail to free a pointer that goes out of scope.
Rather, you're storing data beyond the end of the area you've allocated, trashing whatever is there, which is probably the [hidden] chain pointer area that malloc et. al. use to keep track of allocations.
I'm not sure how either of the sizeof factor in, but, filesList is a pointer, so sizeof(filesList) is constant [either 4 on a 32 bit machine or 8 for 64 bit].
The allocated space has to increase in proportion to i.
Here is a refactored version of that function that fixes the bug along with some simplification and cleanup:
BTW, don't cast malloc: Do I cast the result of malloc?
Also, note that sizeof(char) is [by definition] always 1 regardless of how many bits a char actually has for a given architecture. So, remove any sizeof(char) *
Instead of (e.g.):
*(filesList + i)
It's usually simpler/cleaner to do:
filesList[i]
Anyway, here's the code:
/*
This function writes all files name from folder to the filesList
input: the scanning folder path
output: amount of files
*/
char **
writeFilesFromFolder(char *scanFolder, char **filesList, int *len)
{
DIR *d = 0;
struct dirent *dir;
d = opendir(scanFolder);
int i = -FIRST_TWO_FILES;
if (d) {
while ((dir = readdir(d)) != NULL) {
if (i >= 0) {
filesList = realloc(filesList,sizeof(*filesList) * (i + 1));
filesList[i] = strdup(dir->d_name);
}
i++;
}
closedir(d);
}
*len = i; // first two names is "." and ".."
return filesList;
}
UPDATE:
Okay, you do have memory leaks. And, I've coded up detection and a fix.
Primarily, what you are doing is passing down a buffer pointer to a function (e.g. readFile or writePart) as an argument (e.g. dest).
Then, you are doing:
dest = malloc(percentedLength);
This leaks the previous value of dest.
Normally, functions that allocate a buffer and return it do not take it as an argument. But, after analyzing your code, replacing the malloc with a realloc prevents the leak.
I had to do a complete code review to find this. And, I did several simplifications and cleanups along the way to try to understand your code and isolate possible further issues.
I replaced your allocation/concatenation of filenames with a new function filejoin.
I replaced the other malloc calls with a macro: ALLOCME that detects the memory leaks before they happen. This works in conjunction with the [new] FREEME macro that replaces the free calls.
The default mode is to detect the leak and abort. If you give the program a -f option, it will fix the problem. After you analyze and understand what happened, you can change the default to the "fix" mode.
Where possible, I left your original code under #if 0
A few more style tips:
Keep lines to <= 80 chars.
Don't use "sidebar" comments, particularly on if clauses (e.g.):
if (...) { // process the file
Replace with:
// process the file
if (...) {
Don't replicate code. When you are replicating similar code [as in where I replaced the code with filejoin], this indicates a good place to write a modular function
Anyway, here's the refactored and fixed code:
#include <stdio.h>
#include <string.h>
#include <dirent.h>
#include <stdlib.h>
#include <errno.h>
#include <stdarg.h>
#include <assert.h>
#define FALSE 0
#define TRUE (! FALSE)
#define FIRST_TWO_FILES 2
#define FIRST_TWENTY_PRECENTS 1
#define MIDDLE_SIXTY_PRECENTS 2
#define LAST_TWENTY_PRECENTS 3
#define ALLOCME(_ptr,_len) \
do { \
_ptr = allocme(_ptr,_len,__FUNCTION__,__LINE__); \
} while (0)
#define FREEME(_ptr) \
do { \
if (_ptr != NULL) \
free(_ptr); \
_ptr = NULL; \
} while (0)
void
sysfault(const char *fmt,...)
{
va_list ap;
va_start(ap,fmt);
vfprintf(stderr,fmt,ap);
va_end(ap);
exit(1);
}
long findLenOfFile(FILE * file);
char *readFile(FILE * f, char *dest, long len);
char menu(const char *scanFolder, const char *virusSignature);
char **writeFilesFromFolder(const char *scanFolder, char **filesList, int *len);
char *writePart(char *src, char *dest, int length, int *newLen, int part);
int findSignature(char *virusSignature, char *buffer, int sigLen,
int bufferLen);
void scanFiles(const char *scanFolder, char **filesList, int amountOfFiles,
char *virusSignature, long virusLength, char option);
int opt_fixme = 0;
// allocme -- guarded allocation
void *
allocme(void *ptr,size_t len,const char *fnc,int lno)
{
if (! opt_fixme) {
if (ptr != NULL)
sysfault("allocme: leaking ptr=%p len=%zu (from %s at line %d)\n",
ptr,len,fnc,lno);
}
ptr = realloc(ptr,len);
if (ptr == NULL)
sysfault("allocme: realloc failure\n");
return ptr;
}
#ifdef __linux__
const char *slash = "/";
#else
const char *slash = "\\";
#endif
// filejoin -- create filename from directory and file tail
char *
filejoin(const char *dir,const char *tail)
{
size_t len;
char *file;
len = 0;
len += strlen(dir);
len += strlen(slash);
len += strlen(tail);
len += 1;
file = malloc(len);
if (file == NULL)
sysfault("filejoin: unable to alloc -- %s\n",strerror(errno));
*file = 0;
strcat(file,dir);
strcat(file,slash);
strcat(file,tail);
return file;
}
int
main(int argc, char **argv)
{
#if 0
char *log = malloc(strlen(argv[1]) + strlen("\\Log.txt") + 4);
#else
char *log;
#endif
--argc;
++argv;
for (; argc > 0; --argc, ++argv) {
char *cp = *argv;
if (*cp != '-')
break;
switch (cp[1]) {
case 'f':
opt_fixme = ! opt_fixme;
break;
}
}
if (argc != 2)
sysfault("usage: <folder_to_scan> <virus_signature_file>\n");
const char *topdir = argv[0];
const char *sigfile = argv[1];
FILE *virusSignatureFile = fopen(sigfile, "rb");
if (virusSignatureFile == NULL)
sysfault("main: unable to open '%s' -- %s\n",sigfile,strerror(errno));
long virusLength = 0;
#if 0
char **filesList = malloc(0);
#else
char **filesList = NULL;
#endif
char *virusSignature = NULL;
int amountOfFiles = 0;
char option = 0;
virusLength = findLenOfFile(virusSignatureFile);
// get the virusSignature as a string and write the files to check into the
// filesList
virusSignature = readFile(virusSignatureFile, virusSignature, virusLength);
#if 1
fclose(virusSignatureFile);
#endif
filesList = writeFilesFromFolder(topdir, filesList, &amountOfFiles);
// create log file
#if 0
strcpy(log, "");
strcat(log, argv[1]);
strcat(log, "\\Log.txt");
#else
log = filejoin(topdir,"Log.txt");
#endif
FILE *logFile = fopen(log, "w");
fprintf(logFile, "Anti-virus began! Welcome!\n\n"
"Folder to scan:\n%s\n"
"Virus signature:\n%s\n\n"
"Scanning option:\n", topdir, sigfile);
// get scanning option (normal or quick) and continue accordingly
option = menu(topdir, sigfile);
if (option == '0') {
fprintf(logFile, "Normal Scan\n\n");
}
else {
fprintf(logFile, "Quick Scan\n\n");
}
fprintf(logFile, "Results:\n");
fclose(logFile);
// initiate scan
scanFiles(topdir, filesList, amountOfFiles, virusSignature, virusLength,
option);
#if 0
fclose(virusSignatureFile);
#endif
FREEME(log);
FREEME(filesList);
FREEME(virusSignature);
#ifndef __linux__
getchar();
#endif
return 0;
}
/*
This function will print the scanning folder path and signature path, also
will print the option menu to the user of quick or normal scan,
after that function will return user option(0, or other key)
input: scanFolder path (string), virus signature path (also string)
output: user option (char: '0', or other key)
*/
char
menu(const char *scanFolder, const char *virusSignature)
{
char userOption = '\0';
printf("Welcome to my Virus Scan!\n\n"
"Folder to scan: %s\n"
"Virus signature: %s\n\n"
"Press 0 for a norman scan or any other key for a quick scan: ",
scanFolder, virusSignature);
userOption = getchar();
printf("Scanning began...\nThis process may take several minutes...\n\n");
return userOption;
}
/*
This function writes all files name from folder to the filesList
input: the scanning folder path
output: amount of files
*/
char **
writeFilesFromFolder(const char *scanFolder, char **filesList, int *len)
{
DIR *d = 0;
struct dirent *dir;
d = opendir(scanFolder);
int i = -FIRST_TWO_FILES;
if (d) {
while ((dir = readdir(d)) != NULL) {
if (i >= 0) {
filesList = realloc(filesList,sizeof(*filesList) * (i + 1));
filesList[i] = strdup(dir->d_name);
}
i++;
}
closedir(d);
}
// first two names is "." and ".."
*len = i;
return filesList;
}
/*
This function will read the contents of a file into a string
input: a file (FILE *) to read from
output: char* with the contents of the file
*/
char *
readFile(FILE * f, char *dest, long len)
{
// NOTE/BUG: this does _not_ free the prior value -- memory leak!
ALLOCME(dest,len);
fread(dest, 1, len, f);
return dest;
}
void
scanFiles(const char *scanFolder, char **filesList, int amountOfFiles,
char *virusSignature, long virusLength, char option)
{
#if 0
char *log = malloc(strlen(scanFolder) + strlen("\\Log.txt") + 1);
#else
char *log;
#endif
#if 0
char *buffer = malloc(0);
#else
char *buffer = NULL;
#endif
char *subBuffer = NULL;
long length = 0;
char *name = NULL;
int subLen = 0;
int i = 0;
FILE *f;
// reopen log file and append to it
#if 0
strcpy(log, "");
strcat(log, scanFolder);
strcat(log, "\\Log.txt");
#else
log = filejoin(scanFolder,"Log.txt");
#endif
FILE *logFile = fopen(log, "a");
// iterate over each file
for (i = 0; i < amountOfFiles; i++) {
#if 0
name = malloc(strlen(scanFolder) + 1 + strlen(slash) + strlen(*(filesList + i)) + 20);
#endif
// open current file
#if 0
strcpy(name, "");
strcat(name, scanFolder);
strcat(name, slash);
strcat(name, *(filesList + i));
#else
name = filejoin(scanFolder,filesList[i]);
#endif
f = fopen(name, "rb");
length = findLenOfFile(f);
// if file can be accessed
if (f != NULL) {
buffer = readFile(f, buffer, length);
// Normal Mode
if (option == '0') {
if (findSignature(virusSignature, buffer, virusLength, length)) {
printf("%s - Infected!\n", name);
fprintf(logFile, "%s - Infected!\n", name);
}
else {
printf("%s - Clean\n", name);
fprintf(logFile, "%s - Clean\n", name);
}
}
// Quick Mode
else {
// get first 20%
subBuffer = writePart(buffer, subBuffer, length, &subLen,
FIRST_TWENTY_PRECENTS);
if (findSignature(virusSignature, subBuffer, virusLength,
subLen)) {
printf("%s - infected! (first 20%%)\n", name);
fprintf(logFile, "%s - infected! (first 20%%)\n", name);
}
else {
FREEME(subBuffer);
// get last 20%
subBuffer = writePart(buffer, subBuffer, length, &subLen,
LAST_TWENTY_PRECENTS);
if (findSignature(virusSignature, subBuffer, virusLength,
subLen)) {
printf("%s - Infected! (last 20%%)\n", name);
fprintf(logFile, "%s - Infected! (last 20%%)\n", name);
}
else {
// get the 60% left in the middle
subBuffer = writePart(buffer, subBuffer, length,
&subLen, MIDDLE_SIXTY_PRECENTS);
if (findSignature(virusSignature, subBuffer,
virusLength, subLen)) {
printf("%s - Infected!\n", name);
fprintf(logFile, "%s - Infected!\n", name);
}
else {
printf("%s - clean\n", name);
fprintf(logFile, "%s - Clean\n", name);
}
}
}
FREEME(subBuffer);
}
fclose(f);
}
else {
printf("No file found\n");
}
FREEME(filesList[i]);
FREEME(name);
}
fclose(logFile);
FREEME(log);
FREEME(buffer);
getchar();
}
/*
This function will write part of the file (beginning, middle or end) to a string
input: source (string) to take the information from, destination (string) to
write a part of the source to it, length (int) of the source string, a pointer
(int*) to store the new length of the destination string and part of the file
to write from (int) 1,2 or 3: first 20%, 60% in the middle and last 20%
accordingly
output: string containing the desired part of the source string
*/
char *
writePart(char *src, char *dest, int length, int *newLen, int part)
{
int i = 0;
int percentedLength = 0;
int count = 0;
// this len is 20% of the entire file's length
percentedLength = (int) (length / 5);
// NOTE/BUG: this does _not_ free the prior value -- memory leak!
switch (part) {
case FIRST_TWENTY_PRECENTS: // return beginning
ALLOCME(dest,percentedLength);
*newLen = percentedLength;
for (i = 0; i < percentedLength; i++) {
*(dest + i) = *(src + i);
}
break;
case MIDDLE_SIXTY_PRECENTS: // return middle
// allocate space for the middle: The entire file size minus 20% from
// the start and 20% from the end
ALLOCME(dest,length - 2 * percentedLength);
*newLen = length - 2 * percentedLength;
for (i = percentedLength; i < length - percentedLength; i++) {
*(dest + count) = *(src + i);
count++;
}
break;
case LAST_TWENTY_PRECENTS: // return end
ALLOCME(dest,percentedLength);
*newLen = percentedLength;
for (i = length - percentedLength; i < length; i++) {
*(dest + count) = *(src + i);
count++;
}
break;
}
return dest;
}
/*
function that finds the length of a file
input: file (FILE *)
output: the file's length (long)
*/
long
findLenOfFile(FILE * file)
{
long length = 0;
fseek(file, 0, SEEK_END);
length = ftell(file);
fseek(file, 0, SEEK_SET);
return length;
}
/*
function checks whether a file contains the virusSignature.
It iterates over each letter of the file and checks if it is the same as the
first letter in the virusSignature.
If it is, it checks the rest of the characters and returns True if a match is
found.
if not it continues the same process until the end of the file is reached.
input: The virusSignature (string), a buffer with the content of a file
(string), the signature's length (int)
and the buffer's length (int)
output: True if signature is in file, False otherwise
*/
int
findSignature(char *virusSignature, char *buffer, int sigLen, int bufferLen)
{
int found = 0;
int i = 0;
int j = 0;
for (i = 0; i < bufferLen - (sigLen - 1); i++) {
// check if a letter is the same as first letter in virusSignature
if (*(buffer + i) == *virusSignature)
{
found = TRUE;
// check if the rest of the letters match the signature and stop
// if one doesn't
for (j = 1; (j < sigLen) && found; j++) {
if (*(buffer + (i + j)) != *(virusSignature + j)) {
found = FALSE;
}
}
// if we got a match, return true!
if (found) {
return TRUE;
}
}
}
return FALSE;
}

Strange behavior when manipulating char* array in C

I'm trying to create a command line shell for an Operating Systems class. One of our assignments is to create a builtin "history" command that prints out the last 10 commands executed in the shell. Here is the code I have written for the "history" command:
char* cmd_hsitory[10]; // This is a global variable
int add_history(char **args) {
cmd_history[9] = NULL;
for(int i = 8; i >= 0; i--) {
cmd_history[i+1] = cmd_history[i];
}
cmd_history[0] = *args;
return 1;
}
Where the char **args argument is the last command exectued. Here is the function that prints the history:
int lsh_history(char **args) {
printf("Last 10 commands: \n");
for(int i = 0; i < 10; i++) {
printf("%s\n", cmd_history[i]);
}
return 1;
}
Some strange behavior is happening with this code. For instance, when I run the commands [cd, cd, ls, history] in succession, this is the printed output:
Last 10 commands:
ls
ls
cd
(null)
(null)
(null)
(null)
(null)
(null)
(null)
The first problem here is that I ran the cd command twice, and the ls command only once. If I run the "history" command again I get:
Last 10 commands:
history
ls
ls
cd
(null)
(null)
(null)
(null)
(null)
(null)
This seems correct with the exception of the 2 ls commands vs the 1 cd command.
This isn't very consistent, though, as sometimes I'll get mixed-up commands and the "history" command will show up several times.
If someone told me what's glaringly wrong with my code, that would be of great help. Thanks!
EDIT: Here is the full source code:
P.s. Most of this code is pulled from the internet (Stephen Brennan) and I'm building on top of it to learn. I will not submit this code as my assignment.
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
/*
Function Declaration for history queue
*/
int add_history(char **args);
/*
Function Declarations for builtin shell commands:
*/
int lsh_cd(char **args);
int lsh_help(char **args);
int lsh_exit(char **args);
int lsh_history(char **args);
/*
List of builtin commands, followed by their corresponding functions.
*/
char *builtin_str[] = {
"cd",
"help",
"exit",
"history"
};
int (*builtin_func[]) (char **) = {
&lsh_cd,
&lsh_help,
&lsh_exit,
&lsh_history
};
char *cmd_history[10];
int lsh_num_builtins() {
return sizeof(builtin_str) / sizeof(char *);
}
int add_history(char **args) {
cmd_history[9] = NULL;
for(int i = 8; i >= 0; --i) {
cmd_history[i+1] = cmd_history[i];
}
cmd_history[0] = NULL;
cmd_history[0] = *args;
return 1;
}
/*
Builtin function implementations.
*/
/**
#brief Builtin command: command history.
#param args List of args. args[0] is "history".
#return Always returns 1 to continue executing.
*/
int lsh_history(char **args) {
printf("Last 10 commands: \n");
for(int i = 0; i < 10; i++) {
printf("%s\n", cmd_history[i]);
}
return 1;
}
/**
#brief Bultin command: change directory.
#param args List of args. args[0] is "cd". args[1] is the directory.
#return Always returns 1, to continue executing.
*/
int lsh_cd(char **args)
{
if (args[1] == NULL) {
chdir("/Users/Landon/");
} else {
if (chdir(args[1]) != 0) {
perror("lsh");
}
}
return 1;
}
/**
#brief Builtin command: print help.
#param args List of args. Not examined.
#return Always returns 1, to continue executing.
*/
int lsh_help(char **args)
{
int i;
printf("Stephen Brennan's LSH\n");
printf("Type program names and arguments, and hit enter.\n");
printf("The following are built in:\n");
for (i = 0; i < lsh_num_builtins(); i++) {
printf(" %s\n", builtin_str[i]);
}
printf("Use the man command for information on other programs.\n");
return 1;
}
/**
#brief Builtin command: exit.
#param args List of args. Not examined.
#return Always returns 0, to terminate execution.
*/
int lsh_exit(char **args)
{
return 0;
}
/**
#brief Launch a program and wait for it to terminate.
#param args Null terminated list of arguments (including program).
#return Always returns 1, to continue execution.
*/
int lsh_launch(char **args)
{
pid_t pid;
int status;
pid = fork();
if (pid == 0) {
// Child process
if (execvp(args[0], args) == -1) {
perror("lsh");
}
exit(EXIT_FAILURE);
} else if (pid < 0) {
// Error forking
perror("lsh");
} else {
// Parent process
do {
waitpid(pid, &status, WUNTRACED);
} while (!WIFEXITED(status) && !WIFSIGNALED(status));
}
return 1;
}
/**
#brief Execute shell built-in or launch program.
#param args Null terminated list of arguments.
#return 1 if the shell should continue running, 0 if it should terminate
*/
int lsh_execute(char **args)
{
int i;
if (args[0] == NULL) {
// An empty command was entered.
return 1;
}
for (i = 0; i < lsh_num_builtins(); i++) {
if (strcmp(args[0], builtin_str[i]) == 0) {
return (*builtin_func[i])(args);
}
}
return lsh_launch(args);
}
#define LSH_RL_BUFSIZE 1024
/**
#brief Read a line of input from stdin.
#return The line from stdin.
*/
char *lsh_read_line(void)
{
int bufsize = LSH_RL_BUFSIZE;
int position = 0;
char *buffer = malloc(sizeof(char) * bufsize);
int c;
if (!buffer) {
fprintf(stderr, "lsh: allocation error\n");
exit(EXIT_FAILURE);
}
while (1) {
// Read a character
c = getchar();
if (c == EOF) {
exit(EXIT_SUCCESS);
} else if (c == '\n') {
buffer[position] = '\0';
return buffer;
} else {
buffer[position] = c;
}
position++;
// If we have exceeded the buffer, reallocate.
if (position >= bufsize) {
bufsize += LSH_RL_BUFSIZE;
buffer = realloc(buffer, bufsize);
if (!buffer) {
fprintf(stderr, "lsh: allocation error\n");
exit(EXIT_FAILURE);
}
}
}
}
#define LSH_TOK_BUFSIZE 64
#define LSH_TOK_DELIM " \t\r\n\a"
/**
#brief Split a line into tokens (very naively).
#param line The line.
#return Null-terminated array of tokens.
*/
char **lsh_split_line(char *line)
{
int bufsize = LSH_TOK_BUFSIZE, position = 0;
char **tokens = malloc(bufsize * sizeof(char*));
char *token, **tokens_backup;
if (!tokens) {
fprintf(stderr, "lsh: allocation error\n");
exit(EXIT_FAILURE);
}
token = strtok(line, LSH_TOK_DELIM);
while (token != NULL) {
tokens[position] = token;
position++;
if (position >= bufsize) {
bufsize += LSH_TOK_BUFSIZE;
tokens_backup = tokens;
tokens = realloc(tokens, bufsize * sizeof(char*));
if (!tokens) {
free(tokens_backup);
fprintf(stderr, "lsh: allocation error\n");
exit(EXIT_FAILURE);
}
}
token = strtok(NULL, LSH_TOK_DELIM);
}
tokens[position] = NULL;
return tokens;
}
/**
#brief Loop getting input and executing it.
*/
void lsh_loop(void)
{
char *line;
char **args;
int status;
do {
printf("> ");
line = lsh_read_line();
args = lsh_split_line(line);
status = lsh_execute(args);
add_history(args);
free(line);
free(args);
} while (status);
}
/**
#brief Main entry point.
#param argc Argument count.
#param argv Argument vector.
#return status code
*/
int main(int argc, char **argv)
{
// Load config files, if any.
// Run command loop.
lsh_loop();
// Perform any shutdown/cleanup.
return EXIT_SUCCESS;
}
You need to strdup() the arg strings over to your cmd_history pointers, not just have your cmd_history pointers reference the original command string. When you free(line), you're putting the memory that your cmd_history pointers are referencing back into the free pool. In the next iteration of the loop you may or may not overwrite that data.
You wrote ((Comments Added)):
void lsh_loop(void)
{
...
line = lsh_read_line(); //mallocs line
args = lsh_split_line(line); //mallocs args array; consisting of references into line[]
status = lsh_execute(args); //no promblem here
add_history(args); // ***PERMANETNLY*** stores args[0] into cmd_history[]!!!
free(line); //frees line[]
free(args); //frees the pointer array into freed line[], which was copied to cmd_history[] --> cmd_history[] points to unallocated memory!!!
}
The main problem is related to strtok(): It returns a pointer into the spitted string, within the original string: If you later free the token pointer array or the original string and use it later --> UB.
Additionally lsh_history() kills also last entry which would leave a memory leak behind, if you would allocated that entry firmly.
Solution:
Establish for cmd_history[] your own storage of the strings, duplicate and free them properly in lsh_history().

qsort fails to sort large array of strings

I'm using qsort to sort an array of i strings of size 256, such as char *arr = malloc(i * 256) -- was actully done with reallocs inside a loop. Each string contains, among text, a number, which I use as the comparison element:
int
cmp(const void *a, const void *b)
{
double atime = get_time((char*)a);
double btime = get_time((char*)b);
return (atime > btime) - (atime < btime);
}
When i is small, it works. With a large i, it fails to sort the array correctly. get_time is working. I was using it with a custom heapsort implementation before, which worked flawlessly.
I added the following to cmp to check what was happening:
fprintf(stderr, "Comparing %f to %f, result: %d.\n", atime, btime, (atime > btime) - (atime < btime));
It seems that all comparisons are correct, but not all comparisons are being made. arr has several strings containing 1.something, however I couldn't find any comparison between numbers greater than 1 in the output. The call to qsort is as follows:
qsort((void*)arr, i-1, MAX_ROW_LEN, cmp);
It's the same parameters I used to pass to my heapsort function, but it doesn't work.
Complete code, and example file (fails to sort).
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#define MAX_ROW_LEN 256
#define MAX_FILENAME_LEN 256
/* Return the start time of the event or -1 if no time. */
static double
get_time(const char *event)
{
if (!event || event[0] == '%')
return -1;
size_t tok = strcspn(event, " ") + 2;
double ans = strtod(event + tok, NULL);
if (!ans)
return -1;
return ans;
}
/*static inline*/ int
cmp(const void *a, const void *b)
{
double atime = get_time((char*)a);
double btime = get_time((char*)b);
return (atime > btime) - (atime < btime);
}
int
main(int argc, char **argv)
{
/* process parameters */
if (argc < 2) {
fprintf(stderr, "Supply a file to sort.\n");
exit(EXIT_FAILURE);
}
if (strlen(argv[1]) > MAX_FILENAME_LEN) {
fprintf(stderr, "Filename too long.\n");
exit(EXIT_FAILURE);
}
/* read the file */
printf("Now processing %s.\n", argv[1]);
FILE *f = fopen(argv[1], "r");
if (!f) {
fprintf(stderr, "Failed to open out. Errno %d.\n", errno);
exit(EXIT_FAILURE);
}
char *trace = malloc(MAX_ROW_LEN);
char *header = malloc(MAX_ROW_LEN);
size_t i = 1, j = 1;
while (fgets(trace + (i-1)*MAX_ROW_LEN, MAX_ROW_LEN, f)) {
/* (if we can't get the time, it's part of the header) */
if (get_time(trace + (i-1)*MAX_ROW_LEN) != -1) {
trace = realloc((void*)trace, (++i)*MAX_ROW_LEN);
} else {
strncpy(header + (j-1)*MAX_ROW_LEN, trace + (i-1)*MAX_ROW_LEN,
MAX_ROW_LEN);
header = realloc((void*)header, (++j)*MAX_ROW_LEN);
}
}
if (!feof(f)) {
fprintf(stderr, "Error reading file. Errno %d.\n", ferror(f));
exit(EXIT_FAILURE);
}
printf("Read %zu lines.\n", i);
fclose(f);
/* write the header */
f = fopen("out_fixed", "w");
if (!f) {
fprintf(stderr, "Failed to open out_fixed. Errno %d.\n", errno);
exit(EXIT_FAILURE);
}
for (size_t k = 0; k < j-1; ++k) {
/* (there is '%' in comments, can't print formatted) */
fputs((void*)(header + k*MAX_ROW_LEN), f);
}
/* sort */
printf("Started sorting.\n");
time_t start = time(NULL);
qsort((void*)trace, i-1, MAX_ROW_LEN, cmp);
printf("Ended sorting, took %fs.\n", difftime(time(NULL), start));
/* write the sorted trace */
printf("Started writting to disk.\n");
start = time(NULL);
for (size_t k = 0; k < i-1; ++k) {
fprintf(f, "%s", trace + k*MAX_ROW_LEN);
}
printf("Took %fs.\n", difftime(time(NULL), start));
/* flush */
printf("Closing file (fflush)\n");
start = time(NULL);
if (fclose(f)) {
fprintf(stderr, "Failed to close out_fixed. Errno %d.\n", errno);
exit(EXIT_FAILURE);
}
printf("Took %fs.\n", difftime(time(NULL), start));
exit(EXIT_SUCCESS);
}
I've tested your code and your example input file and it seems to work fine. In your question you say:
... has several strings containing 1.something, however I couldn't find
any comparison between numbers greater than 1 in the output.
But there are no such lines in your example input file.
Given this example line of your input:
12 0.475183170 rank3 STATE fill_row
This line in get_time is going to skip over any leading digits in your double:
size_t tok = strcspn(event, " ") + 2;
strcspn returns the number of characters that it had to read before finding the "needle" so in this case it will return 2. You then add 2 to that and then use that as a pointer offset into your event string, meaning that you are passing a pointer to .475183170 instead of 0.475183170.
You'd be better off just using strchr here anyway:
char *tok = strchr(event, ' ');
if (!tok) {
return -1;
}
double ans = strtod(tok, NULL);
The subsequent strtod will skip leading whitespace for you, so you don't need to get super fancy.

3 memory leaks in a program reading lines, even with exit() used

After running the program in valgrind -v it shows me that there are 3 unfreed blocks of memory (50 allocs, 47 frees). I'm probably failing to free infile outfile and temp - I guess. But when I put:
else {
free(line);
fclose(infile); /* added lines */
fclose(outfile); /* added lines */
free(temp); /* added lines */
exit(EXIT_FAILURE);
}
it doesn't compile, showing me errors about undefined use of temp and outfile.
EDIT:
I changed it to (in lineRead):
else {
free(line);
fclose(infile);
return NULL;
}
and added following error catcher after while in main:
if ((check = readline(infile)) == NULL) {
fclose(outfile);
}
However, this gives me even more errors. Why is that?
/EDIT
How to fix that? I though that exit() does all the cleaning needed...
The code is changed in [1] because I wanted to simulate that particular error.
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
char* lineRead(FILE* infile)
{
char* line = NULL;
char* newbuf = NULL;
int c;
size_t bufsize = 0;
size_t size = 0;
while((c=fgetc(infile)) != EOF) {
if (size >= bufsize) {
if (bufsize == 0)
bufsize = 2;
else if (bufsize <= ((size_t)-1)/2)
bufsize = size+1;
else {
free(line);
exit(EXIT_FAILURE);
}
newbuf = realloc(line,bufsize);
if (!newbuf) {
free(line);
exit(EXIT_FAILURE);
} else {
line = newbuf;
}
}
if (c != '\n') {
line[size++]=c;
}
}
if(size >= bufsize) {
if (size > (size_t)-1) /* [1] I know that there should be*/
/* '<', but it is '>' just for testing errors */
bufsize = size + 1;
else {
free(line);
exit(EXIT_FAILURE);
}
newbuf = realloc(line,bufsize);
if (!newbuf) {
free(line);
exit(EXIT_FAILURE);
}
line = newbuf;
}
line[size++]='\0';
return line;
}
int main(int argc, char* argv[])
{
char *line=NULL;
char **lines=NULL;
int linenumber=0;
int c;
void *temp=NULL;
while((line=lineRead(infile))!=NULL) {
linenumber++;
temp=realloc(lines, (linenumber)*sizeof(char*));
if(temp==NULL) {
printf("Bad alloc error\n");
free(lines);
return 0;
} else {
lines=temp;
}
}
/* processing lines */
free(lines);
return 0;
}
You didn't free
newbuf // in read line
line // in main
temp // Which gets free'd only on after bad malloc
You don't say where you added free()s went, but If it is in lineread, then those other variables aren't visible there (they were declared local to main; you've defined a DIFFERENT line here). So you'll need to either pass them as parameters or make them global.

How do I handle a stream of data internal to a C-based app?

I am pulling data from a bzip2 stream within a C application. As chunks of data come out of the decompressor, they can be written to stdout:
fwrite(buffer, 1, length, stdout);
This works great. I get all the data when it is sent to stdout.
Instead of writing to stdout, I would like to process the output from this statement internally in one-line-chunks: a string that is terminated with a newline character \n.
Do I write the output of the decompressor stream to another buffer, one character at a time, until I hit a newline, and then call the per-line processing function? Is this slow and is there a smarter approach? Thanks for your advice.
EDIT
Thanks for your suggestions. I ended up creating a pair of buffers that store the remainder (the "stub" at the end of an output buffer) at the beginning of a short line buffer, each time I pass through the output buffer's worth of data.
I loop through the output buffer character by character and process a newline-line's worth of data at a time. The newline-less remainder gets allocated and assigned, and copied to the next stream's line buffer. It seems like realloc is less expensive than repeated malloc-free statements.
Here's the code I came up with:
char bzBuf[BZBUFMAXLEN];
BZFILE *bzFp;
int bzError, bzNBuf;
char bzLineBuf[BZLINEBUFMAXLEN];
char *bzBufRemainder = NULL;
int bzBufPosition, bzLineBufPosition;
bzFp = BZ2_bzReadOpen(&bzError, *fp, 0, 0, NULL, 0); /* http://www.bzip.org/1.0.5/bzip2-manual-1.0.5.html#bzcompress-init */
if (bzError != BZ_OK) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be retrieved\n\n");
return -1;
}
bzError = BZ_OK;
bzLineBufPosition = 0;
while (bzError == BZ_OK) {
bzNBuf = BZ2_bzRead(&bzError, bzFp, bzBuf, sizeof(bzBuf));
if (bzError == BZ_OK || bzError == BZ_STREAM_END) {
if (bzBufRemainder != NULL) {
/* fprintf(stderr, "copying bzBufRemainder to bzLineBuf...\n"); */
strncpy(bzLineBuf, bzBufRemainder, strlen(bzBufRemainder)); /* leave out \0 */
bzLineBufPosition = strlen(bzBufRemainder);
}
for (bzBufPosition = 0; bzBufPosition < bzNBuf; bzBufPosition++) {
bzLineBuf[bzLineBufPosition++] = bzBuf[bzBufPosition];
if (bzBuf[bzBufPosition] == '\n') {
bzLineBuf[bzLineBufPosition] = '\0'; /* terminate bzLineBuf */
/* process the line buffer, e.g. print it out or transform it, etc. */
fprintf(stdout, "%s", bzLineBuf);
bzLineBufPosition = 0; /* reset line buffer position */
}
else if (bzBufPosition == (bzNBuf - 1)) {
bzLineBuf[bzLineBufPosition] = '\0';
if (bzBufRemainder != NULL)
bzBufRemainder = (char *)realloc(bzBufRemainder, bzLineBufPosition);
else
bzBufRemainder = (char *)malloc(bzLineBufPosition);
strncpy(bzBufRemainder, bzLineBuf, bzLineBufPosition);
}
}
}
}
if (bzError != BZ_STREAM_END) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be uncompressed\n\n");
return -1;
} else {
BZ2_bzReadGetUnused(&bzError, bzFp, 0, 0);
BZ2_bzReadClose(&bzError, bzFp);
}
free(bzBufRemainder);
bzBufRemainder = NULL;
I really appreciate everyone's help. This is working nicely.
I don't think there's a smarter approach (except finding an automata library that already does this for you). Be careful with allocating proper size for the "last line" buffer: if it cannot handle arbitrary length and the input comes from something accessible to third parties, it becomes a security risk.
I've also been working with processing bzip2 data per line, and I found that reading one byte at a time was too slow. This worked better for me:
#include <stdio.h>
#include <stdlib.h>
#include <bzlib.h>
/* gcc -o bz bz.c -lbz2 */
#define CHUNK 128
struct bzdata {
FILE *fp;
BZFILE *bzf;
int bzeof, bzlen, bzpos;
char bzbuf[4096];
};
static int bz2_open(struct bzdata *bz, char *file);
static void bz2_close(struct bzdata *bz);
static int bz2_read_line(struct bzdata *bz, char **line, int *li);
static int bz2_buf(struct bzdata *bz, char **line, int *li, int *ll);
static int
bz2_buf(struct bzdata *bz, char **line, int *li, int *ll)
{
int done = 0;
for (; bz->bzpos < bz->bzlen && done == 0; bz->bzpos++) {
if (*ll + 1 >= *li) {
*li += CHUNK;
*line = realloc(*line, (*li + 1) * sizeof(*(*line)));
}
if ( ((*line)[(*ll)++] = bz->bzbuf[bz->bzpos]) == '\n') {
done = 1;
}
}
if (bz->bzpos == bz->bzlen) {
bz->bzpos = bz->bzlen = 0;
}
(*line)[*ll] = '\0';
return done;
}
static int
bz2_read_line(struct bzdata *bz, char **line, int *li)
{
int bzerr = BZ_OK, done = 0, ll = 0;
if (bz->bzpos) {
done = bz2_buf(bz, line, li, &ll);
}
while (done == 0 && bz->bzeof == 0) {
bz->bzlen = BZ2_bzRead(&bzerr, bz->bzf, bz->bzbuf, sizeof(bz->bzbuf));
if (bzerr == BZ_OK || bzerr == BZ_STREAM_END) {
bz->bzpos = 0;
if (bzerr == BZ_STREAM_END) {
bz->bzeof = 1;
}
done = bz2_buf(bz, line, li, &ll);
} else {
done = -1;
}
}
/* Handle last lines that don't have a line feed */
if (done == 0 && ll > 0 && bz->bzeof) {
done = 1;
}
return done;
}
static int
bz2_open(struct bzdata *bz, char *file)
{
int bzerr = BZ_OK;
if ( (bz->fp = fopen(file, "rb")) &&
(bz->bzf = BZ2_bzReadOpen(&bzerr, bz->fp, 0, 0, NULL, 0)) &&
bzerr == BZ_OK) {
return 1;
}
return 0;
}
static void
bz2_close(struct bzdata *bz)
{
int bzerr;
if (bz->bzf) {
BZ2_bzReadClose(&bzerr, bz->bzf);
bz->bzf = NULL;
}
if (bz->fp) {
fclose(bz->fp);
bz->fp = NULL;
}
bz->bzpos = bz->bzlen = bz->bzeof = 0;
}
int main(int argc, char *argv[]) {
struct bzdata *bz = NULL;
int i, lc, li = 0;
char *line = NULL;
if (argc < 2) {
return fprintf(stderr, "usage: %s file [file ...]\n", argv[0]);
}
if ( (bz = calloc(1, sizeof(*bz))) ) {
for (i = 1; i < argc; i++) {
if (bz2_open(bz, argv[i])) {
for (lc = 0; bz2_read_line(bz, &line, &li) > 0; lc++) {
/* Process line here */
}
printf("%s: lines=%d\n", argv[i], lc);
}
bz2_close(bz);
}
free(bz);
}
if (line) {
free(line);
}
return 0;
}
This would be easy to do using C++'s std::string, but in C it takes some code if you want to do it efficiently (unless you use a dynamic string library).
char *bz_read_line(BZFILE *input)
{
size_t offset = 0;
size_t len = CHUNK; // arbitrary
char *output = (char *)xmalloc(len);
int bzerror;
while (BZ2_bzRead(&bzerror, input, output + offset, 1) == 1) {
if (offset+1 == len) {
len += CHUNK;
output = xrealloc(output, len);
}
if (output[offset] == '\n')
break;
offset++;
}
if (output[offset] == '\n')
output[offset] = '\0'; // strip trailing newline
else if (bzerror != BZ_STREAM_END) {
free(output);
return NULL;
}
return output;
}
(Where xmalloc and xrealloc handle errors internally. Don't forget to free the returned string.)
This is almost an order of magnitude slower than bzcat:
lars#zygmunt:/tmp$ wc foo
1193 5841 42868 foo
lars#zygmunt:/tmp$ bzip2 foo
lars#zygmunt:/tmp$ time bzcat foo.bz2 > /dev/null
real 0m0.010s
user 0m0.008s
sys 0m0.000s
lars#zygmunt:/tmp$ time ./a.out < foo.bz2 > /dev/null
real 0m0.093s
user 0m0.044s
sys 0m0.020s
Decide for yourself whether that's acceptable.
I think you should copy chunks of characters to another buffer until the latest chunk you write contains a new line character. Then you can work on the whole line.
You can save the rest of the buffer (after the '\n') into a temporary and then create a new line from it.

Resources