Related
So, I looked around the internet and a couple questions here and I couldn't find anything that could fix my problem here. I have an assignment for C programming, to write a program that allows user to enter words into a string, add more words, put all words in the string to a text file, delete all words in string, and when they exit it saves the words in a binary, which is loaded upon starting up the program again. I've gotten everything to work except where the binary is concerned.
I made two functions, one that loads the bin file when the program starts, one that saves the bin file when it ends. I don't know in which, or if in both, the problem starts. But basically I know it's not working right because I get garbage in my text file if I save it in a text file after the program loads the bin file into the string. I know for sure that the text file saver is working properly.
Thank you to anyone who takes the time to help me out, it's been an all-day process! lol
Here are the two snippets of my functions, everything else in my code seems to work so I don't want to blot up this post with the entire program, but if need be I'll put it up to solve this.
SIZE is a constant of 10000 to meet program specs of a 1000 words. But I couldn't get this to run even asking for only 10 elements or 1, just to clear that up
void loadBin(FILE *myBin, char *stringAll) {
myBin = fopen("myBin.bin", "rb");
if (myBin == NULL) {
saveBin(&myBin, stringAll);
}//if no bin file exists yet
fread(stringAll, sizeof(char), SIZE + 1, myBin);
fclose(myBin); }
/
void saveBin(FILE *myBin, char *stringAll) {
int stringLength = 0;
myBin = fopen("myBin.bin", "wb");
if (myBin == NULL) {
printf("Problem writing file!\n");
exit(-1);
stringLength = strlen(stringAll);
fwrite(&stringAll, sizeof(char), (stringLength + 1), myBin);
fclose(myBin); }
You are leaving bad values in your myBin FILE*, and passing the & (address) of a pointer.
Pass the filename, and you can (re) use the functions for other purposes, other files, et al.
char* filename = "myBin.bin";
Pass the filename, buffer pointer, and max size to read. You should consider using stat/fstat to discover file size
size_t loadBin(char *fn, char *stringAll, size_t size)
{
//since you never use myBin, keep this FILE* local
FILE* myBin=NULL;
if( NULL == (myBin = fopen(fn, "rb")) ) {
//create missing file
saveBin(fn, stringAll, 0);
}//if no bin file exists yet
size_t howmany = fread(stringAll, sizeof(char), size, myBin);
if( howmany < size ) printf("read fewer\n");
if(myBin) fclose(myBin);
return howmany;
}
Pass the file name, buffer pointer, and size to save
size_t saveBin(char *fn, char *stringAll, size_t size)
{
int stringLength = 0;
//again, why carry around FILE* pointer only used locally?
FILE* myBin=NULL;
if( NULL == (myBin = fopen("myBin.bin", "wb")) ) {
printf("Problem writing file!\n");
exit(-1);
}
//binary data may have embedded '\0' bytes, cannot use strlen,
//stringLength = strlen(stringAll);
size_t howmany = fwrite(stringAll, sizeof(char), size, myBin);
if( howmany < size ) printf("short write\n");
if(myBin) fclose(myBin);
return howmany;
}
Call these; you are not guaranteed to write & read the same sizes...
size_t buffer_size = SIZE;
char buffer[SIZE]; //fill this with interesting bytes
saveBin(filename, buffer, buffer_size);
size_t readcount = loadBin(filename, buffer, buffer_size);
I have created a framework to parse text files of reasonable size that can fit in memory RAM, and for now, things are going well. I have no complaints, however what if I encountered a situation where I have to deal with large files, say, greater than 8GB(which is the size of mine)?
What would be an efficient approach to deal with such large files?
My framework:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
int Parse(const char *filename,
const char *outputfile);
int main(void)
{
clock_t t1 = clock();
/* ............................................................................................................................. */
Parse("file.txt", NULL);
/* ............................................................................................................................. */
clock_t t2 = clock();
fprintf(stderr, "time elapsed: %.4f\n", (double)(t2 - t1) / CLOCKS_PER_SEC);
fprintf(stderr, "Press any key to continue . . . ");
getchar();
return 0;
}
long GetFileSize(FILE * fp)
{
long f_size;
fseek(fp, 0L, SEEK_END);
f_size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
return f_size;
}
char *dump_file_to_array(FILE *fp,
size_t f_size)
{
char *buf = (char *)calloc(f_size + 1, 1);
if (buf) {
size_t n = 0;
while (fgets(buf + n, INT_MAX, fp)) {
n += strlen(buf + n);
}
}
return buf;
}
int Parse(const char *filename,
const char *outputfile)
{
/* open file for reading in text mode */
FILE *fp = fopen(filename, "r");
if (!fp) {
perror(filename);
return 1;
}
/* store file in dynamic memory and close file */
size_t f_size = GetFileSize(fp);
char *buf = dump_file_to_array(fp, f_size);
fclose(fp);
if (!buf) {
fputs("error: memory allocation failed.\n", stderr);
return 2;
}
/* state machine variables */
// ........
/* array index variables */
size_t x = 0;
size_t y = 0;
/* main loop */
while (buf[x]) {
switch (buf[x]) {
/* ... */
}
x++;
}
/* NUL-terminate array at y */
buf[y] = '\0';
/* write buffer to file and clean up */
outputfile ? fp = fopen(outputfile, "w") :
fp = fopen(filename, "w");
if (!fp) {
outputfile ? perror(outputfile) :
perror(filename);
}
else {
fputs(buf, fp);
fclose(fp);
}
free(buf);
return 0;
}
Pattern deletion function based on the framework:
int delete_pattern_in_file(const char *filename,
const char *pattern, const char *outputfile)
{
/* open file for reading in text mode */
FILE *fp = fopen(filename, "r");
if (!fp) {
perror(filename);
return 1;
}
/* copy file contents to buffer and close file */
size_t f_size = GetFileSize(fp);
char *buf = dump_file_to_array(fp, f_size);
fclose(fp);
if (!buf) {
fputs("error - memory allocation failed", stderr);
return 2;
}
/* delete first match */
size_t n = 0, pattern_len = strlen(pattern);
char *tmp, *ptr = strstr(buf, pattern);
if (!ptr) {
fputs("No match found.\n", stderr);
free(buf);
return -1;
}
else {
n = ptr - buf;
ptr += pattern_len;
tmp = ptr;
}
/* delete the rest */
while (ptr = strstr(ptr, pattern)) {
while (tmp < ptr) {
buf[n++] = *tmp++;
}
ptr += pattern_len;
tmp = ptr;
}
/* copy the rest of the buffer */
strcpy(buf + n, tmp);
/* open file for writing and print the processed buffer to it */
outputfile ? fp = fopen(outputfile, "w") :
fp = fopen(filename, "w");
if (!fp) {
outputfile ? perror(outputfile) :
perror(filename);
}
else {
fputs(buf, fp);
fclose(fp);
}
free(buf);
return 0;
}
If you wish to stick with your current design, an option might be to mmap() the file instead of reading it into a memory buffer.
You could change the function dump_file_to_array to the following (linux-specific):
char *dump_file_to_array(FILE *fp, size_t f_size) {
buf = mmap(NULL, f_size, PROT_READ, MAP_SHARED, fileno(fp), 0);
if (buf == MAP_FAILED)
return NULL;
return buf;
}
Now you can read over the file, the memory manager will take automatically care to only hold the relevant potions of the file in memory.
For Windows, similar mechanisms exist.
Chances you are parsing the file line-by line. So read in a large block (4k or 16k) and parse all the lines in that. Copy the small remainder to the beginning of the 4k or 16k buffer and read in the rest of the buffer. Rinse and repeat.
For JSON or XML you will need an event based parser that can accept multiple blocks or input.
There are multiple issues with your approach.
The concept of maximum and available memory are not so evident: technically, you are not limited by the RAM size, but by the quantity of memory your environment will let you allocate and use for your program. This depends on various factors:
What ABI you compile for: the maximum memory size accessible to your program is limited to less than 4 GB if you compile for 32-bit code, even if your system has more RAM than that.
What quota the system is configured to let your program use. This may be less than available memory.
What strategy the system uses when more memory is requested than is physically available: most modern systems use virtual memory and share physical memory between processes and system tasks (such as the disk cache) using very advanced algorithms that cannot be describe in a few lines. It is possible on some systems for your program to allocate and use more memory than is physically installed on the motherboard, swapping memory pages to disk as more memory is accessed, at a huge cost in lag time.
There are further issues in your code:
The type long might be too small to hold the size of the file: on Windows systems, long is 32-bit even on 64-bit versions where memory can be allocated in chunks larger than 2GB. You must use different API to request the file size from the system.
You read the file with an series of calls to fgets(). This is inefficient, a single call to fread() would suffice. Furthermore, if the file contains embedded null bytes ('\0' characters), chunks from the file will be missing in memory. However you could not deal with embedded null bytes if you use string functions such as strstr() and strcpy() to handle your string deletion task.
the condition in while (ptr = strstr(ptr, pattern)) is an assignment. While not strictly incorrect, it is poor style as it confuses readers of your code and prevents life saving warnings by the compiler where such assignment-conditions are coding errors. You might think that could never happen, but anyone can make a typo and a missing = in a test is difficult to spot and has dire consequences.
you short-hand use of the ternary operator in place of if statements is quite confusing too: outputfile ? fp = fopen(outputfile, "w") : fp = fopen(filename, "w");
rewriting the input file in place is risky too: if anything goes wrong, the input file will be lost.
Note that you can implement the filtering on the fly, without a buffer, albeit inefficiently:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[]) {
if (argc < 2) {
fprintf(stderr, "usage: delpat PATTERN < inputfile > outputfile\n");
return 1;
}
unsigned char *pattern = (unsigned char*)argv[1];
size_t i, j, n = strlen(argv[1]);
size_t skip[n + 1];
int c;
skip[0] = 0;
for (i = j = 1; i < n; i++) {
while (memcmp(pattern, pattern + j, i - j)) {
j++;
}
skip[i] = j;
}
i = 0;
while ((c = getchar()) != EOF) {
for (;;) {
if (i < n && c == pattern[i]) {
if (++i == n) {
i = 0; /* match found, consumed */
}
break;
}
if (i == 0) {
putchar(c);
break;
}
for (j = 0; j < skip[i]; j++) {
putchar(pattern[j]);
}
i -= skip[i];
}
}
for (j = 0; j < i; j++) {
putchar(pattern[j]);
}
return 0;
}
First of all I wouldn't suggest holding such big files in RAM but instead using streams. This because buffering is usually done by the library as well as by the kernel.
If you are accessing the file sequentially, which seems to be the case, then you probably know that all modern systems implement read-ahead algorithms so just reading the whole file ahead of time IN RAM may in most cases just waste time.
You didn't specify the use-cases you have to cover so I'm going to have to assume that using streams like
std::ifstream
and doing the parsing on the fly will suit your needs. As a side note, also make sure your operations on files that are expected to be large are done in separate threads.
An alternative solution: If you're on linux systems, and you have a decent amount of swap space, just open the whole bad boy up. It will consume your ram and also consume harddrive space (swap). Thus you can have the entire thing open at once, just not all of it will be on the ram.
Pros
If an unexpected shut down occurred, the memory on the swap space is recoverable.
RAM is expensive, HDDs are cheap, so the application would put less strain on your expensive equipment
Virus could not harm your computer because there would be no room in RAM for them to run
You'll be taking full advantage of the Linux operating system by using the swap space. Normally the swap space module is not used and all it does is clog up precious ram.
The additional energy that is needed to utilize the entirety of the ram can warm the immediate area. Useful during winter time
You can add "Complex and Special Memory Allocation Engineering" to your resume.
Cons
None
Consider treating the file as an external array of lines.
Code can use an array of line indexes. This index array can be kept in memory at a fraction of the size of the large file. Access to any line is accomplished quickly via this lookup, a seek with fsetpos() and an fread()/fgets(). As the lines are edited, the new lines can be saved, in any order, in temporary text file. Saving of the file reads both the original file and temp one in sequence to form and write the new file.
typedef struct {
int attributes; // not_yet_read, line_offset/length_determined,
// line_changed/in_other_file, deleted, etc.
fpos_t line_offset; // use with fgetpos() fsetpos()
unsigned line_length; // optional field as code could re-compute as needed.
} line_index;
size_t line_count;
// read some lines
line_index *index = malloc(sizeof *index * line_count);
// read more lines
index = realloc(index, sizeof *index * line_count);
// edit lines, save changes to appended temporary file.
// ...
// Save file -weave the contents of the source file and temp file to the new output file.
Additionally, with enormous files, the array line_index[] itself can be realized in disk memory too. Access to is easily computed. In an extreme sense, only 1 line of the file needs to in memory at any time.
You mentioned state-machine. Every finite-state-automata can be optimized to have minimal (or no) lookahead.
Is it possible to do this in Lex? It will generate output c file which you can compile.
If you don't want to use Lex, you can always do following:
Read n chars into (ring?) buffer where n is size of pattern.
Try to match buffer with pattern
If match goto 1
Print buffer[0], read char, goto 2
Also for very long patterns and degenerate inputs strstr can be slow. In that case you might want to look into more advanced sting matching aglorithms.
mmap() is a pretty good way of working on files with large sizes.
It provides you with lot of flexibility but you need to be cautious with page size. Here is a good article which talks about more specifics.
I have a few binary files that I want to write into an output file.
So I wrote this function using a char as a buffer naively thinking it would work.
//Opened hOutput for writing, hInput for reading
void faddf(FILE* hOutput, FILE* hInput) {
char c;
int scan;
do{
scan = fscanf(hInput, "%c", &c);
if (scan > 0)
fprintf(hOutput, "%c", c);
} while (scan > 0 && !feof(hInput));
}
Executing this function gives me an output of the few readable char's in the beginning binary file. So I tried it this way:
void faddf(FILE* hOutput, FILE* hInput) {
void * buffer;
int scan;
buffer = malloc(sizeof(short) * 209000000);
fread(buffer, sizeof(short), 209000000, hInput);
fwrite(buffer, sizeof(short), 209000000, hOutput);
free(buffer);
}
This "works" but is only works when the file is smaller then my "magic number" Is there a better way?
Although your new code (in the answer) is much better than the old code, it can still be improved and simplified.
Specifically, you can avoid any memory problems by copying the file in chunks.
void faddf( FILE *fpout, FILE *fpin )
{
char buffer[4096];
size_t count;
while ( (count = fread(buffer, 1, sizeof buffer, fpin)) > 0 )
fwrite(buffer, 1, count, fpout);
}
You should avoid reading bytes per byte. Use the fgets() function instead of fscanf().
Please refer to : Man fgets() (for Windows)
When you open both files next to each other (input one / output one), you're saying that the output file only contains readable characters... But can your text editor display unreadable characters on the input one ?
I should not have asked the question in the first place but here is how I ended up doing it:
void faddf(FILE* hOutput, FILE* hInput) {
void * buffer;
int scan,size;
size_t read;
//get the input file size
fseek(hInput, 0L, SEEK_END);
size = ftell(hInput);
fseek(hInput, 0L, SEEK_SET);
//place the get space
buffer = malloc(size);
if (buffer == NULL)exit(1);//should fail silently instead
//try to read everything to buffer
read = fread(buffer, 1, size, hInput);
//write what was read
fwrite(buffer, 1, read, hOutput);
//clean up
free(buffer);
}
What is the best way to create a empty text file of given length in C? Writing space or any special char is not an option. I mean it should directly create the file without any iteration up to file length or something.
It's pretty trivial to do. All you have to do is to seek the intended position and then write something:
#include <stdio.h>
const unsigned int wanted_size = 1048576;
int main(int argc, char **argv) {
FILE *fp = fopen("test.dat", "w+");
if (fp) {
// Now go to the intended end of the file
// (subtract 1 since we're writing a single character).
fseek(fp, wanted_size - 1, SEEK_SET);
// Write at least one byte to extend the file (if necessary).
fwrite("", 1, sizeof(char), fp);
fclose(fp);
}
return 0;
}
The example above will create a file that is 1 MB in length. Just keep in mind that the actual space will be allocated immediately, not just reserved.
This will also allow you to allocate files larger than your system memory. With the code above I'm able to instantly (< 1 ms) reserve a 1 GB large file on a Raspberry Pi (which only has 512 MB RAM) without having to use any kind of iteration.
You're also able to use any other way to write data to the position (like fputs()), it's just important that you actually write something. Calling fputs("", fp); won't necessarily extend the file as intended.
On Windows use SetFilePointer and SetEndOfFile, on Linux use truncate (which also increases).
This is what I came up with.
// hello.c
#include <stdio.h>
int CreateFileSetSize(const char *file, int size)
{
FILE *pFile;
pFile = fopen(file, "w");
if (NULL == pFile)
{
return 1;
}
fseek(pFile, size, SEEK_SET);
fputc('\n', pFile);
fclose(pFile);
return 0;
}
int main(int argc, const char *argv[])
{
const char *fileName = "MyFile.txt";
int size = 1024;
int ret = 0;
if (3 == argc)
{
fileName = argv[1];
size = atoi(argv[2]);
}
ret = CreateFileSetSize(fileName, size);
return ret;
}
I apparently am not the only one to come up with this solution. I happened to find the following question right here on Stack Overflow.
How to create file of “x” size?
How to create file of "x" size?
I am writing in C using OpenSSL library.
How can I calculate hash of a large file using md5?
As I know, I need to load a whole file to RAM as char array and then call the hash function. But what if the file is about 4Gb long? Sounds like a bad idea.
SOLVED: Thanks to askovpen, I found my bug. I've used
while ((bytes = fread (data, 1, 1024, inFile)) != 0)
MD5_Update (&mdContext, data, 1024);
not
while ((bytes = fread (data, 1, 1024, inFile)) != 0)
MD5_Update (&mdContext, data, bytes);
example
gcc -g -Wall -o file file.c -lssl -lcrypto
#include <stdio.h>
#include <openssl/md5.h>
int main()
{
unsigned char c[MD5_DIGEST_LENGTH];
char *filename="file.c";
int i;
FILE *inFile = fopen (filename, "rb");
MD5_CTX mdContext;
int bytes;
unsigned char data[1024];
if (inFile == NULL) {
printf ("%s can't be opened.\n", filename);
return 0;
}
MD5_Init (&mdContext);
while ((bytes = fread (data, 1, 1024, inFile)) != 0)
MD5_Update (&mdContext, data, bytes);
MD5_Final (c,&mdContext);
for(i = 0; i < MD5_DIGEST_LENGTH; i++) printf("%02x", c[i]);
printf (" %s\n", filename);
fclose (inFile);
return 0;
}
result:
$ md5sum file.c
25a904b0e512ee546b3f47574703d9fc file.c
$ ./file
25a904b0e512ee546b3f47574703d9fc file.c
First, MD5 is a hashing algorithm. It doesn't encrypt anything.
Anyway, you can read the file in chunks of whatever size you like. Call MD5_Init once, then call MD5_Update with each chunk of data you read from the file. When you're done, call MD5_Final to get the result.
You don't have to load the entire file in memory at once. You can use the functions MD5_Init(), MD5_Update() and MD5_Final() to process it in chunks to produce the hash. If you are worried about making it an "atomic" operation, it may be necessary to lock the file to prevent someone else changing it during the operation.
The top answer is correct, but didn't mention something: The value of the hash will be different for each buffer size used. The value will be consistent across hashes, so the same buffer size will produce the same hash everytime, however if this hash will be compared against a hash of the same data at a later time, the same buffer size must be used for each call.
In addition, if you want to make sure your digest code functions correctly, and go online to compare your hash with the online hashing websites, it appears they use a buffer length of 1. This also brings an interesting thought: It is perfectly acceptable to use a buffer length of 1 to hash a large file, it will just take longer (duh).
So my rule of thumb is if it's only for internal use, then I can set the buffer length accordingly for a large file, but if it has to play nice with other systems, then set the buffer length to 1 and deal with the time consequence.
int hashTargetFile(FILE* fp, unsigned char** md_value, int *md_len) {
#define FILE_BUFFER_LENGTH 1
EVP_MD_CTX *mdctx;
const EVP_MD *md;
int diglen; //digest length
int arrlen = sizeof(char)*EVP_MAX_MD_SIZE + 1;
int arrlen2 = sizeof(char)*FILE_BUFFER_LENGTH + 1;
unsigned char *digest_value = (char*)malloc(arrlen);
char *data = (char*)malloc(arrlen2);
size_t bytes; //# of bytes read from file
mdctx = EVP_MD_CTX_new();
md = EVP_sha512();
if (!mdctx) {
fprintf(stderr, "Error while creating digest context.\n");
return 0;
}
if (!EVP_DigestInit_ex(mdctx, md, NULL)) {
fprintf(stderr, "Error while initializing digest context.\n");
return 0;
}
while (bytes = fread(data, 1, FILE_BUFFER_LENGTH, fp) != 0) {
if (!EVP_DigestUpdate(mdctx, data, bytes)) {
fprintf(stderr, "Error while digesting file.\n");
return 0;
}
}
if (!EVP_DigestFinal_ex(mdctx, digest_value, &diglen)) {
fprintf(stderr, "Error while finalizing digest.\n");
return 0;
}
*md_value = digest_value;
*md_len = diglen;
EVP_MD_CTX_free(mdctx);
return 1;
}