Hashing program not returning identical values for the same file - c

This hashing function I've created (that scans for new files, and computes their hashes) seemingly functions, however once removing a file, for example test.c, and then replacing it with the exact same file it returns 2 different hash values. By this I mean that whilst the program is running the first calculation might return a hash of 1234, for example, and once deleting and placing the same file within the folder the it then returns 2345.
There seems to be no order, as 1234 could be the result 5 times in a row. I wondered whether there's any strikingly obvious reason in this code?
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <sys/inotify.h>
#include <openssl/sha.h>
int main (int argc, char *argv[])
{
int fd;
unsigned char c[SHA512_DIGEST_LENGTH];
int i;
SHA512_CTX mdContext;
int bytes;
unsigned char data[1024];
const int event_size = sizeof(struct inotify_event);
const int buf_len = 1024 * (event_size + FILENAME_MAX);
char *directory = "/home/joe/Documents/";
char *hashDirectory = "/home/joe/Documents/_Hash/";
char hashInBuf[100];
char hashOutBuf[100];
fd = inotify_init();
if (fd < 0) {
perror("inotify_init");
}
while (1) {
char buff[buf_len];
int no_of_events, count = 0;
//SEARCH FOR NEW FILES WITHIN DIRECTORY
no_of_events = read (fd, buff, buf_len);
while (count < no_of_events) {
struct inotify_event *event = (struct inotify_event *)&buff[count];
if (event->len) {
if ((event->mask & IN_CREATE))
if(!(event->mask & IN_ISDIR)) {
printf("\n%s has been created\n", event->name);
//CONJOIN DIRECTORY AND FILENAME / EXTENSION
snprintf(hashInBuf, sizeof(hashInBuf), "%s/%s", directory, event->name);
snprintf(hashOutBuf, sizeof(hashOutBuf), "%s/%s.txt", hashDirectory, event->name);
FILE *ftest=fopen(hashInBuf, "rb");
FILE *ftest2=fopen(hashOutBuf, "wt");
//HASH FUNCTION
SHA512_Init (&mdContext);
while ((bytes = fread (data, 1, 1024, ftest)) != 0)
SHA512_Update (&mdContext, data, bytes);
SHA512_Final (c,&mdContext);
for(i = 0; i < SHA512_DIGEST_LENGTH; i++){
fprintf(ftest2, "%02x", c[i]);
printf("%02x", c[i]);
}
fclose (ftest);
fclose (ftest2);
fflush (stdout);
}
}
count += event_size + event->len;
}
}
return 0;
}
Thank you in advance!

In this line
if ((event->mask & IN_CREATE))
you wait for the event that a file is created. Then, your hashing function immediately starts running!
This may lead to the situation that the file is not fully written yet, so you only hashed a part of the file.
You should use the event IN_CLOSE_WRITE to make sure, that the file has already been completely written.
Another option is to not create the files in this directory, but creating them in a temporary directory and subsequently moving them into the target directory. The corresponding event is IN_MOVED_TO then.

Related

how to open file within while loop that is dependant on information not yet declared?

I am attempting to open a file that will only be known once it has been created within a directory, however the FILE *infile (etc, etc..) function does not work in this scenario as "infile" has not previously been delcared. I can't work out how to declare this before the loop so that it gets the current file that is being iterated at that time.
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <sys/inotify.h>
#include <openssl/sha.h>
int main (int argc, char *argv[])
{
int result;
int fd;
int wd;
unsigned char c[SHA512_DIGEST_LENGTH];
int i;
//FILE *inFile = fopen (filename, "rb"); //I'm aware this would usually
//be declared here
SHA512_CTX mdContext;
int bytes;
unsigned char data[1024];
const int event_size = sizeof(struct inotify_event);
const int buf_len = 1024 * (event_size + FILENAME_MAX);
fd = inotify_init();
if (fd < 0) {
perror("inotify_init");
}
wd = inotify_add_watch(fd, "/home/joe/Documents", IN_CREATE);
while (1) {
char buff[buf_len];
char target[FILENAME_MAX];
int no_of_events, count = 0;
no_of_events = read (fd, buff, buf_len);
while (count < no_of_events) {
struct inotify_event *event = (struct inotify_event *)&buff[count];
if (event->len) {
if (event->mask & IN_CREATE)
if(!(event->mask & IN_ISDIR)) {
printf("The file %s has been created\n", event->name);
//FILE *infile = fopen (filename, "rb"); //issue arises here
//when not commented
SHA512_Init (&mdContext);
while ((bytes = fread (data, 1, 1024, filename)) != 0)
SHA512_Update (&mdContext, data, bytes);
SHA512_Final (c,&mdContext);
for(i = 0; i < SHA512_DIGEST_LENGTH; i++) printf("%02x", c[i]);
printf (" %s\n", event->name);
fclose (filename);
return 0;
fflush(stdout);
}
}
count += event_size + event->len;
}
}
return 0;
}
I'm trying to work out the issue hence the comments and also undeclared "filename".
The name of the file that you want to open is stored in event->name. That's what you want to pass to fopen. Also, you want to pass infile to both fread and fclose.
FILE *infile = fopen (event->name, "rb"); // event->name is the filename
SHA512_Init (&mdContext);
while ((bytes = fread (data, 1, 1024, infile )) != 0) // read from infile
SHA512_Update (&mdContext, data, bytes);
SHA512_Final (c,&mdContext);
for(i = 0; i < SHA512_DIGEST_LENGTH; i++) printf("%02x", c[i]);
printf (" %s\n", event->name);
fclose (infile); // close infile

how to print name of newly created file(s) within a directory in C?

This code scans for newly created files within a directory, however where "%s" should contain the name of the new file(s) this does not occur.
I can imagine there are unnecessary pieces of code written here, however being quite unfamiliar with C I'm simply happy it compiles at this point (and actually recognizes new files) !
#include <stdio.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <sys/inotify.h>
int main (int argc, char *argv[])
{
char target[FILENAME_MAX];
int result;
int fd;
int wd; /* watch descriptor */
const int event_size = sizeof(struct inotify_event);
const int buf_len = 1024 * (event_size + FILENAME_MAX);
fd = inotify_init();
if (fd < 0) {
perror("inotify_init");
}
wd = inotify_add_watch(fd, "/home/joe/Documents", IN_CREATE);
while (1) {
char buff[buf_len];
int no_of_events, count = 0;
no_of_events = read (fd, buff, buf_len);
while (count < no_of_events) {
struct inotify_event *event = (struct inotify_event *)&buff[count];
if (event->len) {
if (event->mask & IN_CREATE)
if(!(event->mask & IN_ISDIR)) {
printf("The file %s has been created\n", target);
fflush(stdout);
}
}
count += event_size + event->len;
}
}
return 0;
}
You're printing out target when you get an event, however target is never modified.
The name of the created file is stored in event->name. That's what you want to print.
printf("The file %s has been created\n", event->name);

Is there a way to read a text file byte by byte asynchronously with POSIX threads?

I am trying to read and copy its contents to another file asynchronously with POSIX threads in C. Assuming a file contains "aabbcc" and i have 4 threads, how can i copy "aabbcc" to another file with threads asynchronously in C. This part has been stuck in my head for the entire day. What i have done so far is shown below.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <pthread.h>
#include <aio.h>
#include <math.h> //for ceil() and floor()
#include <sys/types.h>
#include <unistd.h>
#define FILE_SIZE 1024 //in bytes
//>cc code.c -o code.out -lrt -lpthread
//>./code.out
char alphabets[52] = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
'p','q','r','s','t','u','v','w','x','y','z',
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'};
long prepareInputFile(char* filename)
{
FILE *fp;
fp = fopen(filename, "w");
if(fp == NULL)
{
printf("Cannot write to input file\n");
return;
}
int index;
char str[FILE_SIZE];
int rand_size = (rand() % 1024)+1;
for(index = 0;index < rand_size;index++) /*Generate the file with random sizes in bytes*/
{
int num2 = (rand() % 52); /*Get a random char in char array*/
putc(alphabets[num2],fp); /*Write that char to the file pointed to by fp*/
}
putc('\n',fp);
fseek(fp, 0L, SEEK_END);
long size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
return size;
}
//Perform main operation inside this function
void *writeToFileAsync(void *src_file, void *dest_file,
void *thread, void *t_count, void *filesize)
{
int readfd, writefd;
struct aiocb aio_write, aio_read;
memset(&aio_read, 0, sizeof(aio_read));
aio_read.aio_fildes = readfd;
aio_read.aio_nbytes = (int)filesize/(int)t_count;
readfd = open((char *)src_file, O_RDONLY);
if(readfd < 0)
{
printf("Cannot open the file for reading\n");
}
memset(&aio_write, 0, sizeof(aio_write));
aio_read.aio_fildes = writefd;
aio_read.aio_nbytes = (int)filesize/(int)t_count;
writefd = open((void *)dest_file, O_CREAT | O_WRONLY);
if(writefd < 0)
{
printf("Cannot open the file for writing\n");
}
return;
}
int main(int argc, char *argv[])
{
int i,threadCount;
char sourcePath[100], destPath[100];
strcpy(sourcePath,argv[1]);
if(strcmp(sourcePath, "-") == 0)
{
getcwd(sourcePath, sizeof(sourcePath));
strcpy(sourcePath, strcat(sourcePath, "/source.txt"));
}
else
{
strcpy(sourcePath, strcat(sourcePath, "source.txt"));
}
printf("Source path is: %s\n", sourcePath);
strcpy(destPath,argv[2]);
if(strcmp(destPath, "-") == 0)
{
getcwd(destPath, sizeof(destPath));
strcpy(destPath, strcat(destPath, "/destination.txt"));
}
else
{
strcpy(destPath, strcat(destPath, "destination.txt"));
}
printf("Dest path is: %s\n", destPath);
threadCount = strtol(argv[3],NULL,10);
long file_size = prepareInputFile(sourcePath);
pthread_t threads[threadCount];
for(i=0;i<threadCount;i++)
{
pthread_create(&threads[i],NULL,(void *)writeToFileAsync, NULL);
}
return 0;
}
Any help would be appreciated.
It is unlikely that parallelizing this operation would help, as it is probably bound by I/O rather than CPU time, and copying this way will certainly not be faster than simply copying via system call.
However, if you wanted to do this, one method would be: map the input file into memory (with mmap() or the equivalent), create the destination buffer or memory-mapped file, divide the source and destination files into equal slices, and have each thread copy its slice of the file. You might use memcpy(), but a modern compiler can see what your loop is doing and optimize it.
Even this is not going to be as fast as reading or mapping the source file into a buffer, then writing it back out from the same buffer with write(). If all you need to do is copy the file to disk, you don’t need to copy the bytes at all. In fact, you might even be able to make a second link to the file on disk.
This would probably work best if the slices are aligned to page boundaries. Be very careful about having two threads write to the same cache line, as this creates a race condition.

Applying fork() and pipe() (or fifo()) on counting words code

I've completed writing of counting words code finally. It counts total number of words in files. (i.e. txt). Now, I want to use multiple fork() to access and read every file. I studied in the last week. Besides, I use global variable to hold number of counted words. As far as I know, If I apply fork(), used global variables are assigned as 0. To avoid it, I tried to use mmap() and similar functions this is okey. But, I also want to use pipe() also (fifo() if it is possible) to communicate (hold values of numbers).
I use nftw() function to go in folders and files. My logic is on the below picture. How can use fork() and pipe() (fifo()) on this code ? fork() is really complicated for me because of my inexperience. I'm new using of pipe() and fork(). According to my idea logic of the code is that if I can use fork() and pipe(), there will be fork() every file(i.e. txt) and access them by using fork. If there is another folder and there are files, again creates fork() from one of created forks , then access file. I try to explain also drawing below. Thank you. I want to learn using of them.
int countInEveryFolder(const char *dir)
is used because I don't know how to count files until the next folder in nftw() function. Number of files is necessary because it is number of fork.
Every folder should be parent of files. The files are included by the folder.
Code:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <dirent.h>
#include <errno.h>
#include <ftw.h>
#include <ctype.h>
#include <sys/mman.h>
#include <locale.h>
#include <errno.h>
#define MAX_PATH_LEN 2048
unsigned long total_words = 0UL;
unsigned long total_dirs = 0UL;
unsigned long total_files = 0UL;
// Just proves counting number of file in a folder
int countInEveryFolder(const char *dir) {
struct stat stDirInfo;
struct dirent * stFiles;
DIR * stDirIn;
char szFullName[MAX_PATH_LEN];
char szDirectory[MAX_PATH_LEN];
struct stat stFileInfo;
int numOfFile = 0;
strncpy( szDirectory, dir, MAX_PATH_LEN - 1 );
if (lstat( szDirectory, &stDirInfo) < 0)
{
perror (szDirectory);
return 0;
}
if (!S_ISDIR(stDirInfo.st_mode))
return 0;
if ((stDirIn = opendir( szDirectory)) == NULL)
{
perror( szDirectory );
return 0;
}
while (( stFiles = readdir(stDirIn)) != NULL)
{
if (!strcmp(stFiles->d_name, ".") || !strcmp(stFiles->d_name, ".."))
continue;
sprintf(szFullName, "%s/%s", szDirectory, stFiles -> d_name );
if (lstat(szFullName, &stFileInfo) < 0)
perror ( szFullName );
/* is the file a directory? */
if (S_ISREG(stFileInfo.st_mode))
{
printf( "Filename: %s\n", szFullName );
numOfFile++;
}
} // end while
closedir(stDirIn);
return numOfFile;
}
// Count words in files.
unsigned long count_words_in_file(const char *const filename)
{
unsigned long count = 0UL;
int errnum = 0;
int c;
FILE *in;
in = fopen(filename, "rt");
if (in == NULL) {
errnum = errno;
fprintf(stderr, "%s: %s.\n", filename, strerror(errnum));
errno = errnum;
return 0UL;
}
/* Skip leading whitespace. */
do {
c = getc(in);
} while (isspace(c));
/* Token loop. */
while (c != EOF) {
/* This token is a word, if it starts with a letter. */
if (isalpha(c))
count++;
/* Skip the rest of this token. */
while (!isspace(c) && c != EOF)
c = getc(in);
/* Skip the trailing whitespace. */
while (isspace(c))
c = getc(in);
}
/* Paranoid checking for I/O errors. */
if (!feof(in) || ferror(in)) {
fclose(in);
fprintf(stderr, "Warning: %s: %s.\n", filename, strerror(EIO));
errnum = EIO;
} else
if (fclose(in)) {
fprintf(stderr, "Warning: %s: %s.\n", filename, strerror(EIO));
errnum = EIO;
}
errno = errnum;
return count;
}
// Recursively go in folders
int nftw_callback(const char *filepath, const struct stat *sb, int typeflag, struct FTW *ftwbuf)
{
// Directory
if (typeflag == FTW_DP || typeflag == FTW_D)
{
total_dirs++;
printf("%*s%s\n", ftwbuf->level * 4, "", filepath);
//countInEveryFolder(filepath);
}
// Folder
else if (typeflag == FTW_F)
{
total_files++;
total_words += count_words_in_file(filepath);
printf("%*s%s\n", ftwbuf->level * 4, "", filepath);
}
return 0;
}
/* Error message */
void err_sys(const char *msg)
{
perror(msg);
fflush(stdout);
exit(EXIT_FAILURE);
}
int main(int argc, char *argv[])
{
total_files = total_dirs = total_words = 0UL;
if (nftw(argv[1], nftw_callback, 15, FTW_PHYS) == 0) {
/* Success! */
printf("%s: %lu files, %lu directories, %lu words total.\n",
argv[1], total_files, total_dirs, total_words);
} else {
/* Failed... */
err_sys("ntfw");
}
putchar('\n');
//printf( "\nTotal words = %d\n\n", *wordCount);
//printf( "\nTotal folders = %d\n\n", *folderCount);
//printf( "\nTotal childs = %d\n\n", *childCount); //fork()
return 0;
}
To start I would write the program with two phases. A single-process phase in which all the file-paths are queued up (into a linked-list or dequeue), and a multi-process phase in which the worker processes receive work via their pipe() and send counts back to the main process via their pipe(). The main process would use select() to multiplex the input from its children.
Once you understand how to use select() with pipe()s, then work on having the filepath discovery be concurrent.
This design would be much easier to implement in Go, node.js, or greenlet with Python, but learning how to do it in C gives you a level of understanding for the underlying operations that you don't get with newer languages.

mmap is wiping my file instead of copying it

So I'm using mmap to then write to another file. But the weird thing is, when my code hits mmap, what it does is clears the file. So I have a file that's populated with random characters (AB, HAA, JAK, etc...). What it's supposed to do is use mmap as read basically and then write that file to the new file. So that first if (argc == 3) is the normal read and write, the second if (argc ==4) is supposed to use mmap. Does anyone have any idea why on Earth this is happening?
#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/io.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/resource.h>
int main(int argc, char const *argv[])
{
int nbyte = 512;
char buffer[nbyte];
unsigned char *f;
int bytesRead = 0;
int size;
int totalBuffer;
struct stat s;
const char * file_name = argv[1];
int fd = open (argv[1], O_RDONLY);
int i = 0;
char c;
int fileInput = open(argv[1], O_RDONLY);
int fileOutPut = open(argv[2], O_WRONLY | O_TRUNC | O_CREAT, S_IRUSR | S_IWUSR);
fstat(fileInput, &s);
size = s.st_size;
printf("%d\n", size);
if (argc == 3)
{
printf("size: %d\n", size);
printf("nbyte: %d\n", nbyte);
while (size - bytesRead >= nbyte)
{
read(fileInput, buffer, nbyte);
bytesRead += nbyte;
write(fileOutPut, buffer, nbyte);
}
read(fileInput, buffer, size - bytesRead);
write(fileOutPut, buffer, size - bytesRead);
}
else if (argc == 4)
{
int i = 0;
printf("4 arg\n");
f = (char *) mmap (0, size, PROT_READ, MAP_PRIVATE, fileInput, 0);
/* This is where it is being wipped */
}
close(fileInput);
close(fileOutPut);
int who = RUSAGE_SELF;
struct rusage usage;
int ret;
/* Get the status of the file and print some. Easy to do what "ls" does with fstat system call... */
int status = fstat (fd, & s);
printf("File Size: %d bytes\n",s.st_size);
printf("Number of Links: %d\n",s.st_nlink);
return 0;
}
EDIT: I wanted to mention that the first read and write works perfectly, it is only when you try to do it through the mmap.
If you mean it's clearing your destination file, then yes, that's exactly what your code will do.
It opens the destination with truncation and then, in your argc==4 section, you map the input file but do absolutely nothing to transfer the data to the output file.
You'll need a while loop of some description, similar to the one in the argc==3 case, but which writes the bytes in mapped memory to the fileOutput descriptor.

Resources