Suggestions for duplicate file finder algorithm (using C)

I wanted to write a program that tests whether two files are duplicates (have exactly the same content). First I check whether the files have the same size, and if they do, I start comparing their contents.
My first idea was to "split" the files into fixed-size blocks, then start a thread for every block, fseek to the starting offset of each block, and run the comparisons in parallel. When a comparison in one thread fails, the other working threads are cancelled, and the program exits the thread-spawning loop.
The code looks like this:
dupf.h
#ifndef NM_DUPF_H
#define NM_DUPF_H
#define NUM_THREADS 15
#define BLOCK_SIZE 8192
/* Thread argument structure */
struct thread_arg_s {
const char *name_f1; /* First file name */
const char *name_f2; /* Second file name */
int cursor; /* Where to seek in the file */
};
typedef struct thread_arg_s thread_arg;
/**
* 'arg' is of type thread_arg.
* Checks if the specified file blocks are
* duplicates.
*/
void *check_block_dup(void *arg);
/**
* Checks if two files are duplicates
*/
int check_dup(const char *name_f1, const char *name_f2);
/**
* Returns a valid pointer to a file.
* If the file (given by the path/name 'fname') cannot be opened
in 'mode', the program is interrupted and an error message is shown.
**/
FILE *safe_fopen(const char *fname, const char *mode);
#endif
dupf.c
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"
FILE *safe_fopen(const char *fname, const char *mode)
{
FILE *f = NULL;
f = fopen(fname, mode);
if (f == NULL) {
char emsg[255];
snprintf(emsg, sizeof(emsg), "FOPEN() %s\t", fname);
perror(emsg);
exit(-1);
}
return (f);
}
void *check_block_dup(void *arg)
{
const char *name_f1 = NULL, *name_f2 = NULL; /* File names */
FILE *f1 = NULL, *f2 = NULL; /* Streams */
int cursor = 0; /* Reading cursor */
char buff_f1[BLOCK_SIZE], buff_f2[BLOCK_SIZE]; /* Character buffers */
int rchars_1, rchars_2; /* Characters read */
/* Initializing variables from 'arg' */
name_f1 = ((thread_arg*)arg)->name_f1;
name_f2 = ((thread_arg*)arg)->name_f2;
cursor = ((thread_arg*)arg)->cursor;
/* Opening files */
f1 = safe_fopen(name_f1, "r");
f2 = safe_fopen(name_f2, "r");
/* Setup cursor in files */
fseek(f1, cursor, SEEK_SET);
fseek(f2, cursor, SEEK_SET);
/* Initialize buffers */
rchars_1 = fread(buff_f1, 1, BLOCK_SIZE, f1);
rchars_2 = fread(buff_f2, 1, BLOCK_SIZE, f2);
if (rchars_1 != rchars_2) {
/* fread failed to read the same portion.
* program cannot continue */
perror("ERROR WHEN READING BLOCK");
exit(-1);
}
while (rchars_1-->0) {
if (buff_f1[rchars_1] != buff_f2[rchars_1]) {
/* Different characters */
fclose(f1);
fclose(f2);
pthread_exit("notdup");
}
}
/* Close streams */
fclose(f1);
fclose(f2);
pthread_exit("dup");
}
int check_dup(const char *name_f1, const char *name_f2)
{
int num_blocks = 0; /* Number of 'blocks' to check */
int num_tsp = 0; /* Number of threads spawns */
int tsp_iter = 0; /* Iterator for threads spawns */
pthread_t *tsp_threads = NULL;
thread_arg *tsp_threads_args = NULL;
int tsp_threads_iter = 0;
int thread_c_res = 0; /* Thread creation result */
int thread_j_res = 0; /* Thread join res */
int loop_res = 0; /* Function result */
int cursor;
struct stat buf_f1;
struct stat buf_f2;
if (name_f1 == NULL || name_f2 == NULL) {
/* Invalid input parameters */
perror("INVALID FNAMES\t");
return (-1);
}
if (stat(name_f1, &buf_f1) != 0 || stat(name_f2, &buf_f2) != 0) {
/* Stat fails */
char emsg[255];
sprintf(emsg, "STAT() ERROR: %s %s\t", name_f1, name_f2);
perror(emsg);
return (-1);
}
if (buf_f1.st_size != buf_f2.st_size) {
/* Files have different sizes */
return (1);
}
/* Files have the same size, function exec. is continued */
num_blocks = (buf_f1.st_size / BLOCK_SIZE) + 1;
num_tsp = (num_blocks / NUM_THREADS) + 1;
cursor = 0;
for (tsp_iter = 0; tsp_iter < num_tsp; tsp_iter++) {
loop_res = 0;
/* Create threads array for this spawn */
tsp_threads = malloc(NUM_THREADS * sizeof(*tsp_threads));
if (tsp_threads == NULL) {
perror("TSP_THREADS ALLOC FAILURE\t");
return (-1);
}
/* Create arguments for every thread in the current spawn */
tsp_threads_args = malloc(NUM_THREADS * sizeof(*tsp_threads_args));
if (tsp_threads_args == NULL) {
perror("TSP THREADS ARGS ALLOCA FAILURE\t");
return (-1);
}
/* Initialize arguments and create threads */
for (tsp_threads_iter = 0; tsp_threads_iter < NUM_THREADS;
tsp_threads_iter++) {
if (cursor >= buf_f1.st_size) {
break;
}
tsp_threads_args[tsp_threads_iter].name_f1 = name_f1;
tsp_threads_args[tsp_threads_iter].name_f2 = name_f2;
tsp_threads_args[tsp_threads_iter].cursor = cursor;
thread_c_res = pthread_create(
&tsp_threads[tsp_threads_iter],
NULL,
check_block_dup,
(void*)&tsp_threads_args[tsp_threads_iter]);
if (thread_c_res != 0) {
perror("THREAD CREATION FAILURE");
return (-1);
}
cursor+=BLOCK_SIZE;
}
/* Join last threads and get their status */
while (tsp_threads_iter-->0) {
void *thread_res = NULL;
thread_j_res = pthread_join(tsp_threads[tsp_threads_iter],
&thread_res);
if (thread_j_res != 0) {
perror("THREAD JOIN FAILURE");
return (-1);
}
if (strcmp((char*)thread_res, "notdup")==0) {
loop_res++;
/* Closing other threads and exiting by condition
* from loop. */
while (tsp_threads_iter-->0) {
pthread_cancel(tsp_threads[tsp_threads_iter]);
}
}
}
free(tsp_threads);
free(tsp_threads_args);
if (loop_res > 0) {
break;
}
}
return (loop_res > 0) ? 1 : 0;
}
The function works fine (at least for what I've tested). Still, some guys from #C (freenode) suggested that the solution is overly complicated, and that it may perform poorly because of the parallel reads from the hard disk.
What I want to know:
Is the threaded approach inherently flawed?
Is fseek() really that slow?
Is there a way to map the files to memory and then compare them?
LATER EDIT:
Today I had some time, and I've followed your advice. You were right: this threaded version actually performs worse than a single-threaded version, and all because of the parallel reads from the hard disk.
Another thing is that I've written a function that uses mmap(), and so far it is the fastest one. Still, the biggest drawback of that function is that it fails when the files get really big.
Here is the new implementation (rough, direct code):
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"
/**
* Safely assures that a file is opened.
* If the file cannot be opened, the flow of the program is
* interrupted and the process exits with code -1.
**/
FILE *safe_fopen(const char *fname, const char *mode)
{
FILE *f = NULL;
f = fopen(fname, mode);
if (f == NULL) {
char emsg[1024];
snprintf(emsg, sizeof(emsg), "Cannot open file: %s\t", fname);
perror(emsg);
exit(-1);
}
return (f);
}
/**
* Check if two files have the same size.
* Returns:
* -1 Error.
* 0 If they have the same size.
* 1 If they don't have the same size.
**/
int check_same_size(const char *f1_name, const char *f2_name, off_t *f1_size, off_t *f2_size)
{
struct stat f1_stat, f2_stat;
if((f1_name == NULL) || (f2_name == NULL)){
fprintf(stderr, "Invalid filename passed to function [check_same_size].\n");
return (-1);
}
if((stat(f1_name, &f1_stat) != 0) || (stat(f2_name, &f2_stat) !=0)){
fprintf(stderr, "Cannot apply stat. [check_same_size].\n");
return (-1);
}
if(f1_size != NULL){
*f1_size = f1_stat.st_size;
}
if(f2_size != NULL){
*f2_size = f2_stat.st_size;
}
return (f1_stat.st_size == f2_stat.st_size) ? 0 : 1;
}
/**
* Test if two files are duplicates.
* Returns:
* -1 Error.
* 0 If they are duplicates.
* 1 If they are not duplicates.
**/
int check_dup_plain(char *f1_name, char *f2_name, int block_size)
{
if ((f1_name == NULL) || (f2_name == NULL)){
fprintf(stderr, "Invalid filename passed to function [check_dup_plain].\n");
return (-1);
}
FILE *f1 = NULL, *f2 = NULL;
char f1_buff[block_size], f2_buff[block_size];
size_t rch1, rch2;
if(check_same_size(f1_name, f2_name, NULL, NULL) == 1){
return (1);
}
f1 = safe_fopen(f1_name, "r");
f2 = safe_fopen(f2_name, "r");
while(!feof(f1) && !feof(f2)){
rch1 = fread(f1_buff, 1, block_size, f1);
rch2 = fread(f2_buff, 1, block_size, f2);
if(rch1 != rch2){
fprintf(stderr, "Invalid reading from file. Cannot continue. [check_dup_plain].\n");
fclose(f1);
fclose(f2);
return (-1);
}
while(rch1-->0){
if(f1_buff[rch1] != f2_buff[rch1]){
fclose(f1);
fclose(f2);
return (1);
}
}
}
fclose(f1);
fclose(f2);
return (0);
}
/**
* Test if two files are duplicates.
* Returns:
* -1 Error.
* 0 If they are duplicates.
* 1 If they are not duplicates.
**/
int check_dup_memmap(char *f1_name, char *f2_name)
{
struct stat f1_stat, f2_stat;
char *f1_array = NULL, *f2_array = NULL;
off_t f1_size, f2_size;
int f1_des, f2_des, cont, res;
if((f1_name == NULL) || (f2_name == NULL)){
fprintf(stderr, "Invalid filename passed to function [check_dup_memmap].\n");
return (-1);
}
if(check_same_size(f1_name, f2_name, &f1_size, &f2_size) == 1){
return (1);
}
f1_des = open(f1_name, O_RDONLY);
f2_des = open(f2_name, O_RDONLY);
if((f1_des == -1) || (f2_des == -1)){
perror("Cannot open file");
exit(-1);
}
f1_array = mmap(0, f1_size * sizeof(*f1_array), PROT_READ, MAP_SHARED, f1_des, 0);
if(f1_array == MAP_FAILED){
fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
return (-1);
}
f2_array = mmap(0, f2_size * sizeof(*f2_array), PROT_READ, MAP_SHARED, f2_des, 0);
if(f2_array == MAP_FAILED){
fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
return (-1);
}
cont = f1_size;
res = 0;
while(cont-->0){
if(f1_array[cont]!=f2_array[cont]){
res = 1;
break;
}
}
munmap((void*) f1_array, f1_size * sizeof(*f1_array));
munmap((void*) f2_array, f2_size * sizeof(*f2_array));
close(f1_des);
close(f2_des);
return res;
}
int main(int argc, char *argv[])
{
printf("result: %d\n",check_dup_memmap("f2","f1"));
return (0);
}
I am planning now to extend this code by re-adding the threaded functionality, but this time the reading will be done from memory.
Thanks for your answers.

The limiting factor will be disk reads, which (assuming that both files are on the same disk) will be serialized anyway, so I don't think threading will help much at all.

You could probably simplify your code greatly by using hashes instead of doing a byte-by-byte comparison. Assuming you're not doing anything destructive, like deleting files, an MD5 or similar hash function should be plenty. Boost provides quite a few, and they're usually pretty fast.
if fileA.size == fileB.size
    if fileA.hash() == fileB.hash()
        flag(fileA, fileB, same);
I wouldn't delete files after that comparison, but it's plenty safe to move them to a temporary directory for further review or just build a list of possible duplicates.
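For a single pair this reads just as much data as a direct compare, but when scanning many files it lets you compare any number of candidates by their digests. Here is a minimal sketch in C, assuming OpenSSL's legacy-but-ubiquitous MD5 API and linking with -lcrypto; the helper name file_md5 is invented for illustration:

#include <openssl/md5.h>
#include <stdio.h>

/* Hash one whole file into 'digest' (MD5_DIGEST_LENGTH is 16 bytes).
 * Returns 0 on success, -1 if the file cannot be read. */
static int file_md5(const char *name, unsigned char digest[MD5_DIGEST_LENGTH])
{
    unsigned char buf[8192];
    size_t n;
    MD5_CTX ctx;
    FILE *f = fopen(name, "rb");
    if (f == NULL)
        return -1;
    MD5_Init(&ctx);
    while ((n = fread(buf, 1, sizeof buf, f)) > 0)
        MD5_Update(&ctx, buf, n);
    MD5_Final(digest, &ctx);
    fclose(f);
    return 0;
}

Two files whose digests match (compare them with memcmp over MD5_DIGEST_LENGTH bytes) are overwhelmingly likely to be duplicates; as said above, confirm byte-by-byte before doing anything destructive.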

It's hard to guess about performance without a real system to test against (for example if you're using a solid state drive, there's no head seek time and the cost of reading different sectors from different threads is almost zero).
If this is running against a reasonably standard computer with regular (spinning platter) hard drives, having multiple threads contend for the part of the disk they want to read from will possibly slow things down (depending, again, on the hardware and also the size of the chunks).
If the time it takes to compute the "sameness" of a chunk is small compared to the time it takes to read that chunk from disk, having a separate thread will not help much, since the second (or third...) thread would spend most of its time waiting for IO to complete anyway.
Another factor is the cache size of the CPU. If all of the memory you're processing at one time fits in the CPU cache, things will be much faster than if different threads cause different chunks of memory to be loaded into cache as they execute instructions.
If you have more threads than you have CPU cores, you will just slow things down by making unnecessary context switches (since a thread needs a core to run on).
After reading all of that, if you still think multithreading is going to help for your target system, consider one thread that does IO only, places the data in a queue, and has two or more worker threads taking data off of the queue to process. That way, you optimize disk IO and can take advantage of multiple cores to crunch the numbers.
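For what it's worth, here is a rough, untested sketch of that layout applied to the duplicate check: one reader thread streams matching chunks from both (same-size) files into a small bounded queue, and worker threads compare them. The queue, put_chunk(), get_chunk() and the end-of-stream convention are all invented for illustration; a real version would also want early exit on the first mismatch and proper error handling.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

#define QCAP  4        /* queue capacity, in chunks */
#define CHUNK 65536    /* bytes read from each file per chunk */
#define NWORK 2        /* comparison (worker) threads */

struct chunk { char a[CHUNK], b[CHUNK]; size_t len; }; /* len==0 ends the stream */

static struct {
    struct chunk slot[QCAP];
    int head, tail, count;
    pthread_mutex_t mtx;
    pthread_cond_t not_full, not_empty;
} q = { .mtx = PTHREAD_MUTEX_INITIALIZER, .not_full = PTHREAD_COND_INITIALIZER,
        .not_empty = PTHREAD_COND_INITIALIZER };

static int mismatch;   /* written only while holding q.mtx */

static void put_chunk(const struct chunk *c)
{
    pthread_mutex_lock(&q.mtx);
    while (q.count == QCAP)
        pthread_cond_wait(&q.not_full, &q.mtx);
    q.slot[q.tail] = *c;               /* copied by value: simple, not cheap */
    q.tail = (q.tail + 1) % QCAP;
    q.count++;
    pthread_cond_signal(&q.not_empty);
    pthread_mutex_unlock(&q.mtx);
}

static void get_chunk(struct chunk *c)
{
    pthread_mutex_lock(&q.mtx);
    while (q.count == 0)
        pthread_cond_wait(&q.not_empty, &q.mtx);
    *c = q.slot[q.head];
    q.head = (q.head + 1) % QCAP;
    q.count--;
    pthread_cond_signal(&q.not_full);
    pthread_mutex_unlock(&q.mtx);
}

static void set_mismatch(void)
{
    pthread_mutex_lock(&q.mtx);
    mismatch = 1;
    pthread_mutex_unlock(&q.mtx);
}

static void *worker(void *arg)
{
    struct chunk c;
    (void)arg;
    for (;;) {
        get_chunk(&c);
        if (c.len == 0) {              /* end marker: hand it on, then quit */
            put_chunk(&c);
            return NULL;
        }
        if (memcmp(c.a, c.b, c.len) != 0)
            set_mismatch();
    }
}

static void *reader(void *arg)         /* the single thread that does IO */
{
    FILE **fp = arg;
    struct chunk c;
    do {
        c.len = fread(c.a, 1, CHUNK, fp[0]);
        if (fread(c.b, 1, CHUNK, fp[1]) != c.len)
            set_mismatch();            /* same-size files should read in step */
        put_chunk(&c);
    } while (c.len == CHUNK);
    if (c.len != 0) {                  /* short last chunk: still emit marker */
        c.len = 0;
        put_chunk(&c);
    }
    return NULL;
}

int main(int argc, char *argv[])
{
    FILE *fp[2];
    pthread_t rd, wk[NWORK];
    int i;
    if (argc != 3 || (fp[0] = fopen(argv[1], "rb")) == NULL
                  || (fp[1] = fopen(argv[2], "rb")) == NULL) {
        fprintf(stderr, "usage: %s file1 file2 (files of equal size)\n", argv[0]);
        return 2;
    }
    pthread_create(&rd, NULL, reader, fp);
    for (i = 0; i < NWORK; i++)
        pthread_create(&wk[i], NULL, worker, NULL);
    pthread_join(rd, NULL);
    for (i = 0; i < NWORK; i++)
        pthread_join(wk[i], NULL);
    printf(mismatch ? "not duplicates\n" : "duplicates\n");
    return mismatch;
}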
Steve suggested you can memory map your files on Unix. That will speed up access to the underlying data by leveraging low-level OS functionality (the same kind used to manage swap files): the OS will handle loading the parts of the file you are working on into memory efficiently, as long as the file fits into the available address space. FYI, you can do the same thing on Windows.

Before even considering the performance effects of parallel disk reads and thread overhead and such...
Is there any reason to believe that scanning the files in chunks will find the differences any quicker than straight through? Is the data contained in the files predominantly in a certain format, and if so, is the splitting scheme tailored to it? If not, I don't see how scanning the files by skipping over every n bytes (which is all the multithreaded splitting is effectively doing) could offer any improvement over reading the bytes in the order they are on disk.
Think of the two limiting cases -- "splitting" the file into one block, and splitting the file into as many one-byte "blocks" as there are bytes in the file. Will either of those cases be more efficient than the other, or some in-between value? If there is no in-between value that you know you should optimize to, then you know nothing about how the data is stored in the files, so it should make no difference how you scan them.
Even if you set the split to match the disk's characteristics, such as its block size, you're still going to have to go back to read the next byte, which will likely be at an extremely non-optimal position. And in the end you're going to have to read every single byte in the file, no matter how you split it.

Because you're using pthreads, I assume you're working in a Unix environment -- in which case you could mmap(2) both files into memory and compare the memory arrays directly.

Well, there is the standard memory mapping mmap() function that maps a file to memory. You should be able to do something like
int fd1;
int fd2;
off_t size1;
off_t size2;
fd1 = open(name1, O_RDONLY);
size1 = lseek(fd1, 0, SEEK_END);
fd2 = open(name2, O_RDONLY);
size2 = lseek(fd2, 0, SEEK_END);
if ( size1 == size2 )
{
char * data1 = mmap(0, size1, PROT_READ, MAP_SHARED, fd1, 0);
char * data2 = mmap(0, size1, PROT_READ, MAP_SHARED, fd2, 0);
if ( data1 != MAP_FAILED && data2 != MAP_FAILED )
{
off_t i;
/* ...and this is, obviously, where you'd do something more clever */
for ( i = 0; i < size1 && data1[i] == data2[i]; i++ );
if ( i == size1 )
printf("Equal\n");
munmap(data1, size1);
munmap(data2, size1);
}
}
close(fd1);
close(fd2);
Other than that, yes, your solution looks overly complicated ;-) The threaded approach is not necessarily flawed, but you may well find that parallel access does not improve performance. For SAN drives or ramdisks it might help; for normal spinning-platter drives it might hurt. But simpler is usually better, unless you really have a performance issue.
Regarding fseek() versus other methods, it depends on the operating system you use. Google is your friend here; you can easily find articles, at least for Solaris and Linux.

Even if disk access were not the limiting factor (it will be), unless you have a multi-core processor that can hand different threads off to different cores, you would not see a speed-up from going multi-threaded. Basically, you have to compare all N bytes of the file one way or another, and if the threads execute on the same core, it will take the same amount of time as without threads.
There are some environments that could spread the workload across cores, but even so, the CPU will be able to process so much faster than the data can be pulled in from disk that the disk I/O system will be the limiting factor.

Related

Why can I not mmap /proc/self/maps?

To be specific: why can I do this:
FILE *fp = fopen("/proc/self/maps", "r");
char buf[513]; buf[512] = '\0';
while(fgets(buf, 512, fp) != NULL) printf("%s", buf);
but not this:
int fd = open("/proc/self/maps", O_RDONLY);
struct stat s;
fstat(fd, &s); // st_size = 0 -> why?
char *file = mmap(0, s.st_size /*or any fixed size*/, PROT_READ, MAP_PRIVATE, fd, 0); // gives EINVAL for st_size (because 0) and ENODEV for any fixed block
write(1, file, s.st_size);
I know that /proc files are not really files, but it seems to have some defined size and content for the FILE* version. Is it secretly generating it on-the-fly for read or something? What am I missing here?
EDIT:
As I can clearly read() from them, is there any way to get the number of bytes available in advance, or am I stuck reading until EOF?
They are created on the fly as you read them. Maybe this will help; it is a tutorial showing how a proc file can be implemented:
https://devarea.com/linux-kernel-development-creating-a-proc-file-and-interfacing-with-user-space/
tl;dr: you give it a name and read and write handlers, and that's it. Proc files are meant to be very simple to implement from the kernel developer's point of view. They do not behave like full-featured files, though.
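To give a flavor of what the tutorial builds, here is an untested kernel-side sketch, assuming a kernel new enough (5.6+) to use struct proc_ops; the entry name "demo" and the message are made up. The read handler generates its answer on the fly, which is exactly why such a file has no size up front:

#include <linux/fs.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/proc_fs.h>

/* The contents are produced at read time; nothing is stored anywhere. */
static ssize_t demo_read(struct file *f, char __user *buf, size_t len, loff_t *off)
{
    static const char msg[] = "generated on the fly\n";
    return simple_read_from_buffer(buf, len, off, msg, sizeof msg - 1);
}

static const struct proc_ops demo_ops = {
    .proc_read = demo_read,
};

static int __init demo_init(void)
{
    /* A name plus handlers really is all a proc entry needs. */
    proc_create("demo", 0444, NULL, &demo_ops);
    return 0;
}

static void __exit demo_exit(void)
{
    remove_proc_entry("demo", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");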
As for the bonus question, there doesn't seem to be a way to get the size up front; the only indication is EOF when reading.
proc "files" are not really files, they are just streams that can be read/written from, but they contain no pyhsical data in memory you can map to.
https://tldp.org/LDP/Linux-Filesystem-Hierarchy/html/proc.html
As already explained by others, /proc and /sys are pseudo-filesystems, consisting of data provided by the kernel, that does not really exist until it is read – the kernel generates the data then and there. Since the size varies, and really is unknown until the file is opened for reading, it is not provided to userspace at all.
It is not "unfortunate", however. The same situation occurs very often, for example with character devices (under /dev), pipes, FIFOs (named pipes), and sockets.
We can trivially write a helper function to read pseudofiles completely, using dynamic memory management. For example:
// SPDX-License-Identifier: CC0-1.0
//
#define _POSIX_C_SOURCE 200809L
#define _ATFILE_SOURCE
#define _GNU_SOURCE
#include <stdlib.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
/* For example main() */
#include <stdio.h>
/* Return a directory handle for a specific relative directory.
For absolute paths and paths relative to current directory, use dirfd==AT_FDCWD.
*/
int at_dir(const int dirfd, const char *dirpath)
{
if (dirfd == -1 || !dirpath || !*dirpath) {
errno = EINVAL;
return -1;
}
return openat(dirfd, dirpath, O_DIRECTORY | O_PATH | O_CLOEXEC);
}
/* Read the (pseudofile) contents to a dynamically allocated buffer.
For absolute paths and paths relative to current directory, use dirfd==AT_FDCWD.
You can safely initialize *dataptr=NULL,*sizeptr=0 for dynamic allocation,
or reuse the buffer from a previous call or e.g. getline().
Returns 0 with errno set if an error occurs. If the file is empty, errno==0.
In all cases, remember to free (*dataptr) after it is no longer needed.
*/
size_t read_pseudofile_at(const int dirfd, const char *path, char **dataptr, size_t *sizeptr)
{
char *data;
size_t size, have = 0;
ssize_t n;
int desc;
if (!path || !*path || !dataptr || !sizeptr) {
errno = EINVAL;
return 0;
}
/* Existing dynamic buffer, or a new buffer? */
size = *sizeptr;
if (!size)
*dataptr = NULL;
data = *dataptr;
/* Open pseudofile. */
desc = openat(dirfd, path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
if (desc == -1) {
/* errno set by openat(). */
return 0;
}
while (1) {
/* Need to resize buffer? */
if (have >= size) {
/* For pseudofiles, linear size growth makes most sense. */
size = (have | 4095) + 4097 - 32;
data = realloc(data, size);
if (!data) {
close(desc);
errno = ENOMEM;
return 0;
}
*dataptr = data;
*sizeptr = size;
}
n = read(desc, data + have, size - have);
if (n > 0) {
have += n;
} else
if (n == 0) {
break;
} else
if (n == -1) {
const int saved_errno = errno;
close(desc);
errno = saved_errno;
return 0;
} else {
close(desc);
errno = EIO;
return 0;
}
}
if (close(desc) == -1) {
/* errno set by close(). */
return 0;
}
/* Append zeroes - we know size > have at this point. */
if (size - have > 32)
memset(data + have, 0, 32);
else
memset(data + have, 0, size - have);
errno = 0;
return have;
}
int main(void)
{
char *data = NULL;
size_t size = 0;
size_t len;
int selfdir;
selfdir = at_dir(AT_FDCWD, "/proc/self/");
if (selfdir == -1) {
fprintf(stderr, "/proc/self/ is not available: %s.\n", strerror(errno));
exit(EXIT_FAILURE);
}
len = read_pseudofile_at(selfdir, "status", &data, &size);
if (errno) {
fprintf(stderr, "/proc/self/status: %s.\n", strerror(errno));
exit(EXIT_FAILURE);
}
printf("/proc/self/status: %zu bytes\n%s\n", len, data);
len = read_pseudofile_at(selfdir, "maps", &data, &size);
if (errno) {
fprintf(stderr, "/proc/self/maps: %s.\n", strerror(errno));
exit(EXIT_FAILURE);
}
printf("/proc/self/maps: %zu bytes\n%s\n", len, data);
close(selfdir);
free(data); data = NULL; size = 0;
return EXIT_SUCCESS;
}
The above example program opens a directory descriptor ("atfile handle") to /proc/self. (This way you do not need to concatenate strings to construct paths.)
It then reads the contents of /proc/self/status. If successful, it displays its size (in bytes) and its contents.
Next, it reads the contents of /proc/self/maps, reusing the previous buffer. If successful, it displays its size and contents as well.
Finally, the directory descriptor is closed as it is no longer needed, and the dynamically allocated buffer released.
Note that it is perfectly safe to do free(NULL), and also to discard the dynamic buffer (free(data); data=NULL; size=0;) between the read_pseudofile_at() calls.
Because pseudofiles are typically small, the read_pseudofile_at() uses a linear dynamic buffer growth policy. If there is no previous buffer, it starts with 8160 bytes, and grows it by 4096 bytes afterwards until sufficiently large. Feel free to replace it with whatever growth policy you prefer, this one is just an example, but works quite well in practice without wasting much memory.

Is there a way to read a text file byte by byte asynchronously with POSIX threads?

I am trying to read a file and copy its contents to another file asynchronously with POSIX threads in C. Assuming a file contains "aabbcc" and I have 4 threads, how can I copy "aabbcc" to another file with threads asynchronously in C? This part has been stuck in my head for the entire day. What I have done so far is shown below.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <fcntl.h>
#include <pthread.h>
#include <aio.h>
#include <math.h> //for ceil() and floor()
#include <sys/types.h>
#include <unistd.h>
#define FILE_SIZE 1024 //in bytes
//>cc code.c -o code.out -lrt -lpthread
//>./code.out
char alphabets[52] = {'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o',
'p','q','r','s','t','u','v','w','x','y','z',
'A','B','C','D','E','F','G','H','I','J','K','L','M','N','O',
'P','Q','R','S','T','U','V','W','X','Y','Z'};
long prepareInputFile(char* filename)
{
FILE *fp;
fp = fopen(filename, "w");
if(fp == NULL)
{
printf("Cannot write to input file\n");
return -1;
}
int index;
char str[FILE_SIZE];
int rand_size = (rand() % 1024)+1;
for(index = 0;index < rand_size;index++) /*Generate the file with random sizes in bytes*/
{
int num2 = (rand() % 52); /*Get a random char in char array*/
putc(alphabets[num2],fp); /*Write that char to the file pointed to by fp*/
}
putc('\n',fp);
fseek(fp, 0L, SEEK_END);
long size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
return size;
}
//Perform main operation inside this function
void *writeToFileAsync(void *src_file, void *dest_file,
void *thread, void *t_count, void *filesize)
{
int readfd, writefd;
struct aiocb aio_write, aio_read;
memset(&aio_read, 0, sizeof(aio_read));
aio_read.aio_fildes = readfd;
aio_read.aio_nbytes = (int)filesize/(int)t_count;
readfd = open((char *)src_file, O_RDONLY);
if(readfd < 0)
{
printf("Cannot open the file for reading\n");
}
memset(&aio_write, 0, sizeof(aio_write));
aio_read.aio_fildes = writefd;
aio_read.aio_nbytes = (int)filesize/(int)t_count;
writefd = open((void *)dest_file, O_CREAT | O_WRONLY);
if(writefd < 0)
{
printf("Cannot open the file for writing\n");
}
return;
}
int main(int argc, char *argv[])
{
int i,threadCount;
char sourcePath[100], destPath[100];
strcpy(sourcePath,argv[1]);
if(strcmp(sourcePath, "-") == 0)
{
getcwd(sourcePath, sizeof(sourcePath));
strcpy(sourcePath, strcat(sourcePath, "/source.txt"));
}
else
{
strcpy(sourcePath, strcat(sourcePath, "source.txt"));
}
printf("Source path is: %s\n", sourcePath);
strcpy(destPath,argv[2]);
if(strcmp(destPath, "-") == 0)
{
getcwd(destPath, sizeof(destPath));
strcpy(destPath, strcat(destPath, "/destination.txt"));
}
else
{
strcpy(destPath, strcat(destPath, "destination.txt"));
}
printf("Dest path is: %s\n", destPath);
threadCount = strtol(argv[3],NULL,10);
long file_size = prepareInputFile(sourcePath);
pthread_t threads[threadCount];
for(i=0;i<threadCount;i++)
{
pthread_create(&threads[i],NULL,(void *)writeToFileAsync, NULL);
}
return 0;
}
Any help would be appreciated.
It is unlikely that parallelizing this operation would help, as it is probably bound by I/O rather than CPU time, and copying this way will certainly not be faster than simply copying via system call.
However, if you wanted to do this, one method would be: map the input file into memory (with mmap() or the equivalent), create the destination buffer or memory-mapped file, divide the source and destination files into equal slices, and have each thread copy its slice of the file. You might use memcpy(), but a modern compiler can see what your loop is doing and optimize it.
Even this is not going to be as fast as reading or mapping the source file into a buffer, then writing it back out from the same buffer with write(). If all you need to do is copy the file to disk, you don’t need to copy the bytes at all. In fact, you might even be able to make a second link to the file on disk.
This would probably work best if the slices are aligned to page boundaries. Be very careful about having two threads write to the same cache line, as this causes false sharing and will hurt performance.
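For what it's worth, here is a minimal, untested sketch of that slicing scheme, assuming a regular, non-empty input file on a POSIX system; the thread count and slice layout are arbitrary choices:

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

#define NTHREADS 4

struct slice { const char *src; char *dst; size_t len; };

/* Each thread copies only its own, non-overlapping byte range. */
static void *copy_slice(void *arg)
{
    struct slice *s = arg;
    memcpy(s->dst, s->src, s->len);
    return NULL;
}

int main(int argc, char *argv[])
{
    struct stat st;
    pthread_t tid[NTHREADS];
    struct slice sl[NTHREADS];
    int i, in, out;
    if (argc != 3) {
        fprintf(stderr, "usage: %s infile outfile\n", argv[0]);
        return 2;
    }
    in = open(argv[1], O_RDONLY);
    out = open(argv[2], O_RDWR | O_CREAT | O_TRUNC, 0644);
    if (in == -1 || out == -1 || fstat(in, &st) == -1 ||
        ftruncate(out, st.st_size) == -1) {
        perror("setup");
        return 2;
    }
    const char *src = mmap(NULL, st.st_size, PROT_READ, MAP_PRIVATE, in, 0);
    char *dst = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE, MAP_SHARED, out, 0);
    if (src == MAP_FAILED || dst == MAP_FAILED) {
        perror("mmap");
        return 2;
    }
    /* Page-aligned slices, so no two threads ever share a cache line. */
    size_t chunk = (size_t)st.st_size / NTHREADS + 1;
    long page = sysconf(_SC_PAGESIZE);
    chunk = (chunk + page - 1) / page * page;
    for (i = 0; i < NTHREADS; i++) {
        size_t off = (size_t)i * chunk;
        if (off > (size_t)st.st_size)
            off = st.st_size;               /* keep pointers inside the maps */
        sl[i].src = src + off;
        sl[i].dst = dst + off;
        sl[i].len = (size_t)st.st_size - off < chunk ? (size_t)st.st_size - off : chunk;
        pthread_create(&tid[i], NULL, copy_slice, &sl[i]);
    }
    for (i = 0; i < NTHREADS; i++)
        pthread_join(tid[i], NULL);
    munmap((void *)src, st.st_size);
    munmap(dst, st.st_size);
    close(in);
    close(out);
    return 0;
}

Even so, per the first paragraph, expect this to lose to a plain single-threaded read()/write() loop on most hardware; the disk, not memcpy(), is the bottleneck.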

Filesystem VS Raw disk benchmarking in C

I am doing some benchmarking (on OS X) to see how the use of the file system influences bandwidth and so on. I am using concurrency in the hope of creating fragmentation in the FS.
However, it looks like using the FS is more efficient than raw disk accesses. Why?
Here is my code:
#include <pthread.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <stdlib.h>
#define NO_THREADS (2)
#define PACKET_SIZE (1024 * 4)
#define SIZE_TO_WRITE (1024 * 1024 * 1024)
void write_buffer(void *arg) {
int *p_start = arg;
int start = *p_start;
char buffer[PACKET_SIZE];
char path[50];
sprintf(path, "file%d", start);
int fd = open(path, O_CREAT | O_WRONLY | O_APPEND);
//int fd = open("/dev/rdisk0s4", O_WRONLY);
if (fd < 0) {
fprintf(stderr, "Cound not open.\n", stderr);
goto end;
}
//lseek(fd, start * SIZE_TO_WRITE, SEEK_SET);
int current;
for (current = start; current < start + SIZE_TO_WRITE; current += PACKET_SIZE) {
int i;
for (i = 0; i < PACKET_SIZE; ++i) {
buffer[i] = i + current;
}
if (PACKET_SIZE != write(fd, buffer, PACKET_SIZE)) {
fprintf(stderr, "Could not write packet %d properly.", current);
goto close;
}
}
fsync(fd);
close:
close(fd);
end:
pthread_exit(0);
}
void flush(void) {
fflush(stdout);
fflush(stderr);
}
int main(void) {
pthread_t threads[NO_THREADS];
int starts[NO_THREADS];
int i;
atexit(flush);
for (i = 0; i < NO_THREADS; ++i) {
starts[i] = i;
if(pthread_create(threads + i, NULL, (void *) &write_buffer, (void *)(starts + i))) {
fprintf(stderr, "Error creating thread no %d\n", i);
return EXIT_FAILURE;
}
}
for (i = 0; i < NO_THREADS; ++i) {
if(pthread_join(threads[i], NULL)) {
fprintf(stderr, "Error joining thread\n");
return EXIT_FAILURE;
}
}
puts("Done");
return EXIT_SUCCESS;
}
With the help of the FS, the 2 threads write the file in 31.33 seconds. Without it, the same work takes minutes...
When you use /dev/rdisk0s4 instead of /path/to/normal/file%d, for every write you perform the OS will issue a disk I/O. Even if that disk is an SSD, that means that the round-trip time is probably at least a few hundred microseconds on average. When you write to the file instead, the filesystem isn't actually issuing your writes to disk until later. The Linux man page describes this well:
A successful return from write() does not make any guarantee that data has been committed to disk. In fact, on some buggy implementations, it does not even guarantee that space has successfully been reserved for the data. The only way to be sure is to call fsync(2) after you are done writing all your data.
So, the data you wrote is being buffered by the filesystem, which only requires that a copy be made in memory -- this probably takes on the order of a few microseconds at most. If you want to do an apples-to-apples comparison, you should make sure you're doing synchronous I/O for both test cases. Even running fsync after the whole test is done will probably allow the filesystem to be much faster, since it will batch up the I/O into one continuous streaming write, which could be faster than what your test directly on the disk can achieve.
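For example, a sketch of the one-line change: O_SYNC is POSIX and makes each write() wait until the data has reached the device, like the raw-device case (on OS X, full durability reportedly needs fcntl(fd, F_FULLFSYNC) on top of this). Note the mode argument, which open() requires when O_CREAT is used:

#include <fcntl.h>

/* Synchronous variant of the benchmark's open() call: every write()
 * now blocks until the device has the data, so the filesystem test
 * and the raw /dev/rdisk0s4 test measure comparable work. */
int fd = open(path, O_CREAT | O_WRONLY | O_APPEND | O_SYNC, 0644);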
In general, writing good systems benchmarks is incredibly difficult, especially when you don't know a lot about the system you're trying to test. I'd recommend using an off-the-shelf Unix filesystem benchmarking toolkit if you want high quality results -- otherwise, you could spend literally a lifetime learning about performance pathologies of the OS and FS you're testing... not that that's a bad thing, if you're interested in it like I am :-)

How to keep track of how many read/write operations are performed...?

For class I was given this: "Develop a C program that copies an input file to an output file and counts the number of read/write operations." I know how to copy the input file to the output file, but I am not entirely sure how to keep track of how many read/write operations were performed. The program is supposed to repeat the copy using different buffer sizes and output a listing of the number of read/write operations performed with each buffer size. I am just not sure how to do the counting part. How could one go about doing this? Thank you in advance.
Here is my current code (updated):
#include <stdio.h>
#include "apue.h"
#include <fcntl.h>
#define BUFFSIZE 1
int main(void)
{
int n;
char buf[BUFFSIZE];
int input_file;
int output_file;
int readCount = 0;
int writeCount = 0;
input_file = open("test.txt", O_RDONLY);
if(input_file < 0)
{
printf("could not open file.\n");
}
output_file = creat("output.txt", FILE_MODE);
if(output_file < 0)
{
printf("error with output file.\n");
}
while((n = read(input_file, buf, BUFFSIZE)) > 0)
{
readCount++;
if(write(output_file, buf, n) == n){
writeCount++;
}else{
printf("Error writing");
}
}
if(n < 0)
{
printf("reading error");
}
printf("read/write count: %d\n", writeCount + readCount);
printf("read = %d\n", readCount);
printf("write = %d\n", writeCount);
}
And for the text file: test one two
The result is:
read/write count: 26
read = 13
write = 13
Process returned 0 (0x0) execution time : 0.003 s
Press ENTER to continue.
I was thinking that the write count would be 12... but I am not sure...
You will need to increment a variable every time you call a function that does reading or writing. You may do that by making a function that wraps the standard i/o function.
For example, replace fread with something like this:
size_t fread_count(void *p, size_t size, size_t num, FILE *f){
iocount++;
return fread(p, size, num, f);
}
iocount would have to be in scope (e.g. global)
If you need to count reads and writes separately, use separate variables.
One that you increment for reads and one that you increment for writes.
-edit-
Since you are using write() and read(), you could easily make an equivalent
function like the one above, but using write and read instead of fwrite and fread.
To help with trying different buffer sizes (a sketch follows below):
1) put the open/read/write/close calls, the char buffer[], the read/write counters, the final printf for the counters, etc. into a separate function
2) in main(), add a table that contains the buffer sizes to be tried
3) call the new function from main(), passing a parameter that indicates the buffer size to use
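A minimal sketch of those three steps, using the same raw read()/write() calls as the question but plain 0644 instead of apue.h's FILE_MODE (copy_count and the sizes table are invented names):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/* Step 1: one function owns the buffer, the copy loop and the counters. */
static void copy_count(const char *src, const char *dst, size_t bufsize)
{
    char *buf = malloc(bufsize);
    int in = open(src, O_RDONLY);
    int out = open(dst, O_WRONLY | O_CREAT | O_TRUNC, 0644);
    long reads = 0, writes = 0;
    ssize_t n;
    if (buf == NULL || in == -1 || out == -1) {
        perror("copy_count setup");
        exit(1);
    }
    while ((n = read(in, buf, bufsize)) > 0) {
        reads++;                      /* one counter bump per syscall */
        if (write(out, buf, n) == n)
            writes++;
    }
    printf("buffer %6zu: %ld reads, %ld writes\n", bufsize, reads, writes);
    free(buf);
    close(in);
    close(out);
}

int main(void)
{
    /* Step 2: the table of buffer sizes to try. */
    size_t sizes[] = { 1, 16, 256, 4096 };
    size_t i;
    /* Step 3: run the copy once per buffer size. */
    for (i = 0; i < sizeof sizes / sizeof sizes[0]; i++)
        copy_count("test.txt", "output.txt", sizes[i]);
    return 0;
}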

Segmentation fault (core dumped)

I'm writing a program in C that basically copies files, but I'm getting this error: Segmentation fault (core dumped). From what I've been reading, I think it's because I'm trying to access memory that hasn't been allocated yet. I'm a newbie when it comes to C and I suck at pointers, so I was wondering if you guys could tell me which pointer is causing this and how to fix it if possible. By the way, this program is supposed to be a daemon, but I haven't put anything inside the infinite while loop at the bottom.
Here is my code:
#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <errno.h>
#include <unistd.h>
#include <syslog.h>
#include <string.h>
#include <dirent.h>
int main(int c, char *argv[]) {
char *source, *destination;
char *list1[30], *list2[30], *listDif[30];
unsigned char buffer[4096];
int i=0, x=0, sizeSource=0, sizeDest=0, sizeDif=0;
int outft, inft,fileread;
int sleeper;
struct dirent *ent, *ent1;
//Check number of arguments
if(c<3)
{
printf("Daemon wrongly called\n");
printf("How to use: <daemon name> <orginDirectory> <destinationDirectory> \n");
printf("or : <daemon name> <orginDirectory> <destinationDirectory> <sleeperTime(seconds)>");
return 0;
}
//Checks if sleeper time is given or will be the default 5minutes
/*if(c=4)
{
char *p;
errno = 0;
long conv = strtol(argv[3], &p, 10);
if(errno != 0 || *p != '\0')
{
printf("Number given for sleeper incorrect, it has to be an integer value.\n");
return(0);
} else
{
sleeper = conv;
}
} else
{
sleeper = 300;
}*/
//Get path of directories from arguments
source = argv[1];
destination = argv[2];
//Check if directories exist
DIR* dirSource = opendir(source);
if (!dirSource)
{
printf("Source directory incorrect\n");
return 0;
}
DIR* dirDest = opendir(destination);
if (!dirDest)
{
printf("Destination directory incorrect\n");
return 0;
}
/* save all the files and directories within directory */
while ((ent = readdir (dirSource)) != NULL) {
list1[sizeSource] = strdup(ent->d_name);
sizeSource++;
if(sizeSource>=30){break;}
}
closedir(dirSource);
while((ent1 = readdir (dirDest)) != NULL) {
list2[sizeDest] = strdup(ent1->d_name);
sizeDest++;
if(sizeDest>=30){break;}
}
closedir(dirDest);
/* Verify the differences between the directories and save them */
int z;
int dif = 0; //0 - False | 1 - True
printf("Diferenças:\n");
for(i=0;i<sizeSource;i++){
dif = 0;
for(z=0;z<sizeDest;z++){
if(strcmp(list1[i],list2[z])==0){ //If there is no match, it saves the name of the file to listDif[]
dif = 1;
break;
}
}
if(dif==0) {
printf("%s\n",list1[i]);
listDif[sizeDif] = list1[i];
sizeDif++;
}
}
/* This code will copy the files */
z=0;
while(z!=sizeDif){
// output file opened or created
char *pathSource, *pathDest;
strcpy(pathSource, source);
strcat(pathSource, "/");
strcat(pathSource, listDif[z]);
strcpy(pathDest, destination);
strcat(pathDest, "/");
strcat(pathDest, listDif[z]);
// output file opened or created
if((outft = open(pathDest, O_CREAT | O_APPEND | O_RDWR))==-1){
perror("open");
}
// lets open the input file
inft = open(pathSource, O_RDONLY);
if(inft >0){ // there are things to read from the input
fileread = read(inft, buffer, sizeof(buffer));
printf("%s\n", buffer);
write(outft, buffer, fileread);
close(inft);
}
close(outft);
z++;
}
/* Our process ID and Session ID */
pid_t pid, sid;
/* Fork off the parent process */
pid = fork();
if (pid < 0) {
exit(EXIT_FAILURE);
}
/* If we got a good PID, then
we can exit the parent process. */
if (pid > 0) {
exit(EXIT_SUCCESS);
}
/* Change the file mode mask */
umask(0);
/* Open any logs here */
/* Create a new SID for the child process */
sid = setsid();
if (sid < 0) {
/* Log the failure */
exit(EXIT_FAILURE);
}
/* Change the current working directory */
if ((chdir("/")) < 0) {
/* Log the failure */
exit(EXIT_FAILURE);
}
/* Close out the standard file descriptors */
close(STDIN_FILENO);
close(STDOUT_FILENO);
close(STDERR_FILENO);
/* Daemon-specific initialization goes here */
/* The Big Loop */
while (1) {
//sleep(5); /* wait 5 seconds */
}
exit(EXIT_SUCCESS);
}
The result of ls is:
ubuntu#ubuntu:~/Desktop$ ls
Concatenar_Strings.c core D2 daemon.c examples.desktop
Concatenar_Strings.c~ D1 daemon daemon.c~ ubiquity.desktop
D1 and D2 are folders, and in D1 are three text documents that I want to copy into D2.
One other question: is this a delayed error or an immediate one? Because I doubt this message would appear on a code line that only involves two integers.
Thanks in advance guys.
This loop is wrong:
while ((ent = readdir (dirSource)) != NULL) {
list1[sizeSource] = ent->d_name;
Probably, ent points to the same memory block every time, and the readdir function updates it. So when you save that pointer, you end up with your list containing invalid pointers (probably end up all pointing to the same string). Further, the string may be deallocated once you got to the end of the directory.
If you want to use the result of readdir after closing the directory or after calling readdir again you will need to take a copy of the data. In this case you can use strdup and it is usually good style to free the string at the end of the operation.
This may or may not have been the cause of your segfault. Another thing to check is that you should break out of your loops if sizeSource or sizeDest hits 30.
In the strcmp loop, you should really set dif = 0 at the start of the i loop, instead of in an else block.
Update: (more code shown by OP)
char *pathSource, *pathDest;
strcpy(pathSource, source);
You are copying to a wild pointer, which is a likely cause of segfaults. strcpy does not allocate any memory, it expects that you have already allocated enough.
One possible fix would be:
char pathSource[strlen(source) + 1 + strlen(listDif[z]) + 1];
sprintf(pathSource, "%s/%s", source, listDif[z]);
Alternatively (without using VLA):
char pathSource[MAX_PATH]; // where MAX_PATH is some large number
snprintf(pathSource, MAX_PATH, "%s/%s", source, listDif[z]);
Do the same thing for pathDest.
NB. Consider moving the closedir lines up to after the readdir loops; generally speaking you should open and close a resource as close as possible to the times you start and finish using them respectively; this makes your code easier to maintain.
