Related
I'm new to C and I made this file sorter that basically traverses through a directory and prints out a sorted list of all the files in it based on size.
#define MAXFILE 10000
struct fileList
{
char fName[256];
unsigned int fileSize;
};
void sorter...
//Sorts files
void rec_tav(char *path, struct fileList f_arr[MAXFILE])
{
DIR* dir;
struct dirent *ent;
struct stat sb;
char newPath[256];
dir = opendir(path);
if(!dir){
return;
}
while((ent = readdir(dir)) != NULL){
if(strcmp(ent->d_name,".") == 0 || strcmp(ent->d_name,"..") == 0)
continue;
strcpy(newPath,path);
strcat(newPath,"/");
strcat(newPath,ent->d_name);
stat(newPath, &sb);
if((S_ISREG(sb.st_mode)) != 0 && (stat(newPath,&sb))==0){
strcpy(f_arr[fileCount].fName,newPath);
f_arr[fileCount].fileSize = sb.st_size;
fileCount++;
}
rec_tav(newPath,f_arr);
}
closedir(dir);
}
/*
 * Entry point: collects every regular file below the directory named by
 * argv[1], sorts the records with sorter(), and prints "size<TAB>path" lines.
 *
 * Returns 0 on success, 1 if no directory argument was supplied.
 */
int main(int argc,char **argv)
{
    /* static storage: MAXFILE records of 260 bytes are too large to put on the stack safely. */
    static struct fileList file_array[MAXFILE];

    if (argc < 2) {
        fprintf(stderr, "usage: %s <directory>\n", argc > 0 ? argv[0] : "filesort");
        return 1;
    }

    rec_tav(argv[1], file_array);
    sorter(file_array);
    for (int i = 0; i < fileCount; i++) {
        /* fileSize is unsigned int, so the matching conversion is %u. */
        printf("%u\t%s\n", file_array[i].fileSize, file_array[i].fName);
    }
    return 0;
}
It works, except that I have to define a maximum file count. How can I alter it so that it works dynamically based on the file count? I'm assuming that an array of structs won't work anymore, but I don't know what I should use instead.
So the idiomatic way would be to change the signature of your function to something like int rec_tav(char *path, fileList **tab); so the return value would be the number of entries, and *tab would be the array of entries.
Inside the function, you might have code like this where you add one entry:
/* Grow the array by one element. The result goes into a temporary so the
   old pointer in *tab is not lost if realloc fails. */
fileList *temp;
temp = realloc(*tab, (fileCount+1) * sizeof *temp);
if (temp == NULL) {
/* realloc failed, so the old block is still allocated: release it and report the error.
   NOTE(review): *tab still holds the freed pointer after this -- confirm callers do not reuse it. */
free(*tab);
return -1;
}
*tab = temp;
/* Fill the newly added last slot with the path and size, then count it. */
strcpy((*tab)[fileCount].fName,newPath);
(*tab)[fileCount].fileSize = sb.st_size;
fileCount++;
Although you might want to grow it by chunks, then shrink it back at the end.
Your recursive step only adds a small problem, and since you weren't dealing with it anyways, we can just add a small bit here:
/* Collect the entries of a subdirectory into a separate array, then append
   them to the caller's array. */
int n;
fileList *newtab = NULL;
/* NOTE(review): the argument is `path`; presumably the subdirectory path is meant -- confirm.
   A negative n (error from the recursive call) is not handled here. */
n = rec_tav(path, &newtab);
if (n > 0) {
fileList *temp;
/* Make room for the n new entries after the fileCount existing ones. */
temp = realloc(*tab, (n+fileCount) * sizeof *temp);
if (temp == NULL) {
/* On failure both the existing array and the sub-array are released. */
free(*tab);
free(newtab);
return -1;
}
/* Copy the sub-array behind the existing entries; the sub-array is then no longer needed. */
memcpy(temp + fileCount, newtab, n*sizeof *temp);
*tab = temp;
free(newtab);
fileCount += n;
}
I need to recursively list all directories and files in C programming. I have looked into FTW but that is not included with the 2 operating systems that I am using (Fedora and Minix). I am starting to get a big headache from all the different things that I have read over the past few hours.
If somebody knows of a code snippet I could look at that would be amazing, or if anyone can give me good direction on this I would be very grateful.
Why does everyone insist on reinventing the wheel again and again?
POSIX.1-2008 standardized the nftw() function, also defined in the Single Unix Specification v4 (SuSv4), and available in Linux (glibc, man 3 nftw), OS X, and most current BSD variants. It is not new at all.
Naïve opendir()/readdir()/closedir() -based implementations almost never handle the cases where directories or files are moved, renamed, or deleted during the tree traversal, whereas nftw() should handle them gracefully.
As an example, consider the following C program that lists the directory tree starting at the current working directory, or at each of the directories named on the command line, or just the files named at the command line:
/* We want POSIX.1-2008 + XSI, i.e. SuSv4, features */
#define _XOPEN_SOURCE 700
/* Added on 2017-06-25:
If the C library can support 64-bit file sizes
and offsets, using the standard names,
these defines tell the C library to do so. */
#define _LARGEFILE64_SOURCE
#define _FILE_OFFSET_BITS 64
#include <stdlib.h>
#include <unistd.h>
#include <ftw.h>
#include <time.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>
/* POSIX.1 says each process has at least 20 file descriptors.
* Three of those belong to the standard streams.
* Here, we use a conservative estimate of 15 available;
* assuming we use at most two for other uses in this program,
* we should never run into any problems.
* Most trees are shallower than that, so it is efficient.
* Deeper trees are traversed fine, just a bit slower.
* (Linux allows typically hundreds to thousands of open files,
* so you'll probably never see any issues even if you used
* a much higher value, say a couple of hundred, but
* 15 is a safe, reasonable value.)
*/
#ifndef USE_FDS
#define USE_FDS 15
#endif
/* Print a byte count, scaled to the largest binary unit it reaches.
   The value is a double, so very large sizes are not exact. */
static void print_size(const double bytes)
{
    if (bytes >= 1099511627776.0) {
        printf(" %9.3f TiB", bytes / 1099511627776.0);
        return;
    }
    if (bytes >= 1073741824.0) {
        printf(" %9.3f GiB", bytes / 1073741824.0);
        return;
    }
    if (bytes >= 1048576.0) {
        printf(" %9.3f MiB", bytes / 1048576.0);
        return;
    }
    if (bytes >= 1024.0) {
        printf(" %9.3f KiB", bytes / 1024.0);
        return;
    }
    printf(" %9.0f B ", bytes);
}

/* Print "path -> target" for a symbolic link.
   The target buffer starts at 1023 bytes and grows by 1024 until the link
   target fits. Returns 0 on success, ENOMEM if allocation fails, or the
   errno value left by a failed readlink(). */
static int print_symlink(const char *filepath)
{
    size_t capacity = 1023;

    for (;;) {
        char *target = malloc(capacity + 1);
        if (target == NULL)
            return ENOMEM;

        const ssize_t got = readlink(filepath, target, capacity);
        if (got == (ssize_t)-1) {
            const int saved_errno = errno;
            free(target);
            return saved_errno;
        }

        if (got < (ssize_t)capacity) {
            /* readlink() does not terminate the string itself. */
            target[got] = '\0';
            printf(" %s -> %s\n", filepath, target);
            free(target);
            return 0;
        }

        /* The buffer was filled completely, so the target may have been cut off: retry larger. */
        free(target);
        capacity += 1024;
    }
}

/* nftw() callback: prints one line per filesystem entry, made of the
   modification time (local time), the scaled size, and the path decorated
   according to the entry type.

   filepath  path of the entry as passed by nftw()
   info      stat data for the entry
   typeflag  one of the FTW_* type flags
   pathinfo  unused (pathinfo->base would give the offset of the file name)

   Returns 0 to continue the walk, or a nonzero errno-style value to stop it. */
int print_entry(const char *filepath, const struct stat *info,
                const int typeflag, struct FTW *pathinfo)
{
    struct tm stamp;

    (void)pathinfo;

    localtime_r(&(info->st_mtime), &stamp);
    printf("%04d-%02d-%02d %02d:%02d:%02d",
           stamp.tm_year+1900, stamp.tm_mon+1, stamp.tm_mday,
           stamp.tm_hour, stamp.tm_min, stamp.tm_sec);

    print_size((double)info->st_size);

    switch (typeflag) {
    case FTW_SL:
        return print_symlink(filepath);
    case FTW_SLN:
        printf(" %s (dangling symlink)\n", filepath);
        break;
    case FTW_F:
        printf(" %s\n", filepath);
        break;
    case FTW_D:
    case FTW_DP:
        printf(" %s/\n", filepath);
        break;
    case FTW_DNR:
        printf(" %s/ (unreadable)\n", filepath);
        break;
    default:
        printf(" %s (unknown)\n", filepath);
        break;
    }
    return 0;
}
/* Walk the tree rooted at dirpath, calling print_entry() for every entry.
   The walk uses at most USE_FDS descriptors and FTW_PHYS.

   Returns 0 on success and a nonzero errno value otherwise; errno is set to
   the same value. A NULL or empty dirpath yields EINVAL. */
int print_directory_tree(const char *const dirpath)
{
    /* Reject a missing or empty path before calling nftw(). */
    if (dirpath == NULL || dirpath[0] == '\0') {
        errno = EINVAL;
        return errno;
    }

    const int status = nftw(dirpath, print_entry, USE_FDS, FTW_PHYS);

    /* A non-negative status (0 or a value returned by the callback) becomes
       errno; a negative status leaves errno untouched. */
    if (status >= 0)
        errno = status;

    return errno;
}
/* Lists the current directory when no arguments are given; otherwise lists
   each named path in order. Stops at the first failure, printing the errno
   message to stderr and exiting with EXIT_FAILURE. */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        /* No arguments: walk the current working directory. */
        if (print_directory_tree(".") != 0) {
            fprintf(stderr, "%s.\n", strerror(errno));
            return EXIT_FAILURE;
        }
        return EXIT_SUCCESS;
    }

    for (int index = 1; index < argc; index++) {
        if (print_directory_tree(argv[index]) != 0) {
            fprintf(stderr, "%s.\n", strerror(errno));
            return EXIT_FAILURE;
        }
    }

    return EXIT_SUCCESS;
}
Most of the code above is in print_entry(). Its task is to print out each directory entry. In print_directory_tree(), we tell nftw() to call it for each directory entry it sees.
The only hand-wavy detail above is the decision on how many file descriptors one should let nftw() use. If your program uses at most two extra file descriptors (in addition to the standard streams) during the file tree walk, 15 is known to be safe (on all systems having nftw() and being mostly POSIX-compliant).
In Linux, you could use sysconf(_SC_OPEN_MAX) to find the maximum number of open files, and subtract the number you use concurrently with the nftw() call, but I wouldn't bother (unless I knew the utility would be used mostly with pathologically deep directory structures). Fifteen descriptors does not limit the tree depth; nftw() just gets slower (and might not detect changes in a directory if walking a directory deeper than 13 directories from that one, although the tradeoffs and general ability to detect changes vary between systems and C library implementations). Just using a compile-time constant like that keeps the code portable -- it should work not just on Linux, but on Mac OS X and all current BSD variants, and most other not-too-old Unix variants, too.
In a comment, Ruslan mentioned that they had to switch to nftw64() because they had filesystem entries that required 64-bit sizes/offsets, and the "normal" version of nftw() failed with errno == EOVERFLOW. The correct solution is to not switch to GLIBC-specific 64-bit functions, but to define _LARGEFILE64_SOURCE and _FILE_OFFSET_BITS 64. These tell the C library to switch to 64-bit file sizes and offsets if possible, while using the standard functions (nftw(), fstat(), et cetera) and type names (off_t etc.).
Here is a recursive version:
#include <unistd.h>
#include <sys/types.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>
/* Print the contents of directory `name` as an indented tree.
   Directories are shown as "[name]" and followed by their own contents,
   indented two further columns; everything else is shown as "- name".
   A directory that cannot be opened is silently skipped. */
void listdir(const char *name, int indent)
{
    DIR *stream = opendir(name);
    if (stream == NULL)
        return;

    for (struct dirent *item = readdir(stream); item != NULL; item = readdir(stream)) {
        /* Anything not reported as a directory is printed as a leaf. */
        if (item->d_type != DT_DIR) {
            printf("%*s- %s\n", indent, "", item->d_name);
            continue;
        }

        /* Skip the self and parent links so the walk does not loop. */
        if (!strcmp(item->d_name, ".") || !strcmp(item->d_name, ".."))
            continue;

        char child[1024];
        snprintf(child, sizeof(child), "%s/%s", name, item->d_name);
        printf("%*s[%s]\n", indent, "", item->d_name);
        listdir(child, indent + 2);
    }

    closedir(stream);
}
/* Print the tree below the current directory, starting at indent 0. */
int main(void)
{
    const char *start = ".";

    listdir(start, 0);
    return 0;
}
/* Decide whether the entry `name` inside directory `parent` should be
   descended into.

   Returns nonzero when "parent/name" exists and is a directory other than
   "." or "..". Returns 0 otherwise, including when the path cannot be
   stat()ed or memory for the joined path cannot be allocated. */
int is_directory_we_want_to_list(const char *parent, char *name) {
    if (!strcmp(".", name) || !strcmp("..", name))
        return 0;

    /* Heap buffer: the joined length is caller-controlled, so do not size a stack allocation by it. */
    const size_t len = strlen(parent) + strlen(name) + 2;
    char *path = malloc(len);
    if (path == NULL)
        return 0;
    snprintf(path, len, "%s/%s", parent, name);

    /* Only inspect st_mode when stat() succeeded; otherwise st_buf is uninitialized. */
    struct stat st_buf;
    const int is_dir = (stat(path, &st_buf) == 0) && S_ISDIR(st_buf.st_mode);

    free(path);
    return is_dir;
}
/* Print the name of every entry in directory `name`, one per line, and
   recurse into each subdirectory accepted by is_directory_we_want_to_list().

   Returns 0 on success, or -1 if `name` cannot be opened or memory for a
   child path cannot be allocated. A subdirectory that fails is skipped; it
   does not abort the listing of its parent. */
int list(const char *name) {
    DIR *dir = opendir(name);
    if (dir == NULL)
        return -1;

    int rc = 0;
    struct dirent *ent;
    while ((ent = readdir(dir)) != NULL) {
        char *entry_name = ent->d_name;
        printf("%s\n", entry_name);

        if (!is_directory_we_want_to_list(name, entry_name))
            continue;

        const size_t len = strlen(name) + strlen(entry_name) + 2;
        char *next = malloc(len);
        if (next == NULL) {
            rc = -1;
            break;
        }
        snprintf(next, len, "%s/%s", name, entry_name);
        /* Best effort: an unreadable subdirectory is skipped. */
        (void)list(next);
        free(next);
    }

    closedir(dir);
    return rc;
}
Header files worth being skimmed in this context: stat.h, dirent.h. Bear in mind that the code above isn't checking for any errors which might occur.
A completely different approach is offered by ftw defined in ftw.h.
As I mentioned in my comment, I believe a recursive approach to have two inherent flaws to this task.
The first flaw is the limit on open files. This limit imposes a limit on deep traversal. If there are enough sub-folders, the recursive approach will break. (See edit regarding stack overflow)
The second flaw is a bit more subtle. The recursive approach makes it very hard to test for hard links. If a folder tree is cyclic (due to hard links), the recursive approach will break (hopefully without a stack overflow). (See edit regarding hard links)
However, it is quite simple to avoid these issues by replacing recursion with a single file descriptor and linked lists.
I assume this isn't a school project and that recursion is optional.
Here's an example application.
Use a.out ./ to view folder tree.
I apologize for the macros and stuff... I usually use inline functions, but I thought it would be easier to follow the code if it was all in a single function.
#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
/*
 * Non-recursive directory tree listing with cycle protection.
 *
 * Folders waiting to be read are kept in a queue. Every entry found is
 * linked into an output list directly after its parent folder, and every
 * folder taken from the queue is remembered so that a folder whose inode was
 * already processed is not queued again. Once the queue is empty, the output
 * list is written to stderr, one path per line, and freed.
 *
 * argv[1] is the folder to list. With no argument, or an empty one, usage
 * text is printed and the program exits with status 0. Running out of
 * memory terminates the program with EXIT_FAILURE.
 */
int main(int argc, char const *argv[]) {
  /* print use instruction unless a non-empty folder name was given
     (an empty name would make the trailing-slash test read name[-1]) */
  if (argc < 2 || argv[1][0] == '\0') {
    fprintf(stderr,
            "\nuse:\n"
            " %s <directory>\n"
            "for example:\n"
            " %s ./\n\n",
            argv[0], argv[0]);
    exit(0);
  }
  /*************** a small linked list macro implementation ***************/
  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;
  /* an empty list is a node that points at itself */
#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }
  /* insert `node` right after `dest` (no trailing semicolon, so it is
     usable as a single statement) */
#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0)
  /* detach the first node after `list` into `var`, or NULL if the list is empty */
#define LIST_POP(list, var)                                                    \
  do {                                                                         \
    if ((list)->next == (list)) {                                              \
      var = NULL;                                                              \
    } else {                                                                   \
      var = (list)->next;                                                      \
      (list)->next = var->next;                                                \
      var->next->prev = var->prev;                                             \
    }                                                                          \
  } while (0)
  /*************** a record (file / folder) item type ***************/
  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all queued and processed folders (cyclic protection) */
    list_s folders;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;
  /* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))
  /* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue),                                 \
             .folders = LIST_INIT((name).folders),                             \
             .list = LIST_INIT((name).list)}
  /*************** the actual code ***************/
  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;
  /* initialize the root folder record and add it to the queue;
     +2 leaves room for an added '/' and the terminator */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  if (!pos) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);
  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);
    /* add record to the processed folder list */
    LIST_PUSH(&records.folders, &pos->folders);
    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;
    while ((entry = readdir(dir)) != NULL) {
      /* measure the name with strlen(): struct dirent has no d_namlen
         member on every platform */
      const size_t namelen = strlen(entry->d_name);
      /* create new item, copying its path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + namelen + 2);
      if (!item) {
        perror("malloc");
        closedir(dir);
        exit(EXIT_FAILURE);
      }
      *item = RECORD_INIT(*item);
      item->len = pos->len + namelen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, namelen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);
      /* unless it's a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;
      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;
      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;
      /* test for cyclic processing */
      list_s *t = records.folders.next;
      while (t != &records.folders) {
        if (NODE2RECORD(t, folders)->ino == item->ino) {
          /* we already processed this folder! */
          break; /* this breaks from the small loop... */
        }
        t = t->next;
      }
      if (t != &records.folders)
        continue; /* if we broke from the small loop, entry is done */
      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }
  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect and process record */
    pos = NODE2RECORD(tmp, list);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    /* free node */
    free(pos);
  }
  return 0;
}
EDIT
#Stargateur mentioned in the comments that the recursive code will probably overflow the stack before reaching the open file limit.
Although I don't see how a stack-overflow is any better, this assessment is probably correct as long as the process isn't close to the file limit when invoked.
Another point mentioned by #Stargateur in the comments was that the depth of the recursive code is limited by the maximum amount of sub-directories (64000 on the ext4 filesystem) and that hard links are extremely unlikely (since hard links to folders aren't allowed on Linux/Unix).
This is good news if the code is running on Linux (which it is, according to the question), so this issue isn't a real concern (unless running the code on macOS or, maybe, Windows)... although 64K subfolders in recursion might blow the stack wide open.
Having said that, the non-recursive option still has advantages, such as being able to easily add a limit to the number of items processed as well as being able to cache the result.
P.S.
According to the comments, here's a non-recursive version of the code that doesn't check for cyclic hierarchies. It's faster and should be safe enough to use on a Linux machine where hard links to folders aren't allowed.
#include <dirent.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
/*
 * Non-recursive directory tree listing (no cycle detection).
 *
 * Folders waiting to be read are kept in a queue, and every entry found is
 * linked into an output list directly after its parent folder. Once the
 * queue is empty, the output list is written to stderr, one path per line,
 * and freed.
 *
 * argv[1] is the folder to list. With no argument, or an empty one, usage
 * text is printed and the program exits with status 0. Running out of
 * memory terminates the program with EXIT_FAILURE.
 */
int main(int argc, char const *argv[]) {
  /* print use instruction unless a non-empty folder name was given
     (an empty name would make the trailing-slash test read name[-1]) */
  if (argc < 2 || argv[1][0] == '\0') {
    fprintf(stderr,
            "\nuse:\n"
            " %s <directory>\n"
            "for example:\n"
            " %s ./\n\n",
            argv[0], argv[0]);
    exit(0);
  }
  /*************** a small linked list macro implementation ***************/
  typedef struct list_s {
    struct list_s *next;
    struct list_s *prev;
  } list_s;
  /* an empty list is a node that points at itself */
#define LIST_INIT(name)                                                        \
  { .next = &name, .prev = &name }
  /* insert `node` right after `dest` (no trailing semicolon, so it is
     usable as a single statement) */
#define LIST_PUSH(dest, node)                                                  \
  do {                                                                         \
    (node)->next = (dest)->next;                                               \
    (node)->prev = (dest);                                                     \
    (node)->next->prev = (node);                                               \
    (dest)->next = (node);                                                     \
  } while (0)
  /* detach the first node after `list` into `var`, or NULL if the list is empty */
#define LIST_POP(list, var)                                                    \
  do {                                                                         \
    if ((list)->next == (list)) {                                              \
      var = NULL;                                                              \
    } else {                                                                   \
      var = (list)->next;                                                      \
      (list)->next = var->next;                                                \
      var->next->prev = var->prev;                                             \
    }                                                                          \
  } while (0)
  /*************** a record (file / folder) item type ***************/
  typedef struct record_s {
    /* this is a flat processing queue. */
    list_s queue;
    /* this will list all the completed items (siblings and such) */
    list_s list;
    /* unique ID */
    ino_t ino;
    /* name length */
    size_t len;
    /* name string */
    char name[];
  } record_s;
  /* take a list_s pointer and convert it to the record_s pointer */
#define NODE2RECORD(node, list_name)                                           \
  ((record_s *)(((uintptr_t)(node)) -                                          \
                ((uintptr_t) & ((record_s *)0)->list_name)))
  /* initializes a new record */
#define RECORD_INIT(name)                                                      \
  (record_s){.queue = LIST_INIT((name).queue), .list = LIST_INIT((name).list)}
  /*************** the actual code ***************/
  record_s records = RECORD_INIT(records);
  record_s *pos, *item;
  list_s *tmp;
  DIR *dir;
  struct dirent *entry;
  /* initialize the root folder record and add it to the queue;
     +2 leaves room for an added '/' and the terminator */
  pos = malloc(sizeof(*pos) + strlen(argv[1]) + 2);
  if (!pos) {
    perror("malloc");
    exit(EXIT_FAILURE);
  }
  *pos = RECORD_INIT(*pos);
  pos->len = strlen(argv[1]);
  memcpy(pos->name, argv[1], pos->len);
  if (pos->name[pos->len - 1] != '/')
    pos->name[pos->len++] = '/';
  pos->name[pos->len] = 0;
  /* push to queue, but also push to list (first item processed) */
  LIST_PUSH(&records.queue, &pos->queue);
  LIST_PUSH(&records.list, &pos->list);
  /* as long as the queue has items to be processed, do so */
  while (records.queue.next != &records.queue) {
    /* pop queued item */
    LIST_POP(&records.queue, tmp);
    /* collect record to process */
    pos = NODE2RECORD(tmp, queue);
    /* process the folder and add all folder data to current list */
    dir = opendir(pos->name);
    if (!dir)
      continue;
    while ((entry = readdir(dir)) != NULL) {
      /* measure the name with strlen(): struct dirent has no d_namlen
         member on every platform */
      const size_t namelen = strlen(entry->d_name);
      /* create new item, copying its path data and unique ID */
      item = malloc(sizeof(*item) + pos->len + namelen + 2);
      if (!item) {
        perror("malloc");
        closedir(dir);
        exit(EXIT_FAILURE);
      }
      *item = RECORD_INIT(*item);
      item->len = pos->len + namelen;
      memcpy(item->name, pos->name, pos->len);
      memcpy(item->name + pos->len, entry->d_name, namelen);
      item->name[item->len] = 0;
      item->ino = entry->d_ino;
      /* add item to the list, right after the `pos` item */
      LIST_PUSH(&pos->list, &item->list);
      /* unless it's a folder, we're done. */
      if (entry->d_type != DT_DIR)
        continue;
      /* test for '.' and '..' */
      if (entry->d_name[0] == '.' &&
          (entry->d_name[1] == 0 ||
           (entry->d_name[1] == '.' && entry->d_name[2] == 0)))
        continue;
      /* add folder marker */
      item->name[item->len++] = '/';
      item->name[item->len] = 0;
      /* item is a new folder, add to queue */
      LIST_PUSH(&records.queue, &item->queue);
    }
    closedir(dir);
  }
  /*************** Printing the results and cleaning up ***************/
  while (records.list.next != &records.list) {
    /* pop list item */
    LIST_POP(&records.list, tmp);
    /* collect and process record */
    pos = NODE2RECORD(tmp, list);
    fwrite(pos->name, pos->len, 1, stderr);
    fwrite("\n", 1, 1, stderr);
    /* free node */
    free(pos);
  }
  return 0;
}
Here is a simplified version that is recursive but uses much less stack space:
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>
#include <dirent.h>
/* Print `path` and, recursively, every entry below it, one full path per line.

   path  writable buffer holding the directory path; it is extended in place
         for each subdirectory and restored before returning.
   size  total capacity of the buffer in bytes.

   A directory that cannot be opened, or a child whose path would not fit in
   the buffer, is reported on stderr and skipped. */
void listdir(char *path, size_t size) {
    const size_t base = strlen(path);
    DIR *stream = opendir(path);

    if (stream == NULL) {
        fprintf(stderr, "path not found: %s: %s\n",
                path, strerror(errno));
        return;
    }

    puts(path);

    for (;;) {
        struct dirent *item = readdir(stream);
        if (item == NULL)
            break;

        char *name = item->d_name;

        /* Non-directories are printed and need no further work. */
        if (item->d_type != DT_DIR) {
            printf("%s/%s\n", path, name);
            continue;
        }

        if (strcmp(name, ".") == 0 || strcmp(name, "..") == 0)
            continue;

        /* Need room for '/', the name, and the terminator. */
        if (base + strlen(name) + 2 > size) {
            fprintf(stderr, "path too long: %s/%s\n", path, name);
            continue;
        }

        /* Append "/name", recurse, then cut the buffer back to this directory. */
        path[base] = '/';
        strcpy(path + base + 1, name);
        listdir(path, size);
        path[base] = '\0';
    }

    closedir(stream);
}
/* List everything below the current directory, using a 1024-byte path buffer. */
int main(void)
{
    char buffer[1024] = ".";

    listdir(buffer, sizeof buffer);
    return 0;
}
On my system, its output is exactly identical to that of find .
Walking a Directory Tree Without Constructing Path Names
This is a version that uses file descriptors to refer to directories, with fdopendir(), fstatat(), and openat() to walk a directory tree without having to construct any path names.
This is simpler to implement, and can be useful on systems with deeply-nested directory trees, where a full path name might exceed PATH_MAX - and note that PATH_MAX may not even exist.
The posted code is compressed, broken up, and all error checking removed to remove vertical scroll bars and improve readability. A complete example is at the end of the question.
Headers
#define _POSIX_C_SOURCE 200809L
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
Actual directory tree walk implementation:
// Walks the directory referred to by an open descriptor, printing a "dir:"
// line before descending into each subdirectory and a " file:" line for
// every entry. No call is checked for errors in this shortened version.
static int myftwImp( int dirfd )
{
    DIR *stream = fdopendir( dirfd );
    struct dirent *item;

    while ( ( item = readdir( stream ) ) != NULL )
    {
        const char *name = item->d_name;

        // skip the self and parent links
        if ( !strcmp( ".", name ) || !strcmp( "..", name ) )
        {
            continue;
        }

        // look the entry up relative to the directory descriptor, not by path
        struct stat info = { 0 };
        fstatat( dirfd, name, &info, 0 );

        if ( S_ISDIR( info.st_mode ) )
        {
            printf( "dir: %s\n", name );
            // open the child relative to this directory and walk it
            myftwImp( openat( dirfd, name, O_RDONLY | O_DIRECTORY ) );
        }

        printf( " file: %s\n", name );
    }

    // closing the stream also closes the descriptor
    closedir( stream );
    return( 0 );
}
Public call that uses directory name:
// Opens the named directory and walks it by descriptor.
// Always returns 0; errors are not checked in this shortened version.
int myftw( const char *dirname )
{
    myftwImp( open( dirname, O_RDONLY | O_DIRECTORY ) );
    return( 0 );
}
Example use:
// Walks the directory named by the first command-line argument and uses
// the walk's result as the exit status.
int main( int argc, char **argv )
{
    return( myftw( argv[ 1 ] ) );
}
No error checking is done here for brevity. Real code should check all calls for errors and handle them appropriately.
Full code with error checking:
#define _POSIX_C_SOURCE 200809L
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <stdio.h>
#include <unistd.h>
#include <string.h>
// Walks the directory referred to by the open descriptor `dirfd`.
//
// Prints a "dir:" line before descending into each subdirectory and a
// " file:" line for every entry (directories included, after their
// contents). Entries are looked up with fstatat()/openat() relative to the
// descriptor, so no path names are built.
//
// This function takes ownership of `dirfd`: it is closed on every path,
// including when fdopendir() fails.
//
// Returns 0 on success, nonzero on the first failure, which stops the walk.
static int myftwImp( int dirfd )
{
    DIR *dirp = fdopendir( dirfd );
    if ( NULL == dirp )
    {
        // fdopendir() did not take over the descriptor, so release it here
        // instead of leaking one descriptor per failed directory
        if ( dirfd >= 0 )
        {
            close( dirfd );
        }
        return( -1 );
    }

    int rc = 0;
    for ( ;; )
    {
        struct dirent *dent = readdir( dirp );
        if ( NULL == dent )
        {
            break;
        }

        // skip the self and parent links
        if ( ( 0 == strcmp( ".", dent->d_name ) ) ||
             ( 0 == strcmp( "..", dent->d_name ) ) )
        {
            continue;
        }

        struct stat sb = { 0 };
        rc = fstatat( dirfd, dent->d_name, &sb, 0 );
        if ( 0 != rc )
        {
            break;
        }

        if ( S_ISDIR( sb.st_mode ) )
        {
            int newdirfd = openat( dirfd, dent->d_name, O_RDONLY | O_DIRECTORY );
            if ( -1 == newdirfd )
            {
                rc = -1;
                break;
            }

            printf( "dir: %s\n", dent->d_name );

            // the recursive call owns newdirfd from here on
            rc = myftwImp( newdirfd );
            if ( 0 != rc )
            {
                break;
            }
        }

        printf( " file: %s\n", dent->d_name );
    }

    // this will close the descriptor, too
    closedir( dirp );
    return( rc );
}
// Opens the named directory and walks it by descriptor.
// Returns -1 if the directory cannot be opened, otherwise the result of the walk.
int myftw( const char *dirname )
{
    const int fd = open( dirname, O_RDONLY | O_DIRECTORY );

    return( ( -1 == fd ) ? -1 : myftwImp( fd ) );
}
// Walks the directory named by the first command-line argument.
// Returns the result of the walk, or 1 (after printing usage to stderr)
// when no directory argument was given.
int main( int argc, char **argv )
{
    // argv[ 1 ] is only a valid path when an argument was actually supplied
    if ( argc < 2 )
    {
        fprintf( stderr, "usage: %s <directory>\n",
                 ( argc > 0 ) ? argv[ 0 ] : "myftw" );
        return( 1 );
    }

    int rc = myftw( argv[ 1 ] );
    return( rc );
}
I'm implementing parts of the Linux ls command in C. I want to sort the contents of directories lexicographically, which I've been doing using scandir(). This is easy enough for listing single directories, but I'm having trouble doing it for listing subdirectories recursively. My current code: (results in a segmentation fault once a directory type is reached)
/* Print the entries of directory `arg` in alphasort() order and recurse
   into each subdirectory.

   Directories are printed as "name: " followed by a blank line and then
   their own contents; other entries are printed as "name ". The "." and
   ".." entries are skipped. A directory that cannot be scanned, or a child
   path that cannot be allocated, is silently skipped. */
void recursive(char* arg){
    struct dirent **file_list = NULL;
    int num = scandir(arg, &file_list, NULL, alphasort);

    /* scandir() returns -1 on error, in which case there is nothing to free. */
    if (num < 0)
        return;

    for (int i = 0; i < num; i++) {
        const char *name = file_list[i]->d_name;

        if (strcmp(".", name) != 0 && strcmp("..", name) != 0) {
            if (file_list[i]->d_type == DT_DIR) {
                // Directories are printed with a colon to distinguish them from files
                printf("%s: \n", name);
                printf("\n");

                /* The child path needs its own storage: room for arg, '/',
                   the name, and the terminator. */
                size_t len = strlen(arg) + strlen(name) + 2;
                char *next_dir = malloc(len);
                if (next_dir != NULL) {
                    snprintf(next_dir, len, "%s/%s", arg, name);
                    recursive(next_dir);
                    free(next_dir);
                }
            } else {
                printf("%s \n", name);
            }
        }

        /* Each entry returned by scandir() is separately allocated. */
        free(file_list[i]);
    }

    free(file_list);
}
/* List the current directory and everything below it. */
int main(void)
{
    char start[] = ".";

    recursive(start);
    return 0;
}
There are two recommended methods for traversing entire filesystem trees in Linux and other POSIXy systems:
nftw(): man 3 nftw
Given an initial path, a callback function, the maximum number of descriptors to use, and a set of flags, nftw() will call the callback function once for every filesystem object in the subtree. The order in which the callback is invoked for entries in the same directory is not specified, however.
This is the POSIX.1 (IEEE 1003) function.
fts_open()/fts_read()/fts_children()/fts_close(): man 3 fts
The fts interface provides a way to traverse filesystem hierarchies. The fts_children() provides a linked list of filesystem entries sorted by the comparison function specified in the fts_open() call. It is rather similar to how scandir() returns an array of filesystem entries, except that the two use very different structures to describe each filesystem entry.
Prior to glibc 2.23 (released in 2016), the Linux (glibc) fts implementation had bugs when using 64-bit file sizes (so on x86-64, or when compiling with -D_FILE_OFFSET_BITS=64).
These are BSD functions (FreeBSD/OpenBSD/macOS), but are available in Linux also.
Finally, there is also the atfile version of scandir(), scandirat(), that returns the filtered and sorted filesystem entries from a specific directory, but in addition to the pathname, it takes a file descriptor to the relative root directory to be used as a parameter. (If AT_FDCWD is used instead of a file descriptor, then scandirat() behaves like scandir().)
The simplest option here is to use nftw(), store all walked paths, and finally sort the paths. For example, walk.c:
// SPDX-License-Identifier: CC0-1.0
#define _POSIX_C_SOURCE 200809L
#define _GNU_SOURCE
#include <stdlib.h>
#include <locale.h>
#include <ftw.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
/* One walked filesystem entry. nftw_add() allocates it with room for the
   path string directly after the struct (flexible array member). */
struct entry {
/* Insert additional properties like 'off_t size' here. */
char *name; /* Always points to name part of pathname */
char pathname[]; /* Full path and name */
};
/* Growable array of entry pointers that nftw_add() appends to. */
struct listing {
size_t max; /* Number of entries allocated for */
size_t num; /* Number of entries in the array */
struct entry **ent; /* Array of pointers, one per entry */
};
/* Initializer for an empty listing: no capacity, no entries, no array. */
#define STRUCT_LISTING_INITIALIZER { 0, 0, NULL }
/* qsort() comparison for arrays of struct entry pointers.
   Orders entries by full pathname using strcoll(), so the result follows the
   collation rules of the current locale.
*/
static int entrysort(const void *ptr1, const void *ptr2)
{
    /* Each array element is itself a pointer to an entry. */
    const struct entry *const *lhs = ptr1;
    const struct entry *const *rhs = ptr2;

    return strcoll((*lhs)->pathname, (*rhs)->pathname);
}
/* Listing that nftw_add() appends to; set by scan_tree_sorted() around its nftw() call. */
static struct listing *nftw_listing = NULL;

/* nftw() callback: append a copy of `pathname` to nftw_listing.
   Entries whose name part is "." or ".." are ignored. The pointer array
   grows by 1000 slots whenever it is full.
   Returns 0 to continue the walk, or -ENOMEM if memory runs out. */
static int nftw_add(const char *pathname, const struct stat *info, int typeflag, struct FTW *ftwbuf)
{
    /* Unused parameters; the casts only silence compiler warnings. */
    (void)info;
    (void)typeflag;

    const char *const base = pathname + ftwbuf->base;
    if (strcmp(base, ".") == 0 || strcmp(base, "..") == 0)
        return 0;

    struct listing *const out = nftw_listing;

    /* Grow the pointer array when there is no free slot left. */
    if (out->num >= out->max) {
        const size_t grown = out->num + 1000;
        struct entry **resized = realloc(out->ent, grown * sizeof (struct entry *));
        if (resized == NULL)
            return -ENOMEM;
        out->max = grown;
        out->ent = resized;
    }

    /* Path length including the end-of-string terminator. */
    const size_t bytes = strlen(pathname) + 1;

    struct entry *fresh = malloc(sizeof (struct entry) + bytes);
    if (fresh == NULL)
        return -ENOMEM;

    /* Copy other filesystem entry properties here; say 'fresh->size = info->st_size;'. */
    memcpy(fresh->pathname, pathname, bytes);
    /* The name pointer always refers into the copied pathname. */
    fresh->name = fresh->pathname + ftwbuf->base;

    out->ent[out->num++] = fresh;
    return 0;
}
/* Scan directory tree starting at path, adding the entries to struct listing,
   then sort the whole listing with entrysort().
   Note: the listing must already have been properly initialized!

   Returns 0 if success, nonzero if error; -1 if errno is set to indicate error:
     EINVAL  list is NULL
     ENOENT  path is NULL or empty
     ENOMEM  the callback ran out of memory
     other   whatever nftw() itself reported
   A positive return value is passed through from nftw() with errno cleared.
*/
int scan_tree_sorted(struct listing *list, const char *path)
{
    if (!list) {
        errno = EINVAL;
        return -1;
    }
    if (!path || !*path) {
        errno = ENOENT;
        return -1;
    }

    nftw_listing = list;
    int result = nftw(path, nftw_add, 64, FTW_DEPTH);
    nftw_listing = NULL;

    /* -1 is nftw()'s own failure and errno already describes it; the
       callback only ever returns 0 or -ENOMEM, so do not overwrite errno
       with 1 here. */
    if (result == -1)
        return -1;

    /* Any other negative value is a negated errno from the callback. */
    if (result < 0) {
        errno = -result;
        return -1;
    }
    if (result > 0) {
        errno = 0;
        return result;
    }

    /* Two entries already need ordering; only 0 or 1 entries are trivially sorted. */
    if (list->num > 1)
        qsort(list->ent, list->num, sizeof list->ent[0], entrysort);

    return 0;
}
int main(int argc, char *argv[])
{
struct listing list = STRUCT_LISTING_INITIALIZER;
setlocale(LC_ALL, "");
if (argc < 2 || !strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
const char *arg0 = (argc > 0 && argv && argv[0] && argv[0][0]) ? argv[0] : "(this)";
fprintf(stderr, "\n");
fprintf(stderr, "Usage: %s [ -h | --help ]\n", arg0);
fprintf(stderr, " %s .\n", arg0);
fprintf(stderr, " %s TREE [ TREE ... ]\n", arg0);
fprintf(stderr, "\n");
fprintf(stderr, "This program lists all files and directories starting at TREE,\n");
fprintf(stderr, "in sorted order.\n");
fprintf(stderr, "\n");
return EXIT_SUCCESS;
}
for (int arg = 1; arg < argc; arg++) {
if (scan_tree_sorted(&list, argv[arg])) {
fprintf(stderr, "%s: Error scanning directory tree: %s.\n", argv[arg], strerror(errno));
return EXIT_FAILURE;
}
}
printf("Found %zu entries:\n", list.num);
for (size_t i = 0; i < list.num; i++)
printf("\t%s\t(%s)\n", list.ent[i]->pathname, list.ent[i]->name);
return EXIT_SUCCESS;
}
Compile using gcc -Wall -Wextra -O2 walk.c -o walk, and run using e.g. ./walk ...
The scan_tree_sorted() function calls nftw() for the directory specified, updating the global variable nftw_listing so that the nftw_add() callback function can add each new directory entry to it. If the listing contains more than one entry afterwards, it is sorted using qsort() and a locale-aware comparison function (based on strcoll()).
nftw_add() skips . and .., and adds every other pathname to the listing structure nftw_listing. It automatically grows the array as needed in linear fashion; the new_max = nftw_listing->num + 1000; means we allocate in units of a thousand (pointers).
The scan_tree_sorted() can be called multiple times with the same listing as the target, if one wants to list disjoint subtrees in one listing. Note, however, that it does not check for duplicates, although those could easily be filtered out after the qsort.
I am trying to implement a linked list data structure that represents a folder tree.
The structures below:
/* Error status carried inside each SRC node. */
typedef struct SRC_ERROR SRC_ERROR;
struct SRC_ERROR {
/* Status code; scan_source() sets it to OK or ERROR. */
int error_code;
/* Error message; scan_source() points it at a string literal when it reports ERROR. */
char *error;
};
// One regular file found while scanning a directory.
typedef struct SRC_FILE SRC_FILE;
struct SRC_FILE {
char *entry; // file name; scan_source() assigns readdir()'s d_name pointer here without copying it
char md5[MD5_DIGEST_LENGTH]; // meant for the file's MD5 digest; nothing in the code shown writes it
};
typedef struct SRC SRC; //Source file tree with md5 entry char for source verification.
// One directory node; nodes are chained through next_dir by add_dir().
struct SRC {
SRC_ERROR error; // status of the scan for this node
char *name; // directory name as passed to insert_dir_node() (pointer stored, not copied)
char *full_path; // path as passed to insert_dir_node() (pointer stored, not copied)
SRC_FILE **entries; // array of pointers to SRC_FILE; its length is not stored in the struct
SRC *next_dir; // next node in the list, or NULL
};
The idea was that each directory will be stored in SRC the SRC_FILE is to be used as an array to store the filename and MD5 hash for each file.
The scan_source() below populates the structures.
// Scan the directory source_path, printing each sub-directory and regular
// file, and return a SRC node describing it. Failures are reported through
// source->error rather than a NULL return.
SRC *scan_source(char *source_path) {
SRC *source = malloc(sizeof(SRC)); // result is not checked before the write below
source->error.error_code = OK;
int count = 0; // number of file entries stored so far
DIR *dir;
struct dirent *entry;
if (!(dir = opendir(source_path))) {
source->error.error_code = ERROR;
source->error.error = "Unable to open source directory.\n";
return source;
}
// count is 0 here, so this requests a zero-byte array: no element may be accessed yet.
source->entries = (SRC_FILE **)malloc(sizeof(SRC_FILE *) * count);
if (source->entries == NULL) {
// The error is recorded but execution continues into the loop.
source->error.error_code = ERROR;
source->error.error = "Unable to allocate memory to file entry tree\n";
}
while ((entry = readdir(dir)) != NULL) {
if (entry->d_type == DT_DIR) {
char path[PATH_MAX]; // local buffer; its address is handed to insert_dir_node() below
if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
continue;
snprintf(path, sizeof(path), "%s/%s", source_path, entry->d_name);
printf("[%s] - %s\n", entry->d_name, path);
//add new node
// After this call `source` is the NEW node; insert_dir_node() sets only name,
// full_path and next_dir, so source->entries is whatever emalloc() left there.
source = add_dir(source, insert_dir_node(entry->d_name, path));
scan_source(path); // the list returned by the recursive call is discarded
} else
if (entry->d_type == DT_REG) {
printf("[FILE] - %s\n", entry->d_name);
// Index `count` is one past the `count` slots allocated so far, and the slot
// holds a SRC_FILE* that was never pointed at an allocated SRC_FILE.
source->entries[count]->entry = entry->d_name; //SEGFAULT HERE
count++;
// realloc result overwrites the only pointer to the old array; NULL is not checked.
source->entries = realloc(source->entries, sizeof(SRC_FILE *) * (count));
}
}
closedir(dir);
return source;
}
I am having issues with memory management. I am getting intermittent seg faults when the directory is structured in certain ways.
I have marked the line that the debugger has flagged
source->entries[count]->entry = entry->d_name; //SEGFAULT HERE
I thought that I allocated memory for each structure but maybe I have not done this correctly or there is an underlying problem with the data structure entirely?
For Example:
test> tree
.
└── Text
0 directories, 1 file
This causes a seg fault. Whereas, this does not:
/test> tree
.
├── another sample
│ └── Text
└── sample folder
2 directories, 1 file
Additional functions that are used:
// Push new_dir onto the front of the list headed by file_tree.
// Returns new_dir, which is the new head of the list.
SRC *add_dir(SRC *file_tree, SRC *new_dir) {
new_dir->next_dir = file_tree;
return new_dir;
}
// Allocate a new SRC node for a directory and return it.
// name and full_path are stored as given (the strings are not copied), so the
// caller must keep them alive for as long as the node is used.
// The error and entries members are not assigned here.
SRC *insert_dir_node(char *name, char *full_path) {
SRC *next_dir;
next_dir = (SRC *)emalloc(sizeof(SRC)); // emalloc() is defined elsewhere; NOTE(review): presumably aborts on failure — confirm
next_dir->name = name;
next_dir->full_path = full_path;
next_dir->next_dir = NULL;
return next_dir;
}
I started looking at the code, and the first issue I see is that you're storing pointers returned by a readdir() call - you should copy the data contained therein instead.
Change
source = add_dir(source, insert_dir_node(entry->d_name, path));
to
source = add_dir(source, insert_dir_node(strdup(entry->d_name), path));
The reason you're seeing segmentation faults is that you always write after the end of the source->entries array.
You initially create a 0-size array:
int count = 0;
/* ... */
source->entries = (SRC_FILE **) malloc(sizeof(SRC_FILE*) * count);
Then set its 1st (indexed by 0) element:
source->entries[count]->entry = entry->d_name; //SEGFAULT HERE
count++;
source->entries = realloc(source->entries, sizeof(SRC_FILE*)*(count));
Then you expand the array to 1 element, then write to the second index, and so on.
You can either fix the logic (allocate space for count+1 elements always, because you want to have room not only for the existing ones but also for the next one), or, which in this case may be more efficient, switch to a linked list structure here as well.
The next problem is that you're only allocating pointers to SRC_FILE, not SRC_FILE structures - you should change the definition to:
struct SRC {
SRC_ERROR error;
char *name;
char *full_path;
SRC_FILE *entries;
SRC *next_dir;
};
And the initialization to
source->entries = (SRC_FILE *) malloc(sizeof(SRC_FILE) * (count + 1));
Then the critical part to
source->entries[count].entry = strdup(entry->d_name);
count++;
source->entries = realloc(source->entries, sizeof(SRC_FILE) * (count + 1));
There's one more thing to attend to: insert_dir_node creates a new SRC struct, which will need to have a freshly initialized entries member:
next_dir->count = 0;
next_dir->entries = (SRC_FILE *)malloc(sizeof(SRC_FILE) * (1));
and, since we have now separate entries we need to have a count for each of them, so move this variable into the struct as well.
Fixing all of these provided me with an error-free program.
The subject is Memory management in linked lists. Indeed this is a major issue in C program because there is no automatic memory management. You must decide and specify how each object pointed to by a pointer in your structures is handled from a memory management standpoint. Is the pointer the reference for the object life time or is the lifetime handled somewhere else and the pointer just an access point.
Let's analyse your object definitions:
typedef struct SRC_ERROR SRC_ERROR;
struct SRC_ERROR {
int error_code;
char *error;
};
SRC_ERROR is just a way to package an error description. If the error member always stores a pointer to a string literal, it should be defined as const char *. Conversely, if in some cases you allocate a string with information specific to the actual error, such as "error allocating 1023 objects\n", then you either need an indicator specifying the error points to allocated memory that should be freed after use or you should always allocate memory for the error message and always free this memory when discarding the SRC_ERROR object.
typedef struct SRC_FILE SRC_FILE;
struct SRC_FILE {
char *entry;
char md5[MD5_DIGEST_LENGTH];
};
entry should point to allocated memory and this memory should be freed when discarding the SRC_FILE object.
typedef struct SRC SRC; //Source file tree with md5 entry char for source verification.
struct SRC {
SRC_ERROR error;
char *name;
char *full_path;
SRC_FILE **entries;
SRC *next_dir;
};
name and full_path should point to allocated memory and should be freed when discarding the SRC object.
next_dir points to another SRC object, which should be allocated and freed consistently.
entries points to an allocated array, each element of which points to an allocated object. You need a way to tell the number of elements in this array. You could maintain a NULL pointer at the end of the array, but it is simpler to add a count member in SRC for this information. It would also be much simpler to make this a pointer to an allocated array of SRC_FILE objects.
The function does not construct a tree, but attempts to construct a list of directories. Whenever you recurse into a directory, you should append the new list of SRC objects returned by scan_source to the list already constructed in the SRC object allocated by the caller; the appended nodes then belong to that list and are freed when the list itself is freed.
Here is a modified version in a test program:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <dirent.h>
#ifndef PATH_MAX
#define PATH_MAX 1024
#endif
#define MD5_DIGEST_LENGTH 16
#define TRACE(x) //x
enum { OK = 0, ERROR, OUT_OF_MEMORY };
// Error report filled in by scan_source() through its `error` parameter.
typedef struct ERROR_STATE ERROR_STATE;
struct ERROR_STATE {
int code; // OK, ERROR or OUT_OF_MEMORY (see the enum above)
const char *message; // always a string literal
};
// One regular file recorded in a directory's entries array.
typedef struct SRC_FILE SRC_FILE;
struct SRC_FILE {
char *name; // points to allocated memory
char md5[MD5_DIGEST_LENGTH]; // zero-filled by scan_source(); the MD5 computation is not implemented in this program
};
typedef struct SRC SRC; //Source file tree with md5 entry char for source verification.
// One scanned directory. scan_source() links the nodes of all sub-directories
// behind the node of the directory that was scanned, forming a flat list.
struct SRC {
char *name; // points to allocated memory
char *full_path; // points to allocated memory
size_t count; // number of elements in entries
SRC_FILE *entries; // allocated array of count elements
SRC *next_dir; // the next SRC
};
/* Return a newly allocated copy of the last component of full_path.
 *
 * Trailing slashes are ignored, so "a/b/" yields "b" rather than an empty
 * string. A path consisting only of slashes yields "/", and an empty path
 * yields "". Returns NULL if memory cannot be allocated; the caller owns the
 * result and must free() it.
 */
static char *basename_dup(const char *full_path) {
    size_t end = strlen(full_path);

    // Drop trailing slashes, but keep a lone "/" intact.
    while (end > 1 && full_path[end - 1] == '/') {
        end--;
    }

    // Walk back to the character just after the previous slash.
    size_t start = end;
    while (start > 0 && full_path[start - 1] != '/') {
        start--;
    }

    // Only "" and "/" leave nothing between start and end: copy them as-is.
    if (start == end) {
        start = 0;
    }

    size_t len = end - start;
    char *copy = malloc(len + 1);
    if (copy != NULL) {
        memcpy(copy, full_path + start, len);
        copy[len] = '\0';
    }
    return copy;
}
/* Construct a list of SRC nodes describing the directory tree at source_path.
 *
 * The returned node describes source_path itself (its regular files are in
 * entries[0..count-1]); the nodes of all sub-directories, found recursively,
 * are chained behind it through next_dir.
 *
 * error->code is set to OK on entry and to ERROR / OUT_OF_MEMORY when
 * something fails. A failure inside a sub-directory is copied into *error and
 * is NOT cleared by later successful sub-directory scans; when several
 * failures occur, the most recent one is reported.
 *
 * Returns NULL only if the node for source_path itself cannot be allocated.
 * Otherwise returns a (possibly partially filled) list; the caller frees each
 * node with free_source().
 */
SRC *scan_source(const char *source_path, ERROR_STATE *error) {
    char *full_path = strdup(source_path);
    char *name = basename_dup(source_path);
    SRC *source = calloc(1, sizeof(*source)); // all members initialized to 0
    if (full_path == NULL || name == NULL || source == NULL) {
        // Any of the three allocations may have failed; free(NULL) is a no-op.
        error->code = ERROR;
        error->message = "Unable to allocate memory.\n";
        free(full_path);
        free(name);
        free(source);
        return NULL;
    }
    error->code = OK;
    source->full_path = full_path;
    source->name = name;

    DIR *dir = opendir(source_path);
    if (dir == NULL) {
        error->code = ERROR;
        error->message = "Unable to open source directory.\n";
        return source;
    }

    // Always points at the NULL link that terminates the list built so far,
    // so appending never has to re-walk the list from its head.
    SRC **tailp = &source->next_dir;
    struct dirent *entry;

    while ((entry = readdir(dir)) != NULL) {
        char path[PATH_MAX];

        if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, "..")) {
            continue;
        }
        int len = snprintf(path, sizeof(path), "%s/%s", source_path, entry->d_name);
        if (len < 0 || (size_t)len >= sizeof(path)) {
            // formatting failed or the path was truncated.
            // you can report this or ignore it...
            TRACE(printf("[%s] - %s - path too long, ignored\n", entry->d_name, path));
            continue;
        }
        if (entry->d_type == DT_DIR) {
            TRACE(printf("[%s] - %s\n", entry->d_name, path));
            // Use a private error state: the recursive call resets its state to
            // OK on entry, which must not wipe out an error we already recorded.
            ERROR_STATE sub_error = { OK, NULL };
            SRC *source1 = scan_source(path, &sub_error);
            if (sub_error.code != OK) {
                // Report it to our caller but keep scanning the siblings.
                *error = sub_error;
            }
            if (source1) {
                // append the new directory (and its list of sub-directories)
                *tailp = source1;
                while (*tailp) {
                    tailp = &(*tailp)->next_dir;
                }
            }
        } else if (entry->d_type == DT_REG) {
            TRACE(printf("[FILE] - %s\n", entry->d_name));
            // grow the entries array by one slot; keep the old array on failure
            SRC_FILE *entries = realloc(source->entries,
                                        sizeof(source->entries[0]) * (source->count + 1));
            if (entries == NULL) {
                error->code = OUT_OF_MEMORY;
                error->message = "cannot reallocate entries array";
                break;
            }
            source->entries = entries;

            // readdir() may reuse its buffer, so the name must be copied
            char *file_name = strdup(entry->d_name);
            if (file_name == NULL) {
                error->code = OUT_OF_MEMORY;
                error->message = "cannot allocate entry name";
                break;
            }
            SRC_FILE *slot = &source->entries[source->count];
            slot->name = file_name;
            memset(slot->md5, 0, sizeof(slot->md5));
            // TODO: compute the MD5 sum of `path` into slot->md5 here,
            // before the count is incremented.
            source->count++;
        }
    }
    closedir(dir);
    return source;
}
/* Release one SRC node: every entry name, the entries array itself, the two
 * path strings and the node.
 *
 * Only this node is freed; next_dir is not followed, so a caller walking a
 * list must read source->next_dir before calling this. A NULL source is
 * ignored.
 */
void free_source(SRC *source) {
    if (source == NULL) {
        return;
    }
    for (size_t i = 0; i < source->count; i++) {
        free(source->entries[i].name);
    }
    free(source->entries); // the array itself was previously leaked
    free(source->full_path);
    free(source->name);
    free(source);
}
/* Test driver: scan every directory named on the command line and print the
 * resulting list of SRC nodes in a JSON-like format, freeing each node as soon
 * as it has been printed.
 *
 * Returns 1 when no directory is given, 0 otherwise (scan errors are printed
 * but do not change the exit status).
 */
int main(int argc, char *argv[]) {
    ERROR_STATE error = { 0, NULL };

    if (argc < 2) {
        printf("usage: scansource directory [...]\n");
        return 1;
    }
    for (int i = 1; i < argc; i++) {
        SRC *source = scan_source(argv[i], &error);
        if (error.code) {
            printf("Error %d: %s\n", error.code, error.message);
        }
        while (source) {
            SRC *cur = source;
            // Fetch the successor first: free_source() releases cur below.
            source = source->next_dir;
            printf("{\n"
                   " name: '%s',\n"
                   " full_path: '%s',\n"
                   " count: %zu,\n"
                   " entries: [\n",
                   cur->name, cur->full_path, cur->count);
            for (size_t j = 0; j < cur->count; j++) {
                printf(" { md5: '");
                for (size_t k = 0; k < MD5_DIGEST_LENGTH; k++) {
                    // md5 is stored as plain char: go through unsigned char so
                    // bytes >= 0x80 print as two hex digits, not "ffffffXX".
                    printf("%02x", (unsigned int)(unsigned char)cur->entries[j].md5[k]);
                }
                printf("', name: '%s' },\n", cur->entries[j].name);
            }
            printf(" ]\n},\n");
            free_source(cur);
        }
    }
    return 0;
}
Problem 1: what's the best data structure to save the directory structure?
Problem 2: I have tried to use a general tree to solve it, but there are a lot of problems:
The number of files under a directory is not known in advance, so the number of child nodes under a tree node is not known either. I tried adding a field nchild to each node, giving its number of child nodes, together with nchild pointers (stored in **child) to those child nodes. Both **child and each *child therefore have to be allocated dynamically, since the number of children is not fixed. As a result, it is really difficult to release this memory (the program below never calls free()). Is there a better way to solve it?
Also, the program below sometimes prints garbage characters when I output the directory tree, which really confuses me. While debugging it, I found that the call ent=readdir(pDir); itself returns garbage characters. But when I write another simple program to read the same directory, it works fine. I think the problem is in the recursive function, but I have no idea what it is. I would appreciate it if someone could give me an idea. Thanks!
```
#include <dirent.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <string.h>
#include <stdio.h>
// One node of the directory tree: either a directory or a non-directory entry.
typedef struct tree_file_s
{
char path[512]; // full path, filled with strcpy() (no length check)
time_t date; // st_mtime from stat()
char type; // 'D' for a directory, 'F' for anything else
long size; // st_size from stat()
int nchild; // number of children (0 for 'F' nodes)
struct tree_file_s **child; // array of nchild child pointers (0 for 'F' nodes)
} tree_file_t;
// Count the entries of directory dir, excluding "." and "..".
// Returns that count.
// Note: the opendir() result is not checked, and the directory stream is
// never closed with closedir().
int dir_child_len(const char *dir)
{
int nchild = 0;
DIR *pDir;
struct dirent *ent;
pDir = opendir(dir);
while((ent=readdir(pDir)) != NULL)
{
if (strcmp(ent->d_name, ".")==0 || strcmp(ent->d_name, "..")==0)
{
continue;
}
nchild++;
}
return nchild;
}
// Fill tft with the description of directory dir and recursively build one
// child node per directory entry ("." and ".." excluded).
// tft must point to an existing node; the child nodes are allocated here and
// are never freed by this program.
// Note: the results of stat(), opendir() and calloc() are not checked, and the
// directory stream is never closed with closedir().
void tree_create(tree_file_t *tft, const char *dir)
{
int nchild; // the tft has n child
DIR *pDir;
struct dirent *ent; // the directory dir dirent info
struct stat file_stat; // the new file's stat info
stat(dir, &file_stat);
nchild = dir_child_len(dir);
pDir = opendir(dir);
// Initialize the parent
//tft->path = calloc(1, strlen(dir)+1);
strcpy(tft->path, dir);
tft->date = file_stat.st_mtime;
tft->type = 'D';
tft->size = file_stat.st_size;
tft->nchild = nchild;
// calloc(1, nchild) allocates nchild BYTES, not room for nchild pointers.
tft->child = calloc(1, nchild);
// From here on nchild is reused as the index of the next free child slot.
nchild = 0;
while ((ent=readdir(pDir)) != NULL)
{
// Bitwise test rather than ==. NOTE(review): any d_type value sharing bits
// with DT_DIR would also take this branch — confirm.
if (ent->d_type & DT_DIR)
{
if (strcmp(ent->d_name, ".")==0 || strcmp(ent->d_name, "..")==0)
{
continue;
}
tree_file_t *new_dir = calloc(1, sizeof(tree_file_t));
tft->child[nchild] = new_dir;
// The buffer holds strlen(dir)+strlen(d_name)+1 bytes, but sprintf() below
// writes dir, '/', d_name and the terminating NUL: one byte more.
char *new_path = calloc(1, strlen(dir)+strlen(ent->d_name)+1);
sprintf(new_path, "%s/%s", dir, ent->d_name);
tree_create(new_dir, new_path);
free(new_path);
} else {
tree_file_t *new_file = calloc(1, sizeof(tree_file_t));
// Same one-byte-short buffer as in the directory branch.
char *new_path = calloc(1, strlen(dir)+strlen(ent->d_name)+1);
// new_file->path = calloc(1, strlen(dir)+strlen(ent->d_name)+1);
sprintf(new_path, "%s/%s", dir, ent->d_name);
stat(new_path, &file_stat);
strcpy(new_file->path, new_path);
free(new_path);
new_file->date = file_stat.st_mtime;
new_file->type = 'F';
new_file->size = file_stat.st_size;
new_file->nchild = 0;
new_file->child = 0;
tft->child[nchild] = new_file;
}
//free(new_path);
//new_path = 0;
nchild++;
}
}
// Print the node tft as "<type>: <path>", then each of its children:
// 'F' children are printed directly, all others by recursing into them.
void display_tree(tree_file_t *tft)
{
int nchild, i;
nchild = tft->nchild;
printf("%c: %s\n", tft->type, tft->path);
for(i = 0; i < nchild; i++)
{
if(tft->child[i]->type == 'F')
{
printf("%c: %s\n", tft->child[i]->type, tft->child[i]->path);
} else {
display_tree(tft->child[i]);
}
}
}
// Build the tree for the directory given as the only argument and print it.
// Prints a usage line and exits with status 0 when the argument count is wrong.
int main(int argc, const char *argv[])
{
if(argc != 2)
{
printf("Usage: a.out dir\n");
exit(0);
}
char dir[512];
strcpy(dir, argv[1]); // no length check against the 512-byte buffer
tree_file_t *tft = calloc(1, sizeof(tree_file_t)); // result not checked; never freed
tree_create(tft, dir);
display_tree(tft);
return 0;
}
```
When you allocate space for new_path you need to add 2 (one for the slash, one for the null terminator). And you never close the directories you open (use closedir()).
An even more serious error is this line:
tft->child = calloc(1, nchild);
which only allocates nchild bytes, not enough to hold nchild pointers! Try:
tft->child = calloc(nchild, sizeof(*tft->child));