So, I asked here just a while ago, but half of that question was just me being dumb. And I still have issues. I hope that this will be clearer than the question before.
I'm writing POSIX cat, I nearly got it working, but I have couple of issues:
My cat can not read from a pipe and I really do not know why (redirecting (<) works fine)
I cannot figure out how to make it read stdin continuously without running into problems. I had a version that worked "fine" but would cause a stack overflow. The other version wouldn't stop reading from stdin when stdin was the only input, i.e. my-cat < file would keep reading from stdin until it was terminated, which it shouldn't; however, it does have to read from stdin and wait for termination if no files are supplied.
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/stat.h>
#include <fcntl.h>
/*
 * Copy everything readable from in_fd to standard output.
 *
 * Uses a fixed-size buffer, so it works for pipes and terminals (whose
 * st_size is meaningless) as well as for regular files. Short writes are
 * retried until the whole chunk has been written.
 *
 * Returns 0 once end-of-file is reached, -1 if read() or write() fails
 * (errno is left as set by the failing call).
 */
static int copy_to_stdout(int in_fd)
{
    char buf[65536];
    ssize_t nread;

    while ((nread = read(in_fd, buf, sizeof buf)) > 0) {
        ssize_t done = 0;

        while (done < nread) {
            ssize_t ret = write(STDOUT_FILENO, buf + done,
                                (size_t)(nread - done));
            if (ret <= 0)
                return -1;
            done += ret;
        }
    }
    return nread < 0 ? -1 : 0;
}

/*
 * pcat: concatenate the named files (or stdin when no operand is given or
 * the operand is "-") to standard output.
 *
 * Exit status is EXIT_SUCCESS when every operand was copied, EXIT_FAILURE
 * if any operand could not be opened, was a directory, or failed mid-copy.
 */
int main(int argc, char *argv[])
{
    int opt; /* int, not char: getopt() returns int and -1 must compare equal */
    int status = EXIT_SUCCESS;

    while ((opt = getopt(argc, argv, "u")) != -1) {
        switch (opt) {
        case 'u':
            /* Make the output un-buffered */
            setbuf(stdout, NULL);
            break;
        default:
            break;
        }
    }
    argc -= optind;
    argv += optind;

    int i = 0;
    do {
        /* With no operands argv[i] is NULL, so substitute "-" (stdin). */
        const char *name = (argc == 0) ? "-" : argv[i];
        int fildes;

        if (strcmp(name, "-") == 0) {
            fildes = STDIN_FILENO;
        } else {
            fildes = open(name, O_RDONLY);
            if (fildes < 0) {
                perror(name);
                status = EXIT_FAILURE;
                i++;
                continue;
            }
        }

        /* Check for directories */
        struct stat fb;
        if (fstat(fildes, &fb) == 0 && S_ISDIR(fb.st_mode)) {
            fprintf(stderr, "pcat: %s: Is a directory\n", name);
            status = EXIT_FAILURE;
        } else if (copy_to_stdout(fildes) != 0) {
            perror(name);
            status = EXIT_FAILURE;
        }

        /* Close file if it's not stdin */
        if (fildes != STDIN_FILENO)
            close(fildes);
        i++;
    } while (i < argc);

    return status;
}
Pipes don't have a size, and nor do terminals. The contents of the st_size field is undefined for such files. (On my system it seems to always contain 0, but I don't think there is any cross-platform guarantee of that.)
So your plan of reading the entire file at one go and writing it all out again is not workable for non-regular files, and is risky even for them (the read is not guaranteed to return the full number of bytes requested). It's also an unnecessary memory hog if the file is large.
A better strategy is to read into a fixed-size buffer, and write out only the number of bytes you successfully read. You repeat this until end-of-file is reached, which is indicated by read() returning 0. This is how you solve your second problem.
On a similar note, write() is not guaranteed to write out the full number of bytes you asked it to, so you need to check its return value, and if it was short, try again to write out the remaining bytes.
Here's an example:
#define BUFSIZE 65536 // arbitrary choice, can be tuned for performance
ssize_t nread;
char buf[BUFSIZE]; // or char *buf = malloc(BUFSIZE);
while ((nread = read(filedes, buf, BUFSIZE)) > 0) {
    ssize_t written = 0;
    while (written < nread) {
        ssize_t ret = write(STDOUT_FILENO, buf + written, nread - written);
        if (ret <= 0) {
            // handle error: braces are required here, otherwise the
            // "written += ret" below becomes the body of this if
            perror("write");
            exit(EXIT_FAILURE);
        }
        written += ret;
    }
}
if (nread < 0) {
    // handle error
    perror("read");
    exit(EXIT_FAILURE);
}
As a final comment, your program lacks error checking in general; e.g. if the file cannot be opened, it will proceed anyway with filedes == -1. It is important to check the return value of every system call you issue, and handle errors accordingly. This would be essential for a program to be used in real life, and even for toy programs created just as an exercise, it will be very helpful in debugging them. (Error checking would probably have given you some clues in figuring out what was wrong with this program, for instance.)
Your cat (you can call it my-cat, but I preferred to call it felix — permit me the pun) should use stdio throughout, to get the benefit of the buffering done by the stdio package. Below is a simplified version of cat that uses the stdio package exclusively (almost exactly as it appears in K&R), and you'll see that it is perfectly efficient as shown. You will also see that the structure is almost exactly the same as yours, but I have simplified the data copy (as in the K&R book) and the processing of arguments (yours is a bit messy):
felix.c
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <getopt.h>
/*
 * ERR(code, fmt, ...): print "<progname>: <formatted message>" to stderr.
 * If code is non-zero the process then exits with that status; with a
 * zero code the macro only prints and execution continues.
 *
 * fmt must be a string literal (it is pasted after "%s: " by literal
 * concatenation). `##__VA_ARGS__` is a GNU extension that drops the
 * preceding comma when no variadic arguments are given.
 */
#define ERR(_code, _fmt, ...) do { \
fprintf(stderr,"%s: " _fmt, progname, \
##__VA_ARGS__); \
if (_code) exit(_code); \
} while (0)
char *progname = "cat";
void process(FILE *f);
/*
 * felix: stdio-based cat.
 *
 * Copies each operand to stdout in order; with no operands, copies stdin.
 * The operand "-" means standard input. A file that cannot be opened is
 * reported and skipped; the remaining operands are still processed and the
 * exit status is EXIT_FAILURE. A failure to flush stdout is also reported.
 */
int main(int argc, char **argv)
{
    int opt;
    int status = EXIT_SUCCESS;

    while ((opt = getopt(argc, argv, "u")) != -1) {
        switch (opt) {
        case 'u': setbuf(stdout, NULL); break;
        default: break;
        }
    }

    /* for the case it has been renamed, calculate the basename
     * of argv[0] (progname is used in the macro ERR above) */
    progname = strrchr(argv[0], '/');
    progname = progname
        ? progname + 1
        : argv[0];

    /* shift options */
    argc -= optind;
    argv += optind;

    if (argc) {
        int i;
        for (i = 0; i < argc; i++) {
            int is_stdin = strcmp(argv[i], "-") == 0;
            FILE *f = is_stdin ? stdin : fopen(argv[i], "r");
            if (!f) {
                /* code 0: report only, keep going with the next operand */
                ERR(0,
                    "%s: %s (errno = %d)\n",
                    argv[i], strerror(errno), errno);
                status = EXIT_FAILURE;
                continue;
            }
            process(f);
            if (is_stdin)
                clearerr(stdin); /* allow "-" to be given more than once */
            else
                fclose(f);
        }
    } else {
        process(stdin);
    }

    /* putchar() errors are sticky on the stream; report them once here. */
    if (fflush(stdout) == EOF || ferror(stdout)) {
        ERR(EXIT_FAILURE, "write error\n");
    }
    exit(status);
}
/* you don't need to complicate here, fgetc and putchar use buffering as you stated in main
* (no output buffering if you do the setbuf(NULL) and input buffering all the time). The buffer
* size is best to leave stdio to calculate it, as it queries the filesystem to get the best
* input/output size and create buffers this size. and the processing is simple with a loop like
* the one below. You'll get no appreciable difference between this and any other input/output.
* you can believe me, I've tested it. */
/*
 * Copy stream f to stdout one character at a time until fgetc() reports
 * EOF (end of input or read error). Buffering is left to stdio.
 */
void process(FILE *f)
{
    for (;;) {
        int ch = fgetc(f);

        if (ch == EOF)
            return;
        putchar(ch);
    }
}
As you see, nothing has been specially done to support redirection, as redirection is not done inside a program, but done by the program that calls it (in this case by the shell) When you start a program, you receive three already open file descriptors. These are the ones that the shell is using, or the ones that the shell just puts in the places of 0, 1, and 2 before starting your program. So your program has nothing to do to cope with redirection. Everything is done (in this case) in the shell... and this is why your program redirection works, even if you have not done anything for it to work. You have only to do redirection if you are going to call a program with its input, output or standard error redirected somewhere (and this somewhere is not the standard input, output or error you have received from your parent process)... but this is not the case of my-cat.
I have some code which runs bc through popen(). I can intercept the calculator's output and prepend the text "Output=" to it. But how can I intercept what the user is typing into bc?
#include <stdio.h>
#include <stdlib.h>
/*
 * Run bc through popen() in read mode and echo each line of its output
 * prefixed with "Output = ". Exits with status 1 if popen() fails.
 */
int main(void) {
    char line[512];
    FILE *bc = popen("bc", "r");

    if (bc == NULL) {
        exit(1);
    }
    for (;;) {
        if (fgets(line, sizeof(line), bc) == NULL)
            break;
        printf("Output = %s", line);
    }
    pclose(bc);
    return 0;
}
You can combine bc and echo with a pipe: echo '12*4' | bc
Example typing 12*4:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
/*
 * Read expressions from stdin, one per line, and evaluate each with
 * `echo '<expr>' | bc`, printing every line of bc's answer prefixed with
 * "output:".
 *
 * The command is built with a bounded snprintf() so a long input line
 * cannot overflow cmd, and lines containing a single quote are rejected
 * because they would break out of the shell quoting. Each popen() stream
 * is closed before the next line is read.
 */
int main(void) {
    char buff[512];
    char cmd[sizeof buff + 16]; /* room for "echo '" + line + "' | bc" */

    while (fgets(buff, sizeof(buff), stdin) != NULL) {
        buff[strcspn(buff, "\n")] = '\0';

        if (strchr(buff, '\'') != NULL) {
            fprintf(stderr, "input rejected: contains a single quote\n");
            continue;
        }

        int len = snprintf(cmd, sizeof cmd, "echo '%s' | bc", buff);
        if (len < 0 || (size_t)len >= sizeof cmd) {
            fprintf(stderr, "input rejected: line too long\n");
            continue;
        }

        FILE *in = popen(cmd, "r");
        if (in == NULL) {
            exit(1);
        }
        /* Drain all of bc's output; one answer may span several lines. */
        while (fgets(buff, sizeof(buff), in) != NULL) {
            printf("output:%s", buff);
        }
        pclose(in);
    }
    return 0;
}
Output:
david#debian:~$ ./demo
12*4
output:48
You need to use pipe() and fork/exec(). However, manual piping is quite complex:
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(void) {
int write_pipe[2], read_pipe[2];
pipe(read_pipe); pipe(write_pipe);
#define PARENT_READ read_pipe[0]
#define CHILD_WRITE read_pipe[1]
#define CHILD_READ write_pipe[0]
#define PARENT_WRITE write_pipe[1]
int child = fork();
if (child == 0) { /* in child */
close(PARENT_WRITE);
close(PARENT_READ);
dup2(CHILD_READ, 0); close(CHILD_READ);
dup2(CHILD_WRITE, 1); close(CHILD_WRITE);
execl("/usr/bin/bc", "/usr/bin/bc");
} else { /* in parent */
close(CHILD_READ);
close(CHILD_WRITE);
write(PARENT_WRITE, "2+3\n", 4);
char buff[512];
int output_len=read(PARENT_READ, buff, sizeof(buff));
write(1, buff, output_len);
close(PARENT_READ);
}
return 0;
}
What you're looking to do is to start a subprocess, then simultaneously:
When activity occurs on standard input, execute some function on that input before passing it to the subprocess.
When activity occurs on the subprocess output, execute some function on that output before passing it to standard output.
The system call that allows you to wait for activity on two handles is called poll, but before we do that, we need to create the handles and start the subprocess:
int a[2], b[2];
if(pipe(a)==-1)abort(); // for communicating with subprocess input
if(pipe(b)==-1)abort(); // for communicating with subprocess output
switch(fork()) {
case -1: abort();
case 0: dup2(a[0],0), dup2(b[1],1), execlp("/usr/bin/bc", "bc", 0); exit(1);
};
Note how pipe works: Data written to fildes[1] appears on (i.e., can be read from) fildes[0]. This means we want to read from the standard output of our subprocess, b[0] and write to the standard input of our subprocess a[1].
Before we do that, we can use the poll instruction to wait for activity on either standard input (fd #0), or the subprocess output (b[0]):
for(;;) {
struct pollfd p[2]={0};
p[0].fd = 0; p[1].fd = b[0];
p[0].events = p[1].events = POLLIN;
while (poll(p,2,-1) <= 0);
At this point, there is activity on at least one of these file descriptors. You can see which one by examining the .revents member.
if(p[0].revents & POLLIN) {
r = read(0, buffer, sizeof(buffer));
write(a[1], buffer, r); // check for errors, or perhaps modify buffer
}
if(p[1].revents & POLLIN) {
r = read(b[0], buffer, sizeof(buffer));
write(1, buffer, r); // check for errors, or perhaps modify buffer
}
Note especially we use the opposite member a[1] and b[0] from the member we dup2'd onto the subprocesses standard input (0) and standard output (1).
At this point you can loop back up to poll again:
}
Disconnects (like EOF, program crash, etc) will be presented as read() returning 0, so watch carefully for this case, and break; out of the loop if so desired.
I have a generic problem I am looking to solve, where chunks of binary data sent from a standard input or regular file stream to an application, which in turn converts that binary data into text. Using threads, I want to process the text before piping it over to the next application, which modifies that text even further, and so on.
As a simple test case, I want to extract compressed data via gunzip. Specifically, I am looking at using gunzip -c - to extract chunks of binary data sent to it via its (reassigned) stdin file descriptor, and then pulling out chunks of text from its (reassigned) stdout file descriptor. I can then print these chunks of text to the real stdout or stderr (or do other stuff, later on).
(I realize that I can do gzip-based compression and extraction on the command line. My goal here is to use this test case to learn how to correctly pass around generic chunks of binary and text data between threads that either run that data through binaries, or process it further.)
In the case of my test program, I have set up three pthread_t threads:
produce_gzip_chunk_thread
consume_gzip_chunk_thread
consume_gunzip_chunk_thread
I pass each of these threads a shared data instance called thread_data, which contains a thread lock, two conditions, and some buffers and counter variables. I also include a set of file descriptors for a gunzip process opened with popen3():
typedef struct pthread_data pthread_data_t;
typedef struct popen3_desc popen3_desc_t;
/* State shared by the producer and the two consumer threads. */
struct pthread_data {
pthread_mutex_t in_lock; /* the single mutex all three thread functions lock */
pthread_cond_t in_cond; /* waited on by produce_gzip_chunk and consume_gzip_chunk */
pthread_cond_t out_cond; /* waited on by consume_gunzip_chunk */
unsigned char in_buf[BUF_LENGTH_VALUE]; /* compressed chunk handed from producer to consume_gzip_chunk */
size_t n_in_bytes; /* valid bytes in in_buf; 0 means no chunk pending */
size_t n_in_bytes_written_to_gunzip; /* result of the last positive write() to gunzip; cleared by consume_gunzip_chunk */
size_t n_out_bytes_read_from_gunzip; /* result of the last positive read() from gunzip */
FILE *in_file_ptr; /* input stream read by the producer */
boolean in_eof; /* set by the producer when it stops reading */
char in_line[LINE_LENGTH_VALUE]; /* receives gunzip's output */
popen3_desc_t *gunzip_ptr; /* allocated in consume_gzip_chunk on the first chunk */
};
/* The three descriptors filled in by popen3(). */
struct popen3_desc {
int in; /* consume_gzip_chunk write()s compressed data here */
int out; /* consume_gunzip_chunk read()s decompressed data here */
int err;
};
The produce_gzip_chunk_thread reads in a 1024-byte chunk of gzip-compressed bytes from a regular file called foo.gz.
These bytes are written to an unsigned char buffer called in_buf, which is part of the shared data struct I am passing to each thread:
/*
 * Producer thread: reads the input stream in chunks of up to
 * BUF_LENGTH_VALUE bytes and publishes each chunk through the shared
 * pthread_data_t (in_buf / n_in_bytes), signalling in_cond afterwards.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 *
 * in_lock is held for the whole loop, including the fread() calls; it is
 * only released inside pthread_cond_wait() and after the loop ends.
 */
void * produce_gzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> produce_gzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
unsigned char in_buf[BUF_LENGTH_VALUE];
size_t n_in_bytes = 0;
/* NOTE(review): this write happens before in_lock is taken */
d->in_eof = kFalse;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
n_in_bytes = fread(in_buf, sizeof(in_buf[0]), sizeof(in_buf), d->in_file_ptr);
if (n_in_bytes > 0) {
/* Wait until the previous chunk has been consumed.
 * NOTE(review): nothing in the visible code resets
 * n_out_bytes_read_from_gunzip to 0 after consume_gunzip_chunk() stores a
 * positive value, so this wait looks like it can never finish for a
 * second chunk - confirm. */
while (d->n_in_bytes != 0 || d->n_out_bytes_read_from_gunzip != 0)
pthread_cond_wait(&d->in_cond, &d->in_lock);
memcpy(d->in_buf, in_buf, n_in_bytes);
d->n_in_bytes = n_in_bytes;
#ifdef DEBUG
fprintf(stderr, "Debug: ######## [%07zu] produced chunk\n", d->n_in_bytes);
#endif
pthread_cond_signal(&d->in_cond);
}
/* a read error ends production exactly like end-of-file */
else if (feof(d->in_file_ptr) || ferror(d->in_file_ptr))
break;
}
d->in_eof = kTrue;
pthread_mutex_unlock(&d->in_lock);
pthread_cond_signal(&d->in_cond);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> produce_gzip_chunk()\n");
#endif
return NULL;
}
Once there is a positive number of bytes stored in n_bytes — that is, we have pulled data from our input gzip archive that needs to be processed with gunzip — this triggers a condition that permits the second thread consume_gzip_chunk_thread to operate:
/*
 * Consumer thread for compressed chunks: waits on in_cond until a chunk is
 * pending (n_in_bytes != 0) or in_eof is set, write()s the chunk to the
 * gunzip process's input descriptor, then signals out_cond.
 *
 * The gunzip process is started with popen3() on the first chunk, with
 * both non-blocking flags set to kTrue.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 */
void * consume_gzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> consume_gzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
long n_in_bytes_written_to_gunzip;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
while (d->n_in_bytes == 0 && !d->in_eof)
pthread_cond_wait(&d->in_cond, &d->in_lock);
if (d->n_in_bytes) {
#ifdef DEBUG
fprintf(stderr, "Debug: ........ [%07zu] processing chunk\n", d->n_in_bytes);
#endif
/* relies on gunzip_ptr being NULL until the first chunk arrives */
if (!d->gunzip_ptr) {
#ifdef DEBUG
fprintf(stderr, "Debug: * setting up gunzip ptr\n");
#endif
d->gunzip_ptr = malloc(sizeof(popen3_desc_t));
if (!d->gunzip_ptr) {
fprintf(stderr, "Error: Could not create gunzip file handle struct\n");
exit(EXIT_FAILURE);
}
/* NOTE(review): the return value of popen3() is not checked */
popen3("gunzip -c -",
&(d->gunzip_ptr->in),
&(d->gunzip_ptr->out),
&(d->gunzip_ptr->err),
kTrue,
kTrue);
memset(d->in_line, 0, LINE_LENGTH_VALUE);
}
/* single write(): a short or failed write is not retried, and
 * n_in_bytes is cleared below regardless of the result */
n_in_bytes_written_to_gunzip = (long) write(d->gunzip_ptr->in, d->in_buf, d->n_in_bytes);
#ifdef DEBUG
fprintf(stderr, "Debug: ................ wrote [%07ld] bytes into the gunzip process\n", n_in_bytes_written_to_gunzip);
#endif
if (n_in_bytes_written_to_gunzip > 0)
d->n_in_bytes_written_to_gunzip = n_in_bytes_written_to_gunzip;
d->n_in_bytes = 0;
pthread_cond_signal(&d->out_cond);
}
/* NOTE(review): gunzip_ptr->in is not closed when the loop ends here */
if (d->in_eof)
break;
}
pthread_mutex_unlock(&d->in_lock);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> consume_gzip_chunk()\n");
#endif
return NULL;
}
When consuming the gzip data chunk, we use the write function to send n_bytes of in_buf to the gunzip process's input file descriptor. At the end, we send another thread signal, but this time to out_cond, so as to help reawaken consume_gunzip_chunk_thread, which reads from gunzip's output to do more work:
/*
 * Consumer thread for gunzip output: waits on out_cond until
 * n_in_bytes_written_to_gunzip is non-zero, sleeps one second, then does a
 * single read() of up to LINE_LENGTH_VALUE bytes from gunzip's output
 * descriptor into in_line and signals in_cond.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 */
void * consume_gunzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> consume_gunzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
long n_out_bytes_read_from_gunzip;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
/* this wait does not look at in_eof */
while (d->n_in_bytes_written_to_gunzip == 0) {
pthread_cond_wait(&d->out_cond, &d->in_lock);
}
if (d->n_in_bytes_written_to_gunzip) {
/* sleeps with in_lock held, so the other two threads are stalled too */
sleep(1);
/* the descriptor was requested non-blocking from popen3(); presumably
 * read() returns -1 when gunzip has produced nothing yet - confirm */
n_out_bytes_read_from_gunzip = read(d->gunzip_ptr->out, d->in_line, LINE_LENGTH_VALUE);
#ifdef DEBUG
fprintf(stderr, "Debug: ------------------------ read [%07ld] bytes out from the gunzip process\n", n_out_bytes_read_from_gunzip);
fprintf(stderr, "Debug: ------------------------ gunzip output chunk:\n[%s]\n", d->in_line);
#endif
/* NOTE(review): %s above and strlen() here need a NUL inside in_line;
 * a read() that fills all LINE_LENGTH_VALUE bytes leaves none */
memset(d->in_line, 0, strlen(d->in_line));
if (n_out_bytes_read_from_gunzip > 0)
d->n_out_bytes_read_from_gunzip = n_out_bytes_read_from_gunzip;
d->n_in_bytes_written_to_gunzip = 0;
pthread_cond_signal(&d->in_cond);
}
if (d->in_eof && (d->n_in_bytes_written_to_gunzip == 0))
break;
}
pthread_mutex_unlock(&d->in_lock);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> consume_gunzip_chunk()\n");
#endif
return NULL;
}
This attempts to read any available bytes from the gunzip process's output file descriptor. For debugging purposes, I just want to print them to stderr for now.
The problem I am facing is that I need to add a sleep(1) statement in consume_gunzip_chunk, before doing the read, in order to get things working properly.
Without this sleep(1) statement, my test program will usually output nothing — except once every 8-10 attempts, when the compressed data are extracted correctly.
Question - What am I doing wrong about my arrangement of conditions, such that the sleep(1) call is required to make the gzip-extraction work properly? In a production scenario, working with much larger input files, forcibly waiting a second every 1kB seems like a bad idea.
For reproducibility with the full source code, here are the two relevant files. Here is the header:
/*
* convert.h
*/
#ifndef CONVERT_H
#define CONVERT_H
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <getopt.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <errno.h>
#define CB_VERSION "1.0"
#define LINE_LENGTH_VALUE 65536
#define BUF_LENGTH_VALUE 1024
#define POPEN3_READ 0
#define POPEN3_WRITE 1
typedef int boolean;
extern const boolean kTrue;
extern const boolean kFalse;
const boolean kTrue = 1;
const boolean kFalse = 0;
typedef enum {
kGzip,
kUnknown
} format_t;
typedef struct pthread_data pthread_data_t;
typedef struct popen3_desc popen3_desc_t;
/* State shared by the producer and the two consumer threads. */
struct pthread_data {
pthread_mutex_t in_lock; /* the single mutex all three thread functions lock */
pthread_cond_t in_cond; /* waited on by produce_gzip_chunk and consume_gzip_chunk */
pthread_cond_t out_cond; /* waited on by consume_gunzip_chunk */
unsigned char in_buf[BUF_LENGTH_VALUE]; /* compressed chunk handed from producer to consume_gzip_chunk */
size_t n_in_bytes; /* valid bytes in in_buf; 0 means no chunk pending */
size_t n_in_bytes_written_to_gunzip; /* result of the last positive write() to gunzip; cleared by consume_gunzip_chunk */
size_t n_out_bytes_read_from_gunzip; /* result of the last positive read() from gunzip */
boolean in_eof; /* set by the producer when it stops reading */
FILE *in_file_ptr; /* input stream read by the producer */
popen3_desc_t *gunzip_ptr; /* allocated in consume_gzip_chunk on the first chunk */
char in_line[LINE_LENGTH_VALUE]; /* receives gunzip's output */
};
/* The three descriptors filled in by popen3(). */
struct popen3_desc {
int in; /* consume_gzip_chunk write()s compressed data here */
int out; /* consume_gunzip_chunk read()s decompressed data here */
int err;
};
static const char *name = "convert";
static const char *version = CB_VERSION;
static const char *authors = "Alex Reynolds";
static const char *usage = "\n" \
"Usage: convert --input-format=str <input-file>\n" \
" Process Flags:\n\n" \
" --input-format=str | -f str Input format (str = [ gzip ]; required)\n" \
" --help | -h Show this usage message\n";
static struct convert_globals_t {
char *input_format_str;
format_t input_format;
char **filenames;
int num_filenames;
} convert_globals;
static struct option convert_client_long_options[] = {
{ "input-format", required_argument, NULL, 'f' },
{ "help", no_argument, NULL, 'h' },
{ NULL, no_argument, NULL, 0 }
};
static const char *convert_client_opt_string = "f:h?";
void * consume_gunzip_chunk (void *t_data);
void * consume_gzip_chunk (void *t_data);
void * produce_gzip_chunk (void *t_data);
FILE * new_file_ptr (const char *in_fn);
void delete_file_ptr (FILE **file_ptr);
pid_t popen3 (const char *command,
int *in_desc,
int *out_desc,
int *err_desc,
boolean nonblock_in,
boolean nonblock_outerr);
off_t fsize (const char *fn);
void initialize_globals ();
void parse_command_line_options (int argc,
char **argv);
void print_usage (FILE *stream);
#endif
Here is the implementation:
/*
* convert.c
*/
#include "convert.h"
int main(int argc, char **argv)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> main()\n");
#endif
pthread_t produce_gzip_chunk_thread = NULL;
pthread_t consume_gzip_chunk_thread = NULL;
pthread_t consume_gunzip_chunk_thread = NULL;
pthread_data_t *thread_data = NULL;
parse_command_line_options(argc, argv);
/* initialize thread data */
thread_data = malloc(sizeof(pthread_data_t));
thread_data->n_in_bytes = 0;
thread_data->n_in_bytes_written_to_gunzip = 0;
thread_data->n_out_bytes_read_from_gunzip = 0;
thread_data->in_eof = kFalse;
thread_data->in_file_ptr = new_file_ptr(convert_globals.filenames[0]);
pthread_mutex_init(&(thread_data->in_lock), NULL);
pthread_cond_init(&(thread_data->in_cond), NULL);
pthread_cond_init(&(thread_data->out_cond), NULL);
/* parse input */
if (convert_globals.input_format == kGzip)
{
if (pthread_create(&produce_gzip_chunk_thread, NULL, produce_gzip_chunk, (void *) thread_data) != 0) {
fprintf(stderr, "Error: Could not create gzip chunk production thread\n");
return EXIT_FAILURE;
}
if (pthread_create(&consume_gzip_chunk_thread, NULL, consume_gzip_chunk, (void *) thread_data) != 0) {
fprintf(stderr, "Error: Could not create gzip chunk consumption thread\n");
return EXIT_FAILURE;
}
if (pthread_create(&consume_gunzip_chunk_thread, NULL, consume_gunzip_chunk, (void *) thread_data) != 0) {
fprintf(stderr, "Error: Could not create gunzip chunk consumption thread\n");
return EXIT_FAILURE;
}
if (pthread_join(produce_gzip_chunk_thread, NULL) != 0) {
fprintf(stderr, "Error: Could not join gzip chunk production thread\n");
return EXIT_FAILURE;
}
if (pthread_join(consume_gzip_chunk_thread, NULL) != 0) {
fprintf(stderr, "Error: Could not join gzip chunk consumption thread\n");
return EXIT_FAILURE;
}
if (pthread_join(consume_gunzip_chunk_thread, NULL) != 0) {
fprintf(stderr, "Error: Could not join gunzip chunk consumption thread\n");
return EXIT_FAILURE;
}
}
else
{
/*
handle text formats
*/
}
/* cleanup */
delete_file_ptr(&thread_data->in_file_ptr);
pthread_mutex_destroy(&(thread_data->in_lock));
pthread_cond_destroy(&(thread_data->in_cond));
pthread_cond_destroy(&(thread_data->out_cond));
free(thread_data);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> main()\n");
#endif
return EXIT_SUCCESS;
}
/*
 * Consumer thread for gunzip output: waits on out_cond until
 * n_in_bytes_written_to_gunzip is non-zero, sleeps one second, then does a
 * single read() of up to LINE_LENGTH_VALUE bytes from gunzip's output
 * descriptor into in_line and signals in_cond.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 */
void * consume_gunzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> consume_gunzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
long n_out_bytes_read_from_gunzip;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
/* this wait does not look at in_eof */
while (d->n_in_bytes_written_to_gunzip == 0) {
pthread_cond_wait(&d->out_cond, &d->in_lock);
}
if (d->n_in_bytes_written_to_gunzip) {
/* sleeps with in_lock held, so the other two threads are stalled too */
sleep(1);
/* the descriptor was requested non-blocking from popen3(); presumably
 * read() returns -1 when gunzip has produced nothing yet - confirm */
n_out_bytes_read_from_gunzip = read(d->gunzip_ptr->out, d->in_line, LINE_LENGTH_VALUE);
#ifdef DEBUG
fprintf(stderr, "Debug: ------------------------ read [%07ld] bytes out from the gunzip process\n", n_out_bytes_read_from_gunzip);
fprintf(stderr, "Debug: ------------------------ gunzip output chunk:\n[%s]\n", d->in_line);
#endif
/* NOTE(review): %s above and strlen() here need a NUL inside in_line;
 * a read() that fills all LINE_LENGTH_VALUE bytes leaves none */
memset(d->in_line, 0, strlen(d->in_line));
if (n_out_bytes_read_from_gunzip > 0)
d->n_out_bytes_read_from_gunzip = n_out_bytes_read_from_gunzip;
d->n_in_bytes_written_to_gunzip = 0;
pthread_cond_signal(&d->in_cond);
}
if (d->in_eof && (d->n_in_bytes_written_to_gunzip == 0))
break;
}
pthread_mutex_unlock(&d->in_lock);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> consume_gunzip_chunk()\n");
#endif
return NULL;
}
/*
 * Consumer thread for compressed chunks: waits on in_cond until a chunk is
 * pending (n_in_bytes != 0) or in_eof is set, write()s the chunk to the
 * gunzip process's input descriptor, then signals out_cond.
 *
 * The gunzip process is started with popen3() on the first chunk, with
 * both non-blocking flags set to kTrue.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 */
void * consume_gzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> consume_gzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
long n_in_bytes_written_to_gunzip;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
while (d->n_in_bytes == 0 && !d->in_eof)
pthread_cond_wait(&d->in_cond, &d->in_lock);
if (d->n_in_bytes) {
#ifdef DEBUG
fprintf(stderr, "Debug: ........ [%07zu] processing chunk\n", d->n_in_bytes);
#endif
/* relies on gunzip_ptr being NULL until the first chunk arrives */
if (!d->gunzip_ptr) {
#ifdef DEBUG
fprintf(stderr, "Debug: * setting up gunzip ptr\n");
#endif
d->gunzip_ptr = malloc(sizeof(popen3_desc_t));
if (!d->gunzip_ptr) {
fprintf(stderr, "Error: Could not create gunzip file handle struct\n");
exit(EXIT_FAILURE);
}
/* NOTE(review): the return value of popen3() is not checked */
popen3("gunzip -c -",
&(d->gunzip_ptr->in),
&(d->gunzip_ptr->out),
&(d->gunzip_ptr->err),
kTrue,
kTrue);
memset(d->in_line, 0, LINE_LENGTH_VALUE);
}
/* single write(): a short or failed write is not retried, and
 * n_in_bytes is cleared below regardless of the result */
n_in_bytes_written_to_gunzip = (long) write(d->gunzip_ptr->in, d->in_buf, d->n_in_bytes);
#ifdef DEBUG
fprintf(stderr, "Debug: ................ wrote [%07ld] bytes into the gunzip process\n", n_in_bytes_written_to_gunzip);
#endif
if (n_in_bytes_written_to_gunzip > 0)
d->n_in_bytes_written_to_gunzip = n_in_bytes_written_to_gunzip;
d->n_in_bytes = 0;
/* pthread_cond_signal(&d->in_cond); */
pthread_cond_signal(&d->out_cond);
}
/* NOTE(review): gunzip_ptr->in is not closed when the loop ends here */
if (d->in_eof)
break;
}
pthread_mutex_unlock(&d->in_lock);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> consume_gzip_chunk()\n");
#endif
return NULL;
}
/*
 * Producer thread: reads the input stream in chunks of up to
 * BUF_LENGTH_VALUE bytes and publishes each chunk through the shared
 * pthread_data_t (in_buf / n_in_bytes), signalling in_cond afterwards.
 *
 * t_data is the shared pthread_data_t. Always returns NULL.
 *
 * in_lock is held for the whole loop, including the fread() calls; it is
 * only released inside pthread_cond_wait() and after the loop ends.
 */
void * produce_gzip_chunk(void *t_data)
{
#ifdef DEBUG
fprintf(stderr, "Debug: Entering --> produce_gzip_chunk()\n");
#endif
pthread_data_t *d = (pthread_data_t *)t_data;
unsigned char in_buf[BUF_LENGTH_VALUE];
size_t n_in_bytes = 0;
/* NOTE(review): this write happens before in_lock is taken */
d->in_eof = kFalse;
pthread_mutex_lock(&d->in_lock);
while(kTrue) {
n_in_bytes = fread(in_buf, sizeof(in_buf[0]), sizeof(in_buf), d->in_file_ptr);
if (n_in_bytes > 0) {
/* Wait until the previous chunk has been consumed.
 * NOTE(review): nothing in the visible code resets
 * n_out_bytes_read_from_gunzip to 0 after consume_gunzip_chunk() stores a
 * positive value, so this wait looks like it can never finish for a
 * second chunk - confirm. */
while (d->n_in_bytes != 0 || d->n_out_bytes_read_from_gunzip != 0)
pthread_cond_wait(&d->in_cond, &d->in_lock);
memcpy(d->in_buf, in_buf, n_in_bytes);
d->n_in_bytes = n_in_bytes;
#ifdef DEBUG
fprintf(stderr, "Debug: ######## [%07zu] produced chunk\n", d->n_in_bytes);
#endif
pthread_cond_signal(&d->in_cond);
}
/* a read error ends production exactly like end-of-file */
else if (feof(d->in_file_ptr) || ferror(d->in_file_ptr))
break;
}
d->in_eof = kTrue;
pthread_mutex_unlock(&d->in_lock);
pthread_cond_signal(&d->in_cond);
#ifdef DEBUG
fprintf(stderr, "Debug: Leaving --> produce_gzip_chunk()\n");
#endif
return NULL;
}
/*
 * Open the input stream named by in_fn.
 *
 * "-" selects stdin; any other name is opened for reading in binary mode,
 * since the input is compressed data (on POSIX systems "rb" and "r"
 * behave the same).
 *
 * Returns the open stream. Never returns NULL: if in_fn is NULL or the
 * file cannot be opened, an error is printed to stderr and the process
 * exits with EXIT_FAILURE.
 */
FILE * new_file_ptr(const char *in_fn)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> new_file_ptr()\n");
#endif

    FILE *file_ptr = NULL;

    if (in_fn == NULL) {
        fprintf(stderr, "Error: Could not open input stream\n");
        exit(EXIT_FAILURE);
    }

    if (strcmp(in_fn, "-") == 0)
        file_ptr = stdin;
    else
        file_ptr = fopen(in_fn, "rb");

    if (!file_ptr) {
        fprintf(stderr, "Error: Could not open input stream [%s]: %s\n",
                in_fn, strerror(errno));
        exit(EXIT_FAILURE);
    }

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> new_file_ptr()\n");
#endif
    return file_ptr;
}
/*
 * Close the stream *file_ptr and reset the caller's pointer to NULL.
 *
 * Safe to call with file_ptr == NULL or *file_ptr == NULL (both are
 * no-ops), so it can be called twice on the same pointer.
 */
void delete_file_ptr(FILE **file_ptr)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> delete_file_ptr()\n");
#endif

    /* fclose(NULL) is undefined behaviour, so guard both levels */
    if (file_ptr != NULL && *file_ptr != NULL) {
        fclose(*file_ptr);
        *file_ptr = NULL;
    }

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> delete_file_ptr()\n");
#endif
}
/*
 * Run `command` through /bin/sh with its stdin, stdout and stderr each
 * connected to a separate pipe.
 *
 * in_desc / out_desc / err_desc receive the parent's end of the matching
 * pipe (write end for stdin, read ends for stdout and stderr). Passing
 * NULL for any of them closes that end instead.
 * nonblock_in sets O_NONBLOCK on the parent's stdin write end;
 * nonblock_outerr sets it on the parent's stdout and stderr read ends.
 *
 * The parent closes the child-side pipe ends and marks its own ends
 * close-on-exec, so that EOF propagates and later children do not inherit
 * them.
 *
 * Returns the child's pid, or -1 if a pipe or the fork could not be
 * created (no descriptors are leaked in that case).
 */
pid_t popen3(const char *command, int *in_desc, int *out_desc, int *err_desc, boolean nonblock_in, boolean nonblock_outerr)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> popen3()\n");
#endif

    int p_stdin[2], p_stdout[2], p_stderr[2];
    pid_t pid;

    if (pipe(p_stdin) != 0)
        return -1;
    if (pipe(p_stdout) != 0) {
        close(p_stdin[POPEN3_READ]);
        close(p_stdin[POPEN3_WRITE]);
        return -1;
    }
    if (pipe(p_stderr) != 0) {
        close(p_stdin[POPEN3_READ]);
        close(p_stdin[POPEN3_WRITE]);
        close(p_stdout[POPEN3_READ]);
        close(p_stdout[POPEN3_WRITE]);
        return -1;
    }

    if (nonblock_in) {
        fcntl(p_stdin[POPEN3_WRITE], F_SETFL, fcntl(p_stdin[POPEN3_WRITE], F_GETFL) | O_NONBLOCK);
    }

    if (nonblock_outerr) {
        fcntl(p_stdout[POPEN3_READ], F_SETFL, fcntl(p_stdout[POPEN3_READ], F_GETFL) | O_NONBLOCK);
        fcntl(p_stderr[POPEN3_READ], F_SETFL, fcntl(p_stderr[POPEN3_READ], F_GETFL) | O_NONBLOCK);
    }

    pid = fork();
    if (pid < 0) {
        close(p_stdin[POPEN3_READ]);
        close(p_stdin[POPEN3_WRITE]);
        close(p_stdout[POPEN3_READ]);
        close(p_stdout[POPEN3_WRITE]);
        close(p_stderr[POPEN3_READ]);
        close(p_stderr[POPEN3_WRITE]);
        return pid; /* error */
    }

    if (pid == 0) {
        dup2(p_stdin[POPEN3_READ], STDIN_FILENO);
        dup2(p_stdout[POPEN3_WRITE], STDOUT_FILENO);
        /* stderr gets its own pipe, not a second copy of the stdout pipe */
        dup2(p_stderr[POPEN3_WRITE], STDERR_FILENO);
        /* the originals are no longer needed once duplicated onto 0/1/2 */
        close(p_stdin[POPEN3_READ]);
        close(p_stdin[POPEN3_WRITE]);
        close(p_stdout[POPEN3_READ]);
        close(p_stdout[POPEN3_WRITE]);
        close(p_stderr[POPEN3_READ]);
        close(p_stderr[POPEN3_WRITE]);
        execl("/bin/sh", "sh", "-c", command, (char *)NULL);
        fprintf(stderr, "Error: Could not execl [%s]\n", command);
        _exit(EXIT_FAILURE); /* skip the atexit/stdio cleanup inherited from the parent */
    }

    /* parent: the child-side ends must be closed or EOF never arrives */
    close(p_stdin[POPEN3_READ]);
    close(p_stdout[POPEN3_WRITE]);
    close(p_stderr[POPEN3_WRITE]);

    if (in_desc == NULL)
        close(p_stdin[POPEN3_WRITE]);
    else {
        fcntl(p_stdin[POPEN3_WRITE], F_SETFD, fcntl(p_stdin[POPEN3_WRITE], F_GETFD) | FD_CLOEXEC);
        *in_desc = p_stdin[POPEN3_WRITE];
    }

    if (out_desc == NULL)
        close(p_stdout[POPEN3_READ]);
    else {
        fcntl(p_stdout[POPEN3_READ], F_SETFD, fcntl(p_stdout[POPEN3_READ], F_GETFD) | FD_CLOEXEC);
        *out_desc = p_stdout[POPEN3_READ];
    }

    if (err_desc == NULL)
        close(p_stderr[POPEN3_READ]);
    else {
        fcntl(p_stderr[POPEN3_READ], F_SETFD, fcntl(p_stderr[POPEN3_READ], F_GETFD) | FD_CLOEXEC);
        *err_desc = p_stderr[POPEN3_READ];
    }

#ifdef DEBUG
    /* the out-parameters may be NULL; print -1 for those */
    fprintf(stderr, "Debug: New *in_desc = %d\n", in_desc ? *in_desc : -1);
    fprintf(stderr, "Debug: New *out_desc = %d\n", out_desc ? *out_desc : -1);
    fprintf(stderr, "Debug: New *err_desc = %d\n", err_desc ? *err_desc : -1);
#endif

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> popen3()\n");
#endif
    return pid;
}
/*
 * Return the size in bytes of the file named fn, as reported by stat().
 *
 * Returns (off_t)-1 if fn is NULL or stat() fails. The original returned
 * EXIT_FAILURE (1) here, which a caller could not tell apart from a
 * one-byte file.
 */
off_t fsize(const char *fn)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> fsize()\n");
#endif

    struct stat st;
    off_t size = (off_t)-1;

    if (fn != NULL && stat(fn, &st) == 0)
        size = st.st_size;

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> fsize()\n");
#endif
    return size;
}
/*
 * Reset every field of convert_globals to its "nothing parsed yet" value:
 * no format string, unknown format, no file names.
 */
void initialize_globals()
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> initialize_globals()\n");
#endif

    /* reset explicitly so that calling this again clears an earlier value */
    convert_globals.input_format_str = NULL;
    convert_globals.input_format = kUnknown;
    convert_globals.filenames = NULL;
    convert_globals.num_filenames = 0;

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> initialize_globals()\n");
#endif
}
/*
 * Parse argv into convert_globals.
 *
 * Recognised options: --input-format=str / -f str (required) and
 * --help / -h / -?. After the options exactly one file name (or "-")
 * must remain.
 *
 * On success convert_globals holds the format string, the mapped
 * format_t, and the file-name slice of argv. On any usage error the usage
 * text is printed to stderr and the process exits with EXIT_FAILURE;
 * the help options print usage to stdout and exit with EXIT_SUCCESS.
 */
void parse_command_line_options(int argc, char **argv)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> parse_command_line_options()\n");
#endif

    int client_long_index;
    int client_opt;
    char *in_format_str = NULL;

    /* must be cleared before the first getopt_long() call, otherwise
     * getopt still prints its own message for an error in the first option */
    opterr = 0; /* disable error reporting by GNU getopt */

    initialize_globals();

    while ((client_opt = getopt_long(argc,
                                     argv,
                                     convert_client_opt_string,
                                     convert_client_long_options,
                                     &client_long_index)) != -1)
    {
        switch (client_opt)
        {
            case 'f':
                in_format_str = optarg;
                break;
            case 'h':
                print_usage(stdout);
                exit(EXIT_SUCCESS);
            case '?':
                print_usage(stdout);
                exit(EXIT_SUCCESS);
            default:
                break;
        }
    }

    convert_globals.input_format_str = in_format_str;
    convert_globals.filenames = argv + optind;
    convert_globals.num_filenames = argc - optind;

    if (!in_format_str) {
        fprintf(stderr, "Error: Specified input format was omitted; please specify one of required input formats\n");
        print_usage(stderr);
        exit(EXIT_FAILURE);
    }
    else if (convert_globals.num_filenames != 1) {
        fprintf(stderr, "Error: Please specify an input file (either a regular file or '-' for stdin\n");
        print_usage(stderr);
        exit(EXIT_FAILURE);
    }

    /* map format string to setting */
    if (strcmp(in_format_str, "gzip") == 0)
        convert_globals.input_format = kGzip;
    else {
        fprintf(stderr, "Error: Specified input format is unknown; please specify one of required input formats\n");
        print_usage(stderr);
        exit(EXIT_FAILURE);
    }

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> parse_command_line_options()\n");
#endif
}
/*
 * Write the program banner (name, version, author) followed by the usage
 * text to `stream`.
 */
void print_usage(FILE *stream)
{
#ifdef DEBUG
    fprintf(stderr, "Debug: Entering --> print_usage()\n");
#endif

    /* one call per banner line; the bytes written match a single
     * combined format string */
    fprintf(stream, "%s\n", name);
    fprintf(stream, " version: %s\n", version);
    fprintf(stream, " author: %s\n", authors);
    fprintf(stream, "%s\n", usage);

#ifdef DEBUG
    fprintf(stderr, "Debug: Leaving --> print_usage()\n");
#endif
}
Here is the build process:
$ mkdir -p objects
$ cc -Wall -Wextra -pedantic -std=c99 -D__STDC_CONSTANT_MACROS -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE=1 -DDEBUG=1 -g -O0 -fno-inline -c convert.c -o objects/convert.o -iquote${PWD}
$ cc -Wall -Wextra -pedantic -std=c99 -D__STDC_CONSTANT_MACROS -D_FILE_OFFSET_BITS=64 -D_LARGEFILE64_SOURCE=1 -DDEBUG=1 -g -O0 -fno-inline objects/convert.o -o convert -lpthread
I have been able to build this test code on OS X and Linux hosts with reasonably modern compile environments.
Thanks in advance for any useful advice!
I will start by saying that I feel pthreads conditions and mutexes were not really necessary here, nor was non-blocking I/O the best reaction to the problems you describe.
In my opinion, the problems you describe with your condition- and mutex-less version are symptoms of forgetting to assiduously close() the ends of your pipes, with the result that a copy of the write-end file descriptor of the pipe feeding the child process's stdin leaked (into that child or others) and stayed open.
Then, given that a writing-end corresponding to stdin's reading-end still existed, the system did not give EOF but instead blocked indefinitely.
In your case, you did prevent the pipe-end file descriptors from leaking to the spawned child (with the correct close() calls on the child-side of the fork() within your popen3(), although you forgot to close() the wrong-end pipe ends on the parent-side). However, you did not prevent this leakage to all other children! If you call popen3() twice, the leakage of the set of three descriptors into the child is prevented, but as the parent still owns them, when the next call to popen3() happens, after the fork() there are now 6 file descriptors to close (the old set of three and the new set of three you just created).
In your case, therefore, you should set the close-on-exec flag on those pipe ends, thusly:
fcntl(fdIn [PIPEWR], F_SETFD, fcntl(fdIn [PIPEWR], F_GETFD) | FD_CLOEXEC);
fcntl(fdOut[PIPERD], F_SETFD, fcntl(fdOut[PIPERD], F_GETFD) | FD_CLOEXEC);
fcntl(fdErr[PIPERD], F_SETFD, fcntl(fdErr[PIPERD], F_GETFD) | FD_CLOEXEC);
Here is code that spawns 6 threads and 3 processes, and passes its input unmodified to the output, after internally compressing then decompressing it. It effectively implements gzip -c - | XOR 0x55 | XOR 0x55 | gunzip -c - | cat, where:
Standard input is fed to gzip by thread srcThrd.
gzip's output is read by thread a2xor0Thrd and fed to thread xor0Thrd.
Thread xor0Thrd XORs its input with 0x55 before passing it on to thread xor1Thrd.
Thread xor1Thrd XORs its input with 0x55 before passing it on to thread xor22BThrd.
Thread xor22BThrd feeds its input to process gunzip.
Process gunzip feeds its output directly (without going through a thread) to cat
Process cat's output is read by thread dstThrd and printed to standard output.
Compression is done by inter-process pipe communication, while XORing is done by intra-process pipe communication. No mutexes or condition variables are used. main() is extremely easy to understand. This code should be easy to extend to your situation.
/* Includes */
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
/* Defines */
#define PIPERD 0
#define PIPEWR 1
/* Data structures */
/**
 * Every pipe used by the pipeline, shared by all threads through one pointer.
 *
 * Each member is a descriptor pair as filled in by pipe(): index PIPERD is
 * the read end, index PIPEWR is the write end.
 */
typedef struct PIPESET{
int Ain[2];     /* passed to popen4() as gzip's stdin; written by srcThrdMain */
int Aout[2];    /* passed to popen4() as gzip's stdout; read by a2xor0ThrdMain */
int Aerr[2];    /* passed to popen4() as gzip's stderr */
int xor0[2];    /* a2xor0ThrdMain -> xor0ThrdMain */
int xor1[2];    /* xor0ThrdMain -> xor1ThrdMain */
int xor2[2];    /* xor1ThrdMain -> xor22BThrdMain */
int Bin[2];     /* passed to popen4() as gunzip's stdin; written by xor22BThrdMain */
int BoutCin[2]; /* gunzip's stdout and, at the same time, cat's stdin */
int Berr[2];    /* passed to popen4() as gunzip's stderr */
int Cout[2];    /* passed to popen4() as cat's stdout; read by dstThrdMain */
int Cerr[2];    /* passed to popen4() as cat's stderr */
} PIPESET;
/* Function Implementations */
/**
 * Entry point of the source thread.
 *
 * Copies standard input (descriptor 0) one byte at a time into the write end
 * of the Ain pipe. When the read stops returning data, the write end is
 * closed so that the reader of Ain sees end-of-file.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* srcThrdMain(void* arg){
    PIPESET* pipes = (PIPESET*)arg;
    char     byte;

    for(;;){
        if(read(0, &byte, 1) <= 0){
            break;
        }
        write(pipes->Ain[PIPEWR], &byte, 1);
    }

    close(pipes->Ain[PIPEWR]);
    pthread_exit(NULL);
}
/**
 * A to XOR0 thread main method.
 *
 * Copies everything readable from Aout (the pipe handed to process A as its
 * standard output) into the xor0 pipe, in chunks of up to 64 KiB.
 *
 * Unlike a bare write(), each chunk is pushed until it has been written
 * completely: a short write is continued from where it stopped and a write
 * interrupted by a signal (EINTR) is retried. On any other write failure the
 * copy stops and the read end of Aout is closed so that the producer is not
 * left blocked on a pipe nobody drains.
 *
 * The write end of xor0 is always closed before the thread ends, so the
 * downstream reader sees end-of-file.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* a2xor0ThrdMain(void* arg){
    PIPESET* pipeset = (PIPESET*)arg;
    char     buf[65536];
    ssize_t  bytesRead;
    int      failed = 0;

    while(!failed &&
          (bytesRead = read(pipeset->Aout[PIPERD], buf, sizeof buf)) > 0){
        ssize_t done = 0;

        while(done < bytesRead){
            ssize_t n = write(pipeset->xor0[PIPEWR], buf + done,
                              (size_t)(bytesRead - done));
            if(n > 0){
                done += n;             /* possibly a short write: keep going */
            }else if(n < 0 && errno == EINTR){
                continue;              /* interrupted before writing: retry */
            }else{
                failed = 1;            /* hard error: give up on this stage */
                break;
            }
        }
    }

    if(failed){
        close(pipeset->Aout[PIPERD]);
    }
    close(pipeset->xor0[PIPEWR]);
    pthread_exit(NULL);
}
/**
 * Entry point of the first XOR stage.
 *
 * Reads the xor0 pipe one byte at a time, XORs each byte with 0x55 and writes
 * it to the xor1 pipe. Once the read stops returning data, the write end of
 * xor1 is closed so the next stage sees end-of-file.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* xor0ThrdMain(void* arg){
    PIPESET* pipes = (PIPESET*)arg;
    char     byte;

    for(;;){
        if(read(pipes->xor0[PIPERD], &byte, 1) <= 0){
            break;
        }
        byte ^= 0x55;
        write(pipes->xor1[PIPEWR], &byte, 1);
    }

    close(pipes->xor1[PIPEWR]);
    pthread_exit(NULL);
}
/**
 * Entry point of the second XOR stage.
 *
 * Reads the xor1 pipe one byte at a time, XORs each byte with 0x55 and writes
 * it to the xor2 pipe (which xor22BThrdMain reads). Once the read stops
 * returning data, the write end of xor2 is closed so the next stage sees
 * end-of-file.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* xor1ThrdMain(void* arg){
    PIPESET* pipes = (PIPESET*)arg;
    char     byte;

    for(;;){
        if(read(pipes->xor1[PIPERD], &byte, 1) <= 0){
            break;
        }
        byte ^= 0x55;
        write(pipes->xor2[PIPEWR], &byte, 1);
    }

    close(pipes->xor2[PIPEWR]);
    pthread_exit(NULL);
}
/**
 * XOR2 to B thread main method.
 *
 * Copies everything readable from the xor2 pipe (the output of xor1ThrdMain)
 * into Bin (the pipe handed to process B as its standard input), in chunks of
 * up to 64 KiB.
 *
 * Unlike a bare write(), each chunk is pushed until it has been written
 * completely: a short write is continued from where it stopped and a write
 * interrupted by a signal (EINTR) is retried. On any other write failure the
 * copy stops and the read end of xor2 is closed so that the upstream stage is
 * not left blocked on a pipe nobody drains.
 *
 * The write end of Bin is always closed before the thread ends, so process B
 * sees end-of-file on its standard input.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* xor22BThrdMain(void* arg){
    PIPESET* pipeset = (PIPESET*)arg;
    char     buf[65536];
    ssize_t  bytesRead;
    int      failed = 0;

    while(!failed &&
          (bytesRead = read(pipeset->xor2[PIPERD], buf, sizeof buf)) > 0){
        ssize_t done = 0;

        while(done < bytesRead){
            ssize_t n = write(pipeset->Bin[PIPEWR], buf + done,
                              (size_t)(bytesRead - done));
            if(n > 0){
                done += n;             /* possibly a short write: keep going */
            }else if(n < 0 && errno == EINTR){
                continue;              /* interrupted before writing: retry */
            }else{
                failed = 1;            /* hard error: give up on this stage */
                break;
            }
        }
    }

    if(failed){
        close(pipeset->xor2[PIPERD]);
    }
    close(pipeset->Bin[PIPEWR]);
    pthread_exit(NULL);
}
/**
 * Entry point of the destination thread.
 *
 * Copies the Cout pipe (handed to process C as its standard output) to this
 * process's standard output (descriptor 1), one byte at a time, until the
 * read stops returning data.
 *
 * @param arg  Pointer to the shared PIPESET.
 * @return     Does not return normally; the thread ends via pthread_exit(NULL).
 */
void* dstThrdMain(void* arg){
    PIPESET* pipes = (PIPESET*)arg;
    char     byte;

    for(;;){
        if(read(pipes->Cout[PIPERD], &byte, 1) <= 0){
            break;
        }
        write(1, &byte, 1);
    }

    pthread_exit(NULL);
}
/**
 * Turn on the close-on-exec flag (FD_CLOEXEC) of a descriptor, leaving its
 * other descriptor flags as they are.
 *
 * @param fd  The descriptor to modify.
 */
void setCloExecFlag(int fd){
    int currentFlags = fcntl(fd, F_GETFD);

    fcntl(fd, F_SETFD, currentFlags | FD_CLOEXEC);
}
/**
 * Turn off the close-on-exec flag (FD_CLOEXEC) of a descriptor, leaving its
 * other descriptor flags as they are.
 *
 * @param fd  The descriptor to modify.
 */
void unsetCloExecFlag(int fd){
    int currentFlags = fcntl(fd, F_GETFD);

    fcntl(fd, F_SETFD, currentFlags & ~FD_CLOEXEC);
}
/**
 * Pipe4.
 *
 * Create a pipe with some ends possibly marked close-on-exec.
 *
 * @param [out] fd     Receives the read end in fd[PIPERD] and the write end
 *                     in fd[PIPEWR].
 * @param [in]  flags  Bitwise OR of PIPE4_FLAG_RD_CLOEXEC and/or
 *                     PIPE4_FLAG_WR_CLOEXEC, or PIPE4_FLAG_NONE.
 * @return The return value of pipe(). If pipe() fails, fd[] is left untouched
 *         by this function and no flag is applied.
 */
#define PIPE4_FLAG_NONE (0U)
#define PIPE4_FLAG_RD_CLOEXEC (1U << 0)
#define PIPE4_FLAG_WR_CLOEXEC (1U << 1)
int pipe4(int fd[2], int flags){
    int ret = pipe(fd);

    if(ret != 0){
        /* fd[] holds no valid descriptors; calling fcntl() on it would act
         * on whatever values happen to be in the array. */
        return ret;
    }
    if(flags & PIPE4_FLAG_RD_CLOEXEC){
        setCloExecFlag(fd[PIPERD]);
    }
    if(flags & PIPE4_FLAG_WR_CLOEXEC){
        setCloExecFlag(fd[PIPEWR]);
    }
    return ret;
}
/**
 * Pipe4 explicit derivatives.
 *
 * pipe4_cloexec(fd): create a pipe with both ends marked close-on-exec.
 */
#define pipe4_cloexec(fd) pipe4((fd), PIPE4_FLAG_RD_CLOEXEC|PIPE4_FLAG_WR_CLOEXEC)
/**
 * Popen4.
 *
 * General-case for spawning a process and tethering it with cloexec pipes on stdin,
 * stdout and stderr. The command is run as: /bin/sh -c cmd.
 *
 * @param [in] cmd The command to execute.
 * @param [in/out] pin The pointer to the cloexec pipe for stdin.
 * @param [in/out] pout The pointer to the cloexec pipe for stdout.
 * @param [in/out] perr The pointer to the cloexec pipe for stderr.
 * @param [in] flags A bitwise OR of flags to this function. Available
 * flags are:
 *
 * POPEN4_FLAG_NONE:
 * Explicitly specify no flags.
 * POPEN4_FLAG_NOCLOSE_PARENT_STDIN,
 * POPEN4_FLAG_NOCLOSE_PARENT_STDOUT,
 * POPEN4_FLAG_NOCLOSE_PARENT_STDERR:
 * Don't close pin[PIPERD], pout[PIPEWR] and perr[PIPEWR] in the parent,
 * respectively.
 * POPEN4_FLAG_CLOSE_CHILD_STDIN,
 * POPEN4_FLAG_CLOSE_CHILD_STDOUT,
 * POPEN4_FLAG_CLOSE_CHILD_STDERR:
 * Close the respective streams in the child. Ignores pin, pout and perr
 * entirely. Overrides a NOCLOSE_PARENT flag for the same stream.
 *
 * @return In the parent: the child's pid, or the negative value returned by
 *         fork() if it failed (a message is printed to stderr). The child
 *         never returns from this function: it either becomes the shell or
 *         terminates with status 127 if the exec fails.
 */
#define POPEN4_FLAG_NONE (0U)
#define POPEN4_FLAG_NOCLOSE_PARENT_STDIN (1U << 0)
#define POPEN4_FLAG_NOCLOSE_PARENT_STDOUT (1U << 1)
#define POPEN4_FLAG_NOCLOSE_PARENT_STDERR (1U << 2)
#define POPEN4_FLAG_CLOSE_CHILD_STDIN (1U << 3)
#define POPEN4_FLAG_CLOSE_CHILD_STDOUT (1U << 4)
#define POPEN4_FLAG_CLOSE_CHILD_STDERR (1U << 5)
pid_t popen4(const char* cmd, int pin[2], int pout[2], int perr[2], int flags){
    /********************
     ** FORK PROCESS   **
     ********************/
    pid_t ret = fork();

    if(ret < 0){
        /**
         * Error in fork(), still in parent.
         */
        fprintf(stderr, "fork() failed!\n");
        return ret;
    }

    if(ret == 0){
        /**
         * Child-side of fork
         *
         * The close-on-exec flag is cleared on the pipe end before dup2() so
         * that the stream survives the exec even when the pipe end already
         * has the target descriptor number (dup2() is then a no-op).
         */
        if(flags & POPEN4_FLAG_CLOSE_CHILD_STDIN){
            close(0);
        }else{
            unsetCloExecFlag(pin [PIPERD]);
            dup2(pin [PIPERD], 0);
        }
        if(flags & POPEN4_FLAG_CLOSE_CHILD_STDOUT){
            close(1);
        }else{
            unsetCloExecFlag(pout[PIPEWR]);
            dup2(pout[PIPEWR], 1);
        }
        if(flags & POPEN4_FLAG_CLOSE_CHILD_STDERR){
            close(2);
        }else{
            unsetCloExecFlag(perr[PIPEWR]);
            dup2(perr[PIPEWR], 2);
        }

        /* The terminating null pointer of a variadic call must have pointer
         * type; a bare NULL may be a plain integer 0. */
        execl("/bin/sh", "sh", "-c", cmd, (char*)NULL);

        /* Only reached if execl() failed. */
        fprintf(stderr, "exec() failed!\n");
        /* _exit(), not exit(): the forked child must not run the parent's
         * atexit handlers or flush stdio buffers it inherited. */
        _exit(127);
    }

    /**
     * Parent-side of fork
     *
     * Close the pipe ends that belong to the child, unless the caller asked
     * to keep them or the child's stream was closed instead of tethered.
     */
    if(!(flags & (POPEN4_FLAG_NOCLOSE_PARENT_STDIN |
                  POPEN4_FLAG_CLOSE_CHILD_STDIN))){
        close(pin [PIPERD]);
    }
    if(!(flags & (POPEN4_FLAG_NOCLOSE_PARENT_STDOUT |
                  POPEN4_FLAG_CLOSE_CHILD_STDOUT))){
        close(pout[PIPEWR]);
    }
    if(!(flags & (POPEN4_FLAG_NOCLOSE_PARENT_STDERR |
                  POPEN4_FLAG_CLOSE_CHILD_STDERR))){
        close(perr[PIPEWR]);
    }
    return ret;
}
/**
 * Main Function.
 *
 * Sets up the whole piping scheme: creates all pipes (both ends
 * close-on-exec), spawns the three shell commands, starts the six copier
 * threads, waits for the threads to finish and finally reaps the three child
 * processes.
 *
 * @return 0 on success, EXIT_FAILURE if a pipe could not be created or a
 *         process could not be spawned.
 */
int main(int argc, char* argv[]){
    pthread_t srcThrd, a2xor0Thrd, xor0Thrd, xor1Thrd, xor22BThrd, dstThrd;
    pid_t     gzip, gunzip, cat;
    PIPESET   pipeset;

    /* The command line is not used. */
    (void)argc;
    (void)argv;

    /* Create pipes; stop at the first failure. */
    if(pipe4_cloexec(pipeset.Ain)     < 0 ||
       pipe4_cloexec(pipeset.Aout)    < 0 ||
       pipe4_cloexec(pipeset.Aerr)    < 0 ||
       pipe4_cloexec(pipeset.Bin)     < 0 ||
       pipe4_cloexec(pipeset.BoutCin) < 0 ||
       pipe4_cloexec(pipeset.Berr)    < 0 ||
       pipe4_cloexec(pipeset.Cout)    < 0 ||
       pipe4_cloexec(pipeset.Cerr)    < 0 ||
       pipe4_cloexec(pipeset.xor0)    < 0 ||
       pipe4_cloexec(pipeset.xor1)    < 0 ||
       pipe4_cloexec(pipeset.xor2)    < 0){
        perror("pipe");
        return EXIT_FAILURE;
    }

    /* Spawn processes */
    gzip   = popen4("gzip -c -",   pipeset.Ain,     pipeset.Aout,    pipeset.Aerr, POPEN4_FLAG_NONE);
    gunzip = popen4("gunzip -c -", pipeset.Bin,     pipeset.BoutCin, pipeset.Berr, POPEN4_FLAG_NONE);
    cat    = popen4("cat",         pipeset.BoutCin, pipeset.Cout,    pipeset.Cerr, POPEN4_FLAG_NONE);
    if(gzip < 0 || gunzip < 0 || cat < 0){
        /* popen4() has already reported the fork() failure. */
        return EXIT_FAILURE;
    }

    /* Spawn threads */
    pthread_create(&srcThrd,    NULL, srcThrdMain,    &pipeset);
    pthread_create(&a2xor0Thrd, NULL, a2xor0ThrdMain, &pipeset);
    pthread_create(&xor0Thrd,   NULL, xor0ThrdMain,   &pipeset);
    pthread_create(&xor1Thrd,   NULL, xor1ThrdMain,   &pipeset);
    pthread_create(&xor22BThrd, NULL, xor22BThrdMain, &pipeset);
    pthread_create(&dstThrd,    NULL, dstThrdMain,    &pipeset);

    /* Wait for every stage of the pipeline to drain. */
    pthread_join(srcThrd,    (void**)NULL);
    pthread_join(a2xor0Thrd, (void**)NULL);
    pthread_join(xor0Thrd,   (void**)NULL);
    pthread_join(xor1Thrd,   (void**)NULL);
    pthread_join(xor22BThrd, (void**)NULL);
    pthread_join(dstThrd,    (void**)NULL);

    /* Reap the children so none is left behind as a zombie. */
    waitpid(gzip,   NULL, 0);
    waitpid(gunzip, NULL, 0);
    waitpid(cat,    NULL, 0);

    return 0;
}
Commentary on your own code
There are many issues with your code, most of which have nothing to do with threading.
You don't close() the file descriptor d->gunzip_ptr->in. This means that gunzip can never know that no more input is forthcoming on its stdin, so it will never exit.
Since gunzip doesn't ever exit, it will never close() its stdout, and thus a blocking read() at the other end will never unblock. A non-blocking read will instead always give -1, with errno == EAGAIN.
Your popen3() doesn't close() p_stdin[POPEN3_READ], p_stdout[POPEN3_WRITE] or p_stderr[POPEN3_WRITE] on the parent side of the fork(). Only the child should have those descriptors. Failing to close these means that when the parent itself tries to read the stdout and stderr of the child, it will never see EOF, again for the same reasons as above: Because it itself still owns a write-end pipe in which it could write, making new data appear to the read-end.
Your code implicitly relies on gunzip writing out at least one byte for every 1024 you write in. There is no guarantee that this will be the case, since gunzip may, at its leisure, buffer internally.
This is because your code reads then copies chunks of at most BUF_LENGTH_VALUE bytes into d->in_buf. You then assign the number of bytes you read through fread() to d->n_in_bytes. This same d->n_in_bytes is used in your write() call to write to gunzip's stdin. You then signal for consume_gunzip_chunk() to wake up, then pthread_cond_wait()'s for the next gzip-compressed chunk. But this gzip-compressed chunk may never come, since there is no guarantee that gunzip will be able to unpack useful output from just the first 1024 bytes of input, nor even a guarantee that it will write() it out instead of buffering it until it has, say, 4096 bytes (a full page) of output. Therefore, the read() call in consume_gunzip_chunk() may never succeed (or even return, if read() was blocking). And if read() never returns, then consume_gunzip_chunk() doesn't signal d->in_cond, and so all three threads get stuck. And even if read() is non-blocking, the last block of output from gzip may never come, since gzip's input is never closed, so it doesn't flush out its buffers by write()'ing them out, so read() on the other end will never get useful data and no amount of pleading will elicit it without a close().
POSSIBLE (LIKELY?) CAUSE OF BUG: d->n_out_bytes_read_from_gunzip, once it becomes non-0, will never become 0 again. This means that the extremely baffling
while (d->n_in_bytes != 0 || d->n_out_bytes_read_from_gunzip != 0)
pthread_cond_wait(&d->in_cond, &d->in_lock);
within produce_gzip_chunk() will, once entered with d->n_out_bytes_read_from_gunzip != 0, forever remain stuck. By calling sleep(1) within consume_gunzip_chunk(), which sets d->n_out_bytes_read_from_gunzip, you may have defused the problem by reading all input before consume_gunzip_chunk() could lock up the system by setting d->n_out_bytes_read_from_gunzip to a non-zero value.
There are two threads that call pthread_cond_wait(&d->in_cond, &d->in_lock);, these being produce_gzip_chunk() and consume_gzip_chunk(). There is absolutely no guarantee that when consume_gunzip_chunk() calls pthread_cond_signal(&d->in_cond);, that the "correct" thread (whichever it is in your design) will receive the signal. To ensure that all of them will, use pthread_cond_broadcast(), but then you expose yourself to the thundering herd problem. Needing to use pthread_cond_broadcast() in this situation is, again, a symptom of a bad design in my opinion.
Related, you call pthread_cond_signal(&d->in_cond) within a thread (indeed, a function) in which you call pthread_cond_wait(&d->in_cond, &d->in_lock). What purpose does that serve?
You use d->in_lock for too many disparate purposes, exposing yourself to the possibility of deadlock, or low performance due to excessive protection. In particular you use it as the protection for both d->in_cond and d->out_cond. This is too strong a protection – the output of gunzip into d->in_line should be able to happen simultaneously with the input of gunzip being written into and out of d->in_buf.
Within consume_gunzip_chunk(), you have
while (d->n_in_bytes_written_to_gunzip == 0) {
pthread_cond_wait(&d->out_cond, &d->in_lock);
}
if (d->n_in_bytes_written_to_gunzip) {
...
This if can never fail! Is there a case you may have in mind?
Consider making the entire struct pthread_data volatile (or at least those integer elements which are used by multiple threads), since the compiler might decide to optimize out loads and stores that should, in fact, remain.
Compliments
So as to not sound too negative, I would like to say that in general your problems were not due to misuse of the pthreads API but due to erroneous consumer-producer logic and lack of close()s. Additionally, you appear to understand that pthread_cond_wait() may wake up spuriously, and so you have wrapped it up in a loop that checks the invariant.
In the future:
I would use pipes, even between threads. This absolves you from needing to implement your own consumer-producer scheme; The kernel has solved it for you already, and provides you with the pipe(), read() and write() primitives, which are all you need to take advantage of this ready-made solution. It also makes the code cleaner and void of mutexes and condition variables. One must simply be diligent in closing the ends, and one must be supremely careful around pipes in the presence of fork(). The rules are simple:
If a pipe's write-end exists, a read() on an open read-end will not give EOF but will block or EAGAIN.
If a pipe's write-ends have all been closed, a read() on an open read-end will give EOF.
If a pipe's read-ends have all been closed, a write() to any of its write-ends will cause SIGPIPE.
fork() duplicates the entire process, including all descriptors (modulo maybe crazy stuff in pthread_atfork())!
Ah. So I think I misunderstood the question.... sorry.
I had thought you wanted to run gunzip and then one other internal filter, and wanted to do that 'N' times.
It seems what you really want to do is run many stages of filters, one after the other... some using external commands and some (perhaps ?) internal to the program. Hence the desire to manage some inter-stage buffering.
So... I've had another go at this. The objective is to run any number of stages, starting with the input stage, then external command or internal function "filter" stages, and finally the output stage. Each external command stage has three pthreads -- for stdin, stdout and stderr. Internal function stages use one pthread, and the initial input and final output stages use one pthread each. Between the stages is a small pipe structure (called a "straw") to "double buffer" and decouple the stages... I hope this is closer to what you had in mind.
The "straw" is the essence of the thing:
/*
 * A "straw": the small two-lump pipe placed between two stages.
 *
 * The lump_* functions shown with it lock 'mutex' around every access to the
 * 'free' and 'ready' lists and to the two '*_waiting' flags.
 */
struct straw
{
pthread_mutex_t* mutex ; /* locked by lump_acquire/blow/suck/free */
struct lump* free ; /* list of empty lumps: taken by lump_acquire(), added to by lump_free() */
pthread_cond_t* free_cond ; /* waited on in lump_acquire(), signalled in lump_free() */
bool free_waiting ; /* true only while lump_acquire() is inside pthread_cond_wait() */
struct lump* ready ; /* list of filled lumps: added to by lump_blow(), taken by lump_suck() */
pthread_cond_t* ready_cond ; /* waited on in lump_suck(), signalled in lump_blow() */
bool ready_waiting ; /* true only while lump_suck() is inside pthread_cond_wait() */
struct lump* lumps[2] ; /* the two lumps of this straw; checked by the qassert()s in lump_blow()/lump_free() */
} ;
where a struct lump contains a buffer and what-not. The "straw" has two such "lumps", and at any moment one pthread may be filling one lump, while another is draining the other. Or both lumps may be free (on the free list) or both ready (full) waiting on the ready list.
Then to acquire an empty lump to fill it (e.g. when reading from a pipe):
/*
 * Take an empty lump off the straw's free list, blocking on free_cond until
 * one is available.
 *
 * The returned lump is unlinked (next == NULL), marked empty
 * (ptr == end == buff) and has done == false.
 */
static struct lump*
lump_acquire(struct straw* strw)
{
  struct lump* taken ;

  pthread_mutex_lock(strw->mutex) ;

  for (;;)
    {
      taken = strw->free ;
      if (taken != NULL)
        break ;

      /* Nothing free yet: flag that we are waiting, then sleep. */
      strw->free_waiting = true ;
      pthread_cond_wait(strw->free_cond, strw->mutex) ;
      strw->free_waiting = false ;
    }

  strw->free = taken->next ;            /* pop the head of the free list  */

  pthread_mutex_unlock(strw->mutex) ;

  taken->next = NULL ;                  /* tidy                           */
  taken->ptr  = taken->end = taken->buff ;  /* empty                      */
  taken->done = false ;

  return taken ;
}
Then to blow the completed lump into (one end of) the straw.
/*
 * Hand a filled lump to the consumer side of its straw.
 *
 * Resets ptr to the start of the buffer, links the lump into the straw's
 * ready list under the mutex and, if a consumer is flagged as waiting,
 * signals ready_cond.
 */
static void
lump_blow(struct lump* lmp)
{
  struct straw* owner = lmp->strw ;

  qassert((lmp == owner->lumps[0]) || (lmp == owner->lumps[1])) ;
  qassert( (lmp->buff <= lmp->ptr)
        && (lmp->ptr <= lmp->end)
        && (lmp->end <= lmp->limit) ) ;

  lmp->ptr = lmp->buff ;

  pthread_mutex_lock(owner->mutex) ;

  if (owner->ready != NULL)
    owner->ready->next = lmp ;          /* goes behind the current head   */
  else
    owner->ready = lmp ;                /* list was empty                 */

  lmp->next = NULL ;

  if (owner->ready_waiting)
    pthread_cond_signal(owner->ready_cond) ;

  pthread_mutex_unlock(owner->mutex) ;
}
To suck a lump out of (the other end of) the straw:
/*
 * Take a filled lump off the straw's ready list, blocking on ready_cond
 * until one is available.
 *
 * The returned lump is unlinked (next == NULL) and has ptr reset to the
 * start of its buffer, so its contents are ptr..end.
 */
static struct lump*
lump_suck(struct straw* strw)
{
  struct lump* taken ;

  pthread_mutex_lock(strw->mutex) ;

  for (;;)
    {
      taken = strw->ready ;
      if (taken != NULL)
        break ;

      /* Nothing ready yet: flag that we are waiting, then sleep. */
      strw->ready_waiting = true ;
      pthread_cond_wait(strw->ready_cond, strw->mutex) ;
      strw->ready_waiting = false ;
    }

  strw->ready = taken->next ;           /* pop the head of the ready list */

  pthread_mutex_unlock(strw->mutex) ;

  qassert( (taken->buff <= taken->ptr)
        && (taken->ptr <= taken->end)
        && (taken->end <= taken->limit) ) ;

  taken->ptr  = taken->buff ;           /* contents are ptr..end          */
  taken->next = NULL ;                  /* tidy                           */

  return taken ;
}
And the final piece, freeing a lump once it has been drained:
/*
 * Return a drained lump to the free list of its straw.
 *
 * Under the mutex the lump is linked into the free list, marked empty
 * (ptr == end == buff, done == false) and, if a producer is flagged as
 * waiting, free_cond is signalled.
 */
static void
lump_free(struct lump* lmp)
{
  struct straw* owner = lmp->strw ;

  qassert((lmp == owner->lumps[0]) || (lmp == owner->lumps[1])) ;
  qassert( (lmp->buff <= lmp->ptr)
        && (lmp->ptr <= lmp->end)
        && (lmp->end <= lmp->limit) ) ;

  pthread_mutex_lock(owner->mutex) ;

  if (owner->free != NULL)
    owner->free->next = lmp ;           /* goes behind the current head   */
  else
    owner->free = lmp ;                 /* list was empty                 */

  lmp->next = NULL ;                    /* end of list of free            */
  lmp->ptr  = lmp->end = lmp->buff ;    /* empty                          */
  lmp->done = false ;

  if (owner->free_waiting)
    pthread_cond_signal(owner->free_cond) ;

  pthread_mutex_unlock(owner->mutex) ;
}
The entire program is too big to fit in an answer -- see: pipework.c where that starts:
/*==============================================================================
* pipework.c
*
* Copyright (c) Chris Hall (GMCH) 2014, All rights reserved.
*
* Though you may do what you like with this, provided you recognise that
* it is offered "as is", gratis, and may or may not be fit for any purpose
* whatsoever -- you are on your own.
*
*------------------------------------------------------------------------------
*
* This will read from stdin, pass the data through an arbitrary number of
* "filter" stages and finally write the result to stdout.
*
* A filter stage may be an external command taking a piped stdin and
* outputting to a piped stdout. Anything it says to stderr is collected
* and output to the program's stderr.
*
* A filter stage may also be an internal function.
*
* The input, filter and output stages are implemented as a number of pthreads,
* with internal, miniature pipes (called "straws") between them. All I/O is
* blocking. This is an experiment in the use of pthreads to simplify things.
*
* ============================
* This is v0.08 of 4-Jul-2014
* ============================
*
* The 'main' below runs eight stages: input, 4 commands, 2 internal filters
* and the output. The output should be an exact copy of the input.
*
* In order to test the stderr handling, the following small perl script is
* used as two of the command filters:
*
* chatter.pl
* --------------------------------------------------------
use strict ;
use warnings ;
my $line = 0 ;
while (<STDIN>)
{
my $len = length($_) ;
my $r = rand ;
$line += 1 ;
print STDERR "|$line:$len:$r|" ;
if (int($r * 100) == 0)
{
print STDERR "\n" ;
} ;
print $_ ;
} ;
* --------------------------------------------------------
*
*------------------------------------------------------------------------------
* Limitations
*
* * this will crash if it gets some error its not expecting or not
* designed to overcome. Clearly, to be useful this needs to be more
* robust and more informative.
*
* * some (possible/theoretical) errors are simply ignored.
*
* * no attempt is made to collect up the child processes or to discover
* their return codes. If the child process reports errors or anything
* else on stderr, then that will be visible. But otherwise, if it just
* crashes then the pipeline will run to completion, but the result may
* be nonsense.
*
* * if one of the child processes stalls, the whole thing stalls.
*
* * an I/O error in a stage will send 'end' downstream, but the program
* will continue until everything upstream has completed.
*
* * generally... not intended for production use !!
*/
And the perl script is available as: chatter.pl
HTH
Regarding the part 'How to manage two or more consumers via pthreads?' of your post let me cite these points about 'Designing Threaded Programs':
In general though, in order for a program to take advantage of
Pthreads, it must be able to be organized into discrete, independent
tasks which can execute concurrently. For example, if routine1 and
routine2 can be interchanged, interleaved and/or overlapped in real
time, they are candidates for threading.
and
Several common models for threaded programs exist:
Manager/worker: a single thread, the manager assigns work to other threads, the workers. Typically, the manager handles all input and
parcels out work to the other tasks. At least two forms of the
manager/worker model are common: static worker pool and dynamic
worker pool.
Pipeline: a task is broken into a series of suboperations, each of which is handled in series, but concurrently, by a different thread.
An automobile assembly line best describes this model.
Peer: similar to the manager/worker model, but after the main thread creates other threads, it participates in the work.
Regarding your problem...
The problem I am facing is that I need to add a sleep(1) statement in
consume_gunzip_chunk, before doing the read, in order to get things
working properly.
Eric Lippert Best Practices with Multithreading in C# might not solve it but, they should help you finding the right solution to your multi-threaded program, particularly points 5, and 8:
5.At all costs avoid shared memory. Most threading bugs are caused by a failure to understand real-world shared memory semantics. If you
must make threads, treat them as though they were processes: give them
everything they need to do their work and let them work without
modifying the memory associated with any other thread. Just like a
process doesn't get to modify the memory of any other process.
8.If you use Thread.Sleep with an argument other than zero or one in any production code, you are possibly doing something wrong. Threads
are expensive; you don't pay a worker to sleep, so don't pay a thread
to sleep either. If you are using sleeps to solve a correctness issue
by avoiding a timing problem -- as you appear to be in your code --
then you definitely have done something deeply wrong. Multithreaded
code needs to be correct irrespective of accidents of timing.
Scenario: Say I have 8 files, and I want to sort all the numbers they contain in order from least to greatest. Only leaf processes can sort the numbers that a file contains. These leaf processes must send the sorted data to a parent process via pipes. This parent process will compare the data it receives and send whichever number is smaller to the next process up. It will do this until its pipes are empty.
So think of it as a tree. We have one master process. With 8 files to sort, the Master process will spawn 2 processes off of it (a left and a right). Those two new processes will spawn their own processes. This will happen until there are 8 leaf processes at the bottom. Internal nodes can only hold onto one number. These will pass their number along a series of pipes until they reach the master process. The master process will output its piped contents to a file.
I've included the code here (as it is a bit lengthy but straightforward).
This works if I have 2 files to sort. So we have 1 master process and then two children. The two children sort their file's numbers and then pass them up. The master process then prints out the data in order from the pipes. However, if I add some complexity (4 files), the leaf processes still send their data up, but when the master process begins to read from the internal nodes' pipes, it thinks they are empty and finishes the program without any data.
Any idea why the master process is thinking that its left and right pipes are empty?
Like I said, works great when there is one parent and 2 children. Anymore processes and it fails. (assuming that processing will happen in powers of 2).
NOTE: perror is being used for debugging purposes.
full program here [very messy as I have been doing a lot with it but it will compile.
The updated code in Pastebin is not a compilable function - let alone the complete program. That makes it hard to say what's really wrong.
However, one of the problems is in the code fragment:
if (pipe(upPipe) < 0 || pipe(leftPipe) < 0 || pipe(rightPipe) < 0)
...error exit...
if ((leftPID = fork()) < 0)
...error exit...
if(leftPID == 0){
fMax = ((fMax)/2);
dup2(leftPipe[WRITE], upPipe[WRITE]);
pipe(leftPipe);
pipe(rightPipe);
The call to dup2() is odd; you carefully map the write channel of the left pipe to the write channel of the up pipe.
The two pipe() calls after the dup2() fairly promptly screw up everything in the left child, opening 4 more file descriptors but losing the previous values stored in leftPipe and rightPipe.
You need to make your problem statement clearer. I cannot fathom from what you've got what you're supposed to have. There's a call to convertToInt() which takes no arguments and returns no value; what on earth is that doing? There's a call to freeMem(); it is not clear what that does.
z.c:42: error: ‘numberChar’ undeclared (first use in this function)
z.c:42: error: ‘sizeNumbers’ undeclared (first use in this function)
z.c:43: warning: implicit declaration of function ‘readFile’
z.c:43: error: ‘fileNames’ undeclared (first use in this function)
z.c:45: warning: implicit declaration of function ‘convertToInt’
z.c:46: error: ‘i’ undeclared (first use in this function)
z.c:46: error: ‘numbs’ undeclared (first use in this function)
z.c:47: error: ‘numbers’ undeclared (first use in this function)
z.c:48: warning: implicit declaration of function ‘freeMem’
Sorry, your question is unanswerable because you are not giving us:
The accurate requirements.
The code you've actually got compiling.
Your code does not have a good clean break up of the functions. Do you use a VCS (version control system - such as git)? If not, you should. I made the changed version below - which is essentially a complete rewrite - in 9 check-ins, and should probably have made more smaller check-ins than that. But using a VCS was crucial to me; it allowed me to make changes with confidence, knowing I would not lose anything valuable. And I didn't have to comment code out; I removed the stuff I didn't want. The solution below is 261 lines; the original was about 687 lines in total, including a lot of commented out code; when I'd finished stripping out the comments, etc, it came down to 469 lines.
When I got your code running (and reporting on which files were being opened by each child), I found that there were 2 processes opening each of files 2 and 3 (and since the data files didn't exist at the time, they failed at that point).
The revised code has an almost clean structure; the odd bit is the 'convertToString()' phase which reads binary integers off a pipe and converts them to ASCII output again. It works; I'm not convinced it is elegant. Instead of using an array of hard-coded file names, it takes an arbitrary list of file names from the command line (it does not have to be 8; it has been tested with 0 through 8, and I've no reason to think it won't handle 20 or more). I did a fair amount of testing with:
./piped-merge-sort [1-8]
There is copious diagnostic output. I've used two functions that I find vastly helpful in my work - I have them packaged up with some other related code in a more complex package, but the simple versions of err_error() and err_remark() functions really help me. Note that these versions report the PID of the reporting process for each call. They're also careful to pre-format the message into a string and then write the string in one print to standard error; otherwise, I was getting a lot of interleaved output which was confusing at best.
'Nuff said - here's the code:
#include <assert.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>
/* Error reporting */

/*
 * Fold the result of one snprintf()/vsnprintf() call into the running message
 * length.
 *
 * Those functions return the length the output WOULD have had, which can be
 * larger than the space available; adding that blindly lets the length run
 * past the buffer. The result here never exceeds cap - 1.
 */
static size_t err_advance(char *buffer, size_t cap, size_t used, int written)
{
    if (written < 0)
    {
        /* Output error: drop this piece and keep the string terminated. */
        buffer[used] = '\0';
        return used;
    }
    if ((size_t)written >= cap - used)
        return cap - 1;                 /* piece was truncated */
    return used + (size_t)written;
}

/*
 * Report "<pid>: <formatted message>[: errno = N (text)]" on stderr.
 *
 * The whole line is built in a local buffer first and emitted with a single
 * fprintf() so that messages from concurrent processes do not interleave.
 * Messages longer than the buffer are truncated. The errno suffix is added
 * only when errno was non-zero on entry.
 */
static void err_vremark(const char *fmt, va_list args)
{
    char buffer[256];
    int errnum = errno;     /* capture before any call below can change it */
    size_t used = 0;

    buffer[0] = '\0';
    used = err_advance(buffer, sizeof(buffer), used,
                       snprintf(buffer, sizeof(buffer), "%d: ", (int)getpid()));
    used = err_advance(buffer, sizeof(buffer), used,
                       vsnprintf(buffer + used, sizeof(buffer) - used, fmt, args));
    if (errnum != 0)
        used = err_advance(buffer, sizeof(buffer), used,
                           snprintf(buffer + used, sizeof(buffer) - used,
                                    ": errno = %d (%s)", errnum, strerror(errnum)));
    fprintf(stderr, "%s\n", buffer);
}

/* Report a message (see err_vremark) and terminate the process with status 1. */
static void err_error(const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    err_vremark(fmt, args);
    va_end(args);
    exit(1);
}

/* Report a message (see err_vremark) and return to the caller. */
static void err_remark(const char *fmt, ...)
{
    va_list args;
    va_start(args, fmt);
    err_vremark(fmt, args);
    va_end(args);
}
/* Indices into the two-element descriptor arrays filled in by pipe(). */
enum { READ = 0 };
enum { WRITE = 1 };
/* Not referenced in the part of the file visible here. */
enum { BUFFER_SIZE = 256 };
/*
 * File-scope integer array state. Its users are outside the part of the file
 * visible here; presumably data pointer, elements in use and allocated
 * capacity -- TODO confirm against readFile()/sortArray()/freeMem().
 */
static int *a_data = 0;
static int a_used = 0;
static int a_size = 0;
/* Defined further down in the file. */
void readFile(char const *fileName);
void freeMem(void);
void sortArray(void);
int intcmp(void const *n1, void const *n2);
/* Forward declarations of file-local functions used before their definition. */
static void sortMergeFiles(int fd, int number, char **names);
static void sortOneFile(int fd, const char *file);
static void convertToString(int fd, FILE *fp);
/*
 * Fork once: the child sorts and merges the files named on the command line
 * and writes the result to the master pipe; the parent reads that pipe and
 * prints the values to stdout.
 */
int main(int argc, char **argv)
{
    int m_pipe[2];
    pid_t pid;

    if (pipe(m_pipe) < 0)
        err_error("Failed to create master pipe");

    pid = fork();
    if (pid < 0)
        err_error("Failed to fork master");

    if (pid == 0)
    {
        /* Child: producer; it only needs the write end. */
        close(m_pipe[READ]);
        sortMergeFiles(m_pipe[WRITE], argc - 1, argv + 1);
        close(m_pipe[WRITE]);
        return 0;
    }

    /* Parent: consumer; it only needs the read end. */
    close(m_pipe[WRITE]);
    convertToString(m_pipe[READ], stdout);
    close(m_pipe[READ]);
    return 0;
}
/*
 * Read raw ints from descriptor fd and print each one in decimal, followed
 * by a newline, on stream fp.  Stops at the first read that does not return
 * exactly one whole int (end of input, error, or a short read).
 */
static void convertToString(int fd, FILE *fp)
{
    int number;

    for (;;)
    {
        ssize_t got = read(fd, &number, sizeof(number));
        if (got != (ssize_t)sizeof(number))
            break;
        fprintf(fp, "%d\n", number);
    }
}
/*
 * Read one raw int from descriptor fd into *value.
 * Returns 0 when a whole int was read, EOF otherwise (end of input, error,
 * or a short read).
 */
static int readInteger(int fd, int *value)
{
    ssize_t got = read(fd, value, sizeof(*value));

    return (got == (ssize_t)sizeof(*value)) ? 0 : EOF;
}
/*
 * Write one raw int to descriptor fd.  If the write does not transfer the
 * whole int, the failure is reported and the process exits via err_error().
 */
static void writeInteger(int fd, int value)
{
    ssize_t put = write(fd, &value, sizeof(value));

    if (put == (ssize_t)sizeof(value))
        return;
    err_error("Failed to write integer to fd %d", fd);
}
/*
 * Merge two streams of raw ints, each already in ascending order, from
 * fd_in1 and fd_in2 into a single ascending stream on fd_out.
 *
 * When the two current values are equal the one from fd_in1 is written
 * first.  Every value is traced with err_remark() before it is written.
 */
static void mergeFiles(int fd_in1, int fd_in2, int fd_out)
{
    int left = 0;
    int right = 0;
    int left_state = readInteger(fd_in1, &left);
    int right_state = readInteger(fd_in2, &right);

    /* Both inputs still have a pending value: emit the smaller one. */
    while (left_state != EOF && right_state != EOF)
    {
        err_remark("v1: %d; v2: %d", left, right);
        if (left > right)
        {
            writeInteger(fd_out, right);
            right_state = readInteger(fd_in2, &right);
        }
        else
        {
            writeInteger(fd_out, left);
            left_state = readInteger(fd_in1, &left);
        }
    }

    /* At most one of the two loops below runs: drain whichever input remains. */
    for (; left_state != EOF; left_state = readInteger(fd_in1, &left))
    {
        err_remark("v1: %d", left);
        writeInteger(fd_out, left);
    }
    for (; right_state != EOF; right_state = readInteger(fd_in2, &right))
    {
        err_remark("v2: %d", right);
        writeInteger(fd_out, right);
    }
}
/*
 * Wait for child pid and report whether it exited normally with status 0.
 * Interrupted waits are retried.
 */
static int childSucceeded(pid_t pid)
{
    int status;

    while (waitpid(pid, &status, 0) < 0)
    {
        if (errno != EINTR)
            return 0;
    }
    return WIFEXITED(status) && WEXITSTATUS(status) == 0;
}

/*
 * Sort the integers in the `number` files listed in `names` and write them,
 * in ascending order, as raw ints to descriptor fd.
 *
 *   number == 0 : nothing to do
 *   number == 1 : the file is sorted directly in this process
 *   number >= 2 : the list is split in half, each half is handled by a
 *                 forked child writing to its own pipe, and this process
 *                 merges the two streams onto fd
 *
 * After merging, both children are waited for.  If either did not exit
 * successfully (for example because a file could not be opened) the merged
 * output is incomplete, so this process reports the failure and exits with
 * a non-zero status instead of returning as if all were well.
 */
static void sortMergeFiles(int fd, int number, char **names)
{
    assert(number >= 0);
    if (number == 0)
        return;
    if (number == 1)
    {
        sortOneFile(fd, names[0]);
        return;
    }

    err_remark("Non-Leaf: processing %d files (%s .. %s)", number, names[0], names[number-1]);

    int mid = number / 2;
    int l_pipe[2];
    int r_pipe[2];
    pid_t l_pid;
    pid_t r_pid;

    if (pipe(l_pipe) < 0 || pipe(r_pipe) < 0)
        err_error("Failed to create pipes");

    if ((l_pid = fork()) < 0)
        err_error("Failed to fork left child");
    if (l_pid == 0)
    {
        /* Left child: only needs the write end of its own pipe. */
        close(l_pipe[READ]);
        close(r_pipe[READ]);
        close(r_pipe[WRITE]);
        sortMergeFiles(l_pipe[WRITE], mid, names);
        close(l_pipe[WRITE]);
        exit(0);
    }

    if ((r_pid = fork()) < 0)
        err_error("Failed to fork right child");
    if (r_pid == 0)
    {
        /* Right child: only needs the write end of its own pipe. */
        close(r_pipe[READ]);
        close(l_pipe[READ]);
        close(l_pipe[WRITE]);
        sortMergeFiles(r_pipe[WRITE], number - mid, names + mid);
        close(r_pipe[WRITE]);
        exit(0);
    }

    /* Parent: close the write ends so the reads below can reach EOF. */
    close(l_pipe[WRITE]);
    close(r_pipe[WRITE]);
    mergeFiles(l_pipe[READ], r_pipe[READ], fd);
    close(l_pipe[READ]);
    close(r_pipe[READ]);

    /* Reap both children before deciding, so neither is left unwaited. */
    int l_ok = childSucceeded(l_pid);
    int r_ok = childSucceeded(r_pid);
    if (!l_ok || !r_ok)
    {
        errno = 0;      /* the failure is the child's; do not print a stale errno */
        err_error("Child failed while processing %d files (%s .. %s)", number, names[0], names[number-1]);
    }
    err_remark("Non-Leaf: finished %d files (%s .. %s)", number, names[0], names[number-1]);
}
static void addNumberToArray(int number)
{
assert(a_used >= 0 && a_used <= a_size);
if (a_used == a_size)
{
int n_size = (a_size + 1) * 2;
int *n_data = realloc(a_data, sizeof(*n_data) * n_size);
if (n_data == 0)
err_error("Failed to allocate space for %d numbers", n_size);
a_data = n_data;
a_size = n_size;
}
a_data[a_used++] = number;
}
/*
 * Write every stored number to descriptor fd as a raw int, tracing each one
 * with err_remark() first.
 *
 * Each value goes through writeInteger(), so a failed or short write is
 * reported and terminates the process instead of being silently ignored.
 * (A write to a pipe with no reader normally raises SIGPIPE before any
 * return value can be inspected.)
 *
 * Without the per-value trace this could be a single
 * write(fd, a_data, a_used * sizeof(int)).
 */
static void writeArray(int fd)
{
    for (int i = 0; i < a_used; i++)
    {
        err_remark("Write: %d", a_data[i]);
        writeInteger(fd, a_data[i]);
    }
}
/*
 * Read the text file `fileName` line by line and append the integer value
 * of each line to the global array.
 *
 * Each line is converted with atoi(), so no validation of the text is
 * performed.  If the file cannot be opened the process exits via
 * err_error().
 */
void readFile(char const *fileName)
{
    char line[BUFFER_SIZE];
    FILE *in = fopen(fileName, "r");

    if (in == NULL)
        err_error("Failed to open file %s for reading", fileName);

    while (fgets(line, sizeof(line), in) != NULL)
    {
        /* Cut the line at its newline, if it has one. */
        line[strcspn(line, "\n")] = '\0';
        err_remark("Line: %s", line);
        addNumberToArray(atoi(line));
    }

    fclose(in);
}
/*
 * qsort()-compatible comparison of two ints.
 * Returns -1, 0 or 1 when *n1 is less than, equal to or greater than *n2.
 * Comparing rather than subtracting avoids signed overflow.
 */
int intcmp(const void *n1, const void *n2)
{
    const int *lhs = n1;
    const int *rhs = n2;

    if (*lhs < *rhs)
        return -1;
    if (*lhs > *rhs)
        return 1;
    return 0;
}
/* Sort the numbers currently held in the global array into ascending order. */
void sortArray(void)
{
    size_t count = a_used;

    qsort(a_data, count, sizeof(a_data[0]), intcmp);
}
void freeMem(void)
{
free(a_data);
}
/*
 * Leaf step of the sort: load the numbers from `file`, sort them, write the
 * sorted sequence as raw ints to descriptor fd, then release the storage.
 * Progress is traced with err_remark() at the start and at the end.
 */
static void sortOneFile(int fd, const char *file)
{
    err_remark("Leaf: processing file %s", file);

    readFile(file);         /* fill the global array */
    sortArray();            /* order it */
    writeArray(fd);         /* emit it */
    freeMem();              /* and let go of it */

    err_remark("Leaf: finished file %s", file);
}
I have some gzipped files that I want to read in C via fopen and fscanf. Is there any way to do this without having to gunzip the files to temporary files?
Thanks.
You can use zlib (libz) to open the gzipped files directly.
It also offers a "gzopen" function that behaves similarly to fopen but operates on gzipped files. However, fscanf will not work on such a handle, since it expects a normal FILE pointer.
If popen is fair game, you can do it with fopen and fscanf:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
/*
 * Return a newly allocated copy of `arg` wrapped in single quotes so that a
 * POSIX shell treats it as one literal word.  A single quote inside `arg`
 * is written as '\'' (close quote, escaped quote, reopen quote).
 * Returns NULL if memory cannot be allocated; the caller frees the result.
 */
static char *shell_quote(const char *arg)
{
    size_t len = 2;                         /* opening and closing quote */
    const char *p;
    char *out;
    char *w;

    for (p = arg; *p != '\0'; p++)
        len += (*p == '\'') ? 4 : 1;
    out = malloc(len + 1);
    if (!out)
        return NULL;

    w = out;
    *w++ = '\'';
    for (p = arg; *p != '\0'; p++) {
        if (*p == '\'') {
            memcpy(w, "'\\''", 4);
            w += 4;
        } else {
            *w++ = *p;
        }
    }
    *w++ = '\'';
    *w = '\0';
    return out;
}

/*
 * Read a gzipped text file through "zcat" and print every
 * whitespace-separated word it contains.
 *
 * Usage: prog file
 *
 * The file name is quoted before it is handed to the shell, so shell
 * metacharacters in it are not interpreted.  NOTE(review): a name that
 * begins with '-' is still seen by zcat as an option.
 *
 * Returns 0 on success; 1 on a usage error, allocation failure, read
 * failure, or when zcat itself does not exit successfully.
 */
int main(int argc, char *argv[])
{
    const char prefix[] = "zcat ";
    char *quoted;
    char *cmd;
    size_t cmd_size;
    FILE *in;
    char buf[4096];
    int status = 0;
    int rc;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s file\n", argv[0]);
        return 1;
    }

    quoted = shell_quote(argv[1]);
    if (!quoted) {
        fprintf(stderr, "%s: malloc: %s\n", argv[0], strerror(errno));
        return 1;
    }

    cmd_size = sizeof(prefix) + strlen(quoted);     /* sizeof counts the NUL */
    cmd = malloc(cmd_size);
    if (!cmd) {
        fprintf(stderr, "%s: malloc: %s\n", argv[0], strerror(errno));
        free(quoted);
        return 1;
    }
    snprintf(cmd, cmd_size, "%s%s", prefix, quoted);
    free(quoted);

    in = popen(cmd, "r");
    if (!in) {
        fprintf(stderr, "%s: popen: %s\n", argv[0], strerror(errno));
        free(cmd);
        return 1;
    }
    free(cmd);

    /* The field width is sizeof(buf) - 1; longer words arrive in pieces. */
    while (fscanf(in, "%4095s", buf) == 1)
        printf("%s: got [%s]\n", argv[0], buf);

    if (ferror(in)) {
        fprintf(stderr, "%s: fscanf: %s\n", argv[0], strerror(errno));
        status = 1;
    }
    else if (!feof(in)) {
        fprintf(stderr, "%s: %s: unconsumed input\n", argv[0], argv[1]);
        status = 1;
    }

    /* pclose() reaps the child; a non-zero result means zcat did not succeed. */
    rc = pclose(in);
    if (rc == -1) {
        fprintf(stderr, "%s: pclose: %s\n", argv[0], strerror(errno));
        status = 1;
    }
    else if (rc != 0) {
        fprintf(stderr, "%s: %s: zcat failed\n", argv[0], argv[1]);
        status = 1;
    }
    return status;
}
For example:
$ zcat file.gz
Every good boy does fine.
$ ./gzread file.gz
./gzread: got [Every]
./gzread: got [good]
./gzread: got [boy]
./gzread: got [does]
./gzread: got [fine.]
Do not use
sprintf(cmd, "zcat %s", argv[1]);
popen(cmd,"r");
to open .gz files. Properly escape argv[1] instead. Otherwise you may end up with a command-injection vulnerability, especially when someone injects an argument argv[1] such as
123;rm -rf /
It already helps to change the above instruction into
sprintf(cmd, "zcat \'%s\'",argv[1]);
You may also want to escape characters such as '\0', '\'', '\;' etc.
Newbie attempt at gzscanf():
#include <stdio.h>
#include <stdarg.h>
#include <zlib.h>
#define MAXLEN 256

/*
 * fscanf()-like helper for zlib streams.
 *
 * Reads one line (at most MAXLEN - 1 characters; longer lines are consumed
 * in pieces by successive calls) from `stream` with gzgets() and parses it
 * with vsscanf() according to `fmt`.
 *
 *   stream - handle returned by gzopen()/gzdopen(); gzFile is the handle
 *            type itself, so it is passed by value exactly as gzgets()
 *            expects
 *   fmt    - scanf-style format, followed by the matching pointer arguments
 *
 * Returns the number of items assigned, or EOF when no line could be read
 * (end of file or read error), mirroring fscanf() so callers can loop until
 * the input is exhausted.
 */
int gzscanf(gzFile stream, const char *fmt, ...) {
    char buf[MAXLEN];
    va_list args;
    int n;

    if (NULL == gzgets(stream, buf, MAXLEN))
        return EOF;

    va_start(args, fmt);
    n = vsscanf(buf, fmt, args);
    va_end(args);
    return n;
}
You can use zlib and wrap it in a regular file pointer; this way you can use fscanf, fread, etc. transparently.
#ifdef WITH_ZLIB
/*
 * Adapters between the callback signatures funopen() requires and the zlib
 * gz* functions.  The zlib functions have different parameter and return
 * types, so they are wrapped rather than cast: calling a function through a
 * pointer of an incompatible type is undefined behavior.
 */
static int myfopen_read(void *cookie, char *buf, int len)
{
    return gzread((gzFile)cookie, buf, (unsigned)len);
}

static int myfopen_write(void *cookie, const char *buf, int len)
{
    int n = gzwrite((gzFile)cookie, buf, (unsigned)len);

    /* gzwrite() reports failure as 0; stdio expects -1. */
    return (n == 0 && len > 0) ? -1 : n;
}

static fpos_t myfopen_seek(void *cookie, fpos_t offset, int whence)
{
    return (fpos_t)gzseek((gzFile)cookie, (z_off_t)offset, whence);
}

static int myfopen_close(void *cookie)
{
    /* gzclose() returns Z_OK (0) or a zlib error code; stdio expects 0 or -1. */
    return (gzclose((gzFile)cookie) == Z_OK) ? 0 : -1;
}
#endif

/*
 * fopen() replacement that, when built with WITH_ZLIB, routes the stream
 * through zlib so that ordinary stdio calls (fscanf, fread, ...) operate on
 * gzip data.
 *
 * If gzopen() fails the file is opened with plain fopen() instead.  Without
 * WITH_ZLIB this is exactly fopen().
 *
 * Returns the stream, or NULL on failure.  Close it with fclose().
 */
FILE *myfopen(const char *path, const char *mode)
{
#ifdef WITH_ZLIB
    gzFile zfp;
    FILE *fp;

    /* try gzopen */
    zfp = gzopen(path, mode);
    if (zfp == NULL)
        return fopen(path, mode);

    /* open file pointer */
    fp = funopen(zfp, myfopen_read, myfopen_write, myfopen_seek, myfopen_close);
    if (fp == NULL)
        gzclose(zfp);       /* do not leak the zlib handle */
    return fp;
#else
    return fopen(path, mode);
#endif
}
You can use zlib, but it will require you to replace your I/O calls to be zlib-specific.
You have to open a pipe to do this. The basic flow in pseudocode is:
create pipe // man pipe
fork // man fork
if (parent) {
close the writing end of the pipe // man 2 close
read from the pipe // man 2 read
} else if (child) {
close the reading end of the pipe // man 2 close
overwrite the file descriptor for stdout with the writing end of the pipe // man dup2
call exec() with gzip and the relevant parameters // man 3 exec
}
You can use the man pages in the comments for more details on how to do this.
It's quite simple to use zlib to open .gz files. There's a reasonable manual over at zlib.net.
Here's a quick example to get you started:
#include <stdio.h>
#include <zlib.h>
/*
 * Demonstration of reading a gzip file with zlib: two text lines followed
 * by a binary blob of 64 ints are read from the file named by argv[1].
 * The two lines are printed only if all three reads succeed.
 * Always returns 0.
 */
int main( int argc, char **argv )
{
    // we're reading 2 text lines, and a binary blob from the given file
    char line1[1024];
    char line2[1024];
    int blob[64];

    // nothing to do without a file name
    if (argc <= 1)
        return 0;

    const char *filename = argv[1];
    gzFile gz_in = gzopen( filename, "rb" );    // counterpart of fopen()
    if (gz_in == NULL)
    {
        printf( "Failed to GZ-open [%s]\n", filename );
        return 0;
    }

    // gzgets() ~ fgets(), gzfread() ~ fread(); evaluation stops at the first failure
    if ( gzgets( gz_in, line1, sizeof(line1) ) != NULL
         && gzgets( gz_in, line2, sizeof(line2) ) != NULL
         && gzfread( blob, sizeof(int), 64, gz_in ) == 64 )
    {
        printf("Line1: %s", line1);
        printf("Line2: %s", line2);
        // ...etc
    }

    gzclose(gz_in);     // counterpart of fclose()
    return 0;
}
Remember to link with zlib, under UNIX gcc ... -lz