I'm writing a program that will read from /etc/passwd and output the username and shell.
For example, here is the first line:
root:x:0:0:root:/root:/bin/bash
I need to only output the user and the shell. In this instance it would print:
root:/bin/bash
The values are seperated by : so I just need to print the string before the first : and the string after the 6th :. How can I do that?
Here is the code I have so far:
#include <unistd.h>
#include <fcntl.h>
//printf and scanf prototype
int printf(const char *text, ...);
int scanf(const char *format, ...);
int main(void)
{
int fd;
int buff_size = 1;
char buff[512];
int size;
fd = open("/etc/passwd",O_RDONLY);
if (fd < 0)
{
printf("Error opening file \n");
return -1;
}
while ((size = read(fd,buff,1))>0)
{
buff[1] = '\0';
printf("%s",buff);
}
}
So far the code reads from /etc/passwd and prints out the whole file line by line. I need to configure the printf statement to just print the user and shell for each instance.
I just need to print the string before the first : and the string after the 6th :.
Filter the string, put the needed data into an output buffer, and print the output buffer.
Since you already used char buff[512]; as the original data buffer, then just create another buffer for output. i.e. char outbuff[512];
// char buff[512]; assume already defined as that in PO
// char outbuff[512]; assume already defined as that in PO
void customized_print(char* buff, char* outbuff) {
int i = 0, j = 0, cnt = 0;
while (i < 512) { // copy until first ':'
if((outbuff[j++]=buff[i++]) == ':') break;
}
// skip 5 more ':'
while (i < 512 && cnt < 5) {
if(buff[i++] == ':') cnt++;
}
// copy the remains
while (i < 512) {
if( (outbuff[j++]=buff[i++]) == '\0' ) break;
}
printf("%s\n", outbuff);
outbuff[0] = 0; // clean the outbuff
}
Another solution with a demo of full code.
If you do not need to use buff. You can use the following solution.
#include <stdio.h>
void customized_print(char* buff, char* outbuff) {
int cnt = 0;
char* p = outbuff;
while (*buff) { // copy until first ':'
if((*p++=*buff++) == ':') break;
}
// skip 5 more ':'
while (*buff && cnt < 5) {
if(*buff++ == ':') cnt++;
}
// copy the remains
while (*buff) {
if( (*p++=*buff++) == '\0' ) break;
}
printf("%s\n", outbuff);
outbuff[0]=0; // clean the outbuff
}
int main(){
char outbuff[512];
customized_print("root:x:0:0:root:/root:/bin/bash", outbuff);
customized_print("foo:x:0:0:root:/root:/bin/bash", outbuff);
return 0;
}
Compile and run:
gcc -Wall demo.c -o demo.out
./demo.out
Output:
root:/bin/bash
foo:/bin/bashh
Related
I was writing a tool that replaces enough spaces with tab,
but it counts more spaces than expected.
Algorithm should count spaces until seeing character or
newline, if there is a character or newline, it passes to next
element of string; but unfortunately continues to count spaces instead
of passing newline or character.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#define BUFFER_SIZE 100
int set_tab_init_arg(char *filename, int space_size)
{
struct stat filest;
FILE *fp = fopen(filename, "r+");
int fd = fileno(fp);
int line = 0, space = 0, var = 0, offset = 0;
if (fstat(fd, &filest) < 0)
return -1;
char *filecontent = (char*)malloc(filest.st_size);
char buff[BUFFER_SIZE];
if (fp == NULL) {
fprintf(stderr, "cyntax: invalid file pointer!\n");
return -1;
}
while (fgets(buff, BUFFER_SIZE, fp) != NULL)
strcat(filecontent, buff);
char *tmpcontent, *willbewrited = filecontent;
for (int x = 0; x < strlen(filecontent); x++) {
if (filecontent[x] == 32) {
printf("space\n");
space++;
}
else if (filecontent[x++] == 10) {
line++;
printf("newline: %d\n", line);
space = 0;
}
else {
printf("char\n");
x++;
var++;
}
if (space == space_size) {
replace_spaces_to_tab();
printf("space = space_size\n");
}
filecontent++, offset++;
}
}
here is file I tested on:
a
b
c
d
e f g
c h e
p ğ a
and debug result is here (stdout):
space
newline: 1
newline: 2
newline: 3
newline: 4
space
space
space = space_size
newline: 5
space
space
space = space_size
newline: 6
space
char
I executed with parameters:
./cyntax try -t 2
Note-1: try is file and -t 2 gets space size for converting enough spaces to tab.
Note-2: I used printf() function for debugging.
I think your problems were created by:
incrementing x additionally in the for loop x++; ( I believe this was already fixed in the comments)
incrementing the filecontent pointer at the end of your for loop filecontent++;
I code on linux. That's why I had to change your line break detection slightly. (Do line endings differ between Windows and Linux?). not relevant to your problem though...
My Solution
I markes my changes with CHANGE::
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#define BUFFER_SIZE 100
int set_tab_init_arg(char *filename, int space_size)
{
struct stat filest;
FILE *fp = fopen(filename, "r+");
int fd = fileno(fp);
int line = 0, space = 0, var = 0, offset = 0;
if (fstat(fd, &filest) < 0)
return -1;
char *filecontent = (char*)malloc(filest.st_size);
char buff[BUFFER_SIZE];
if (fp == NULL) {
fprintf(stderr, "cyntax: invalid file pointer!\n");
return -1;
}
while (fgets(buff, BUFFER_SIZE, fp) != NULL)
strcat(filecontent, buff);
char *tmpcontent, *willbewrited = filecontent;
for (int x = 0; x < strlen(filecontent); x++) {
if (filecontent[x] == 32) {
printf("space\n");
space++;
}
else if (filecontent[x] == 10) { // CHANGE:: on linux. for windows filcontent[x++] is right :)
line++;
printf("newline: %d\n", line);
space = 0;
}
else {
printf("char\n");
//x++; //CHANGE:: removed else you would skip after chars
var++;
space = 0;
}
if (space == space_size) {
// replace_spaces_to_tab(); //CHANGE:: commented just to be able to compile
printf("space = space_size\n");
}
// filecontent++; //CHANGE:: removed else you would skip chars
offset++;
}
}
//CHANGE:: needed a main...
int main(){
set_tab_init_arg("try",2 );
}
compiled with gcc on linux fedora:
gcc -Wall -Wextra -g -o a.out main.c
testfile:
a
b
c
d
e f g
c h e
p ğ a
my output to your testfile:
% ./a.out
space
char
newline: 1
space
char
newline: 2
space
char
newline: 3
space
char
newline: 4
space
char
space
char
space
char
newline: 5
space
char
space
char
space
char
newline: 6
space
char
space
char
char
space
char
newline: 7
I am new to C, this is my first project and have been teaching myself. Within my program, one of my functions needs to read a line from a file, and store it in a char array. When I trace the program with gdb the array (line[]) is simply zeros. This leads to my program returning the error "Error: a line in the asset file lacks a ':' separator\n"
Here is my code:
//return the line number (0 based) that the cmd is on, -1 if absent
int locateCmd(char cmd[]) {
int lineIndex = -1; //-1, because lineIndex is incramented before the posible return
char cmdTemp[10] = "\0";
//create a compareable cmd with correct cmd that has its remaining values zeroed out
char cmdCmp[10] = "\0";
memset(cmdCmp, 0, sizeof(cmdCmp));
for (int i = 0; i < strlen(cmd); i++) {
cmdCmp[i] = cmd[i];
}
FILE *file = fopen(ASSET_FILE, "r");
//loop until target line is reached
while (strcmp(cmdTemp, cmdCmp) != 0) {
//check if last line is read
if (lineIndex == lineCounter(file)-1) {
return -1;
}
memset(cmdTemp, 0, sizeof(cmdTemp));
char line[61];
fgets(line, 61, file);
//set cmdTemp to the command on current line
lineIndex++;
for (int i = 0; line[i] != ':'; i++) {
cmdTemp[i] = line[i];
//return error if line doesn't contain a ':'
if (line[i] = '\n') {
printf("Error: a line in the asset file lacks a ':' separator\n");
exit(1);
}
}
}
return lineIndex;
}
Some context, this function is passed a command, and its job is to read a document that appears like this:
command:aBunchOfInfoOnTheComand
anotherCommand:aBunchOfInfoOnTheComand
and pick out the line that the passed command (cmd[]) is stored on.
The issue is with the fgets on line 24. I have separated the relevant portion of this code out into a smaller test program and it works fine.
The test program that works is:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main (int argc, char *argv[]) {
FILE *file = fopen("tutorInfo.txt", "r");
char line[61];
fgets(line, 61, file);
printf("%s\n", line);
}
The proper exicution of my test program leads me to believe other code in my function is causing the issue, but i'm not sure what. It may be important to note, the problematic code has the same imports as my sample program. Any help would be much appreciated.
As OP didn't provide a Minimal, Complete, and Verifiable example, I have to base my answer on the functional description provided in the question.
I already covered some error and corner cases, but I'm sure I missed some. The approach is also inefficient, as the file is read over and over again, instead of parsing it once and returning a hash/map/directory for easy lookup. In real life code I would use something like GLib instead of wasting my time trying to re-invent the wheel(s)...
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define LINE_BUFFER_LENGTH 200
unsigned int locateCmd(FILE *fh, const char *key, const char **cmd_line) {
unsigned int found = 0;
size_t key_length = strlen(key);
*cmd_line = NULL;
/* make sure to start read from start of file */
rewind(fh);
unsigned int line_no = 0;
static char buffer[LINE_BUFFER_LENGTH];
while (!feof(fh) && (found == 0)) {
// NOTE: EOF condition will be checked on the next iteration
fgets(buffer, sizeof(buffer), fh);
size_t length = strlen(buffer);
line_no++;
if (buffer[length - 1] != '\n') {
printf("line %u is too long, aborting!\n", line_no);
return(0);
}
if ((strncmp(key, buffer, key_length) == 0) &&
(buffer[key_length] == ':')) {
found = line_no;
buffer[length - 1] = '\0'; // strip line ending
*cmd_line = &buffer[key_length + 1];
}
}
return(found);
}
int main(int argc, char *argv[]) {
FILE *fh = fopen("dummy.txt", "r");
if (!fh) {
perror("file open");
return(1);
}
int ret = 0;
while (--argc > 0) {
const char *cmd;
const char *key = *++argv;
unsigned line_no = locateCmd(fh, key, &cmd);
if (line_no != 0) {
printf("key '%s' found on line %u: %s\n", key, line_no, cmd);
ret = 0;
} else {
printf("key '%s' not found!\n", key);
};
}
if (fclose(fh) != 0) {
perror("file close");
return(1);
}
return(ret);
}
Test input dummy.txt:
command:aBunchOfInfoOnTheComand
anotherCommand:aBunchOfInfoOnTheComand
brokenline
foo:bar
toolong:sadflkjaLKFJASDJFLKASJDFLKSAJ DLFKJ SLDKJFLKASDFASDFKJASKLDJFLKASJDFLKJASDLKFJASLKDFJLKASDJFLKASJDLFKJASDKLFJKLASDJFLKSAJDFLKJASDLKFJKLASDJFLKASJDFKLJASDLKFJLKASDJFLKASJDFLKJSADLKFJASLKDJFLKC
Some test runs:
$ gcc -Wall -o dummy dummy.c
$ ./dummy command foo bar
key 'command' found on line 1: aBunchOfInfoOnTheComand
key 'foo' found on line 5: bar
line 6 is too long, aborting!
key 'bar' not found!
My instructor gave us a basic shell in C to expand upon, and I'm currently working on getting the shell to change directories whenever the user enters 'cd [directory]' into the command line. I've gotten it to stop seg faulting, but it won't change directories. Can anyone tell me why it isn't working?
Here is my code so far:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
/* Array holds arguments: args[0] is the command. */
static char *args[512];
pid_t pid;
int command_pipe[2];
#define READ 0
#define WRITE 1
int chdir(const char* path);
static int
command (int input, int first, int last)
{
int pipettes[2];
/* Invoke pipe */
pipe (pipettes);
pid = fork ();
if (pid == 0)
{
if (first == 1 && last == 0 && input == 0)
{
// 1st cmd
dup2 (pipettes[WRITE], STDOUT_FILENO);
}
else if (first == 0 && last == 0 && input != 0)
{
// Mid cmd
dup2 (input, STDIN_FILENO);
dup2 (pipettes[WRITE], STDOUT_FILENO);
}
else
{
// Last cmd
dup2 (input, STDIN_FILENO);
}
if (execvp (args[0], args) == -1)
_exit (EXIT_FAILURE); // If child fails
}
if (input != 0)
close (input);
close (pipettes[WRITE]);
// If last command, nothing more needs to be read
if (last == 1)
close (pipettes[READ]);
return pipettes[READ];
}
static void
cleanup (int n)
{
int i;
for (i = 0; i < n; ++i)
wait (NULL);
}
static int go (char *cmd, int input, int first, int last);
static char line[1024];
static int n = 0;
int
main (int argc, char* argv[])
{
while (1)
{
/* Initial Prompt */
printf ("?> ");
fflush (NULL);
/* Read in command */
if (!fgets (line, 1024, stdin))
return 0;
int input = 0;
int first = 1;
char *cmd = line;
char *next = strchr (cmd, '|'); /* Find initial '|' */
char *also = strchr (cmd, ';'); /* Find initial ';' */
char *directory = argv[1];
while (next != NULL)
{
/* 'next' points to '|' */
*next = '\0';
input = go (cmd, input, first, 0);
cmd = next + 1;
next = strchr (cmd, '|'); /* Find next '|' */
first = 0;
}
if(argv[0] == "cd"){
chdir(directory);
}
input = go (cmd, input, first, 1);
cleanup (n);
n = 0;
}
return 0;
}
static char *
skip_white_space (char *s)
{
while (isspace (*s))
++s;
return s;
}
static void
parse (char *cmd)
{
cmd = skip_white_space (cmd);
char *next = strchr (cmd, ' ');
int i = 0;
while (next != NULL)
{
next[0] = '\0';
args[i] = cmd;
++i;
cmd = skip_white_space (next + 1);
next = strchr (cmd, ' ');
}
if (cmd[0] != '\0')
{
args[i] = cmd;
next = strchr (cmd, '\n');
next[0] = '\0';
++i;
}
args[i] = NULL;
}
static int
go (char *cmd, int input, int first, int last)
{
parse (cmd);
if (args[0] != NULL)
{
if (strcmp (args[0], "exit") == 0)
exit (0);
n += 1;
return command (input, first, last);
}
return 0;
}
Your immediate problem seems to lie here:
if(argv[0] == "cd"){
chdir(directory);
I think you'll find that argv[0] is the implementation's representation of your program name, not the command you just entered, which is probably in args. Or cmd. Or somewhere.
Even once you fix that, you shouldn't be using == for string comparisons in C. One of the strcmp family is the correct way to do it.
I have to write a program that takes a file name from the command line.
It then read several bytes from the file, looking for strings of printable characters (ASCII values between 32 and 126 decimal).
Then print out the strings.
A string is a run of at least 4 consecutive printable characters and ends whenever a non-printable character is encountered.
Whenever such a string is found, print it out on a new line.
What I have so far is:
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[]){
FILE *fp;
fp = fopen(argv[1], "rb");
char buffer;
while(fp != NULL)
{
fread(&buffer, 1, 1, fp);
}
fclose(fp);
}
What I think this does it take the program from the command line and read in all the bytes of the file 1 by 1 and store them into buffer.
Now I need to check each part of the array and see if each element is between 32 and 136.
If it is, I add those bytes to another array until there is a byte not in this range.
Do this for the entirety of the buffer array.
Is this a code approach and is this right so far?
Change the while loop a bit. What you're checking is whether the file exits or not in a loop which won't fetch the required result you want.
fp is comapared with NULL to find out if the file opening is succesful or not as fopen returns address of the file if it opens a file or NULL saying something went wrong.
if( fp == NULL )
{
perror("Error while opening the file\n");
exit(0);
}
What you want do is following lines:
while( ( ch = fgetc(fp) ) != EOF ) { // reads character by character from the file
if((ch <32) || (ch>136)) // check if it is in range to printed
break;
else
printf("%c",ch); // format whoever you want
}
If I understand you correctly, you want your program to read characters from a file (the file might contain non-printable characters), and check if the character falls in the range of 32 to 126 (printable character). If it is, then add that character to a buffer and read more characters until a non-printable character is found. It should also make sure that the string should have at least 4 characters; string should be printed on a newline.
Here is the code that might help you. It was compiled with gcc, and I hope it works for you too.
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[]) {
FILE *fp;
char buf[100], ch; //the size of the array would vary according to your need
int i=0;
//check for enough arguments
if(argc<2)
{
printf("\nInsufficient Arguments.\n");
printf("\nUsage: PrintChar <file>\n\n");
return 1;
}
//open the file in binary mode and check for exisitence of the file
if((fp = fopen(argv[1], "rb"))== NULL)
{
printf("\nError: Unable to open the file.\n");
return 2;
}
i=0;
while( (ch = fgetc(fp))!=EOF )
{
//check for the range
if(ch>=32 && ch<=126)
{
buf[i] = ch; i++;
//This loop will run till it find a next unprintable character (not between the range of 32 and 126
//we also check for the EOF while reading the characters
while( ( (ch = fgetc(fp))>=32 && ch<=126 ) && ch!=EOF )
{
buf[i] = ch; i++;
}
buf[i] = '\0'; //adding the NULL character
//if the string is at least of 4 letters, print it
if(i>=4)
printf("\n%s", buf);
//reset the counter
i=0;
}
}
fclose(fp);
return 0;
}
File contents - test.txt, that I used:
---------------------------------------------------------------
This is a string
anotherline of text #$%#$%#$% #$% #$%345#$$%&$&##$!##~#######
!∞▬345345µ∞#452353453$%##$%#$%$%%^&%^*4234346443754754451}
and this is the output of the program:
C:\Users\shine\Documents\MYCPROGS\forStackoverflow>printchar test.txt
This is a string
anotherline of text #$%#$%#$% #$% #$%345#$$%&$&##$!##~#######
345#$%##$%##452353453$%##$%#$%$%%^&%^*4234346443754754451}
345345
-------------------------------------------------------------------
Hope this would be helpful. I made this is a hurry, so please let me know if you find something wrong in it.
Read one character each time, write when we find long enough string or have to:
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
int
main(int argc, char *argv[])
{
size_t min_str_len = 4;
size_t buf_len = 4; /* Must greater than or equal to min_str_len */
char buf[buf_len], ch;
size_t out_len, last_len;
last_len = out_len = 0;
while (fread(&ch, 1, 1, stdin) > 0) {
if (isprint(ch)) {
buf[out_len++] = ch;
if (out_len >= buf_len) {
fwrite(buf, 1, out_len, stdout);
last_len += out_len;
out_len = 0;
}
}
else {
if (out_len + last_len >= min_str_len) {
fwrite(buf, 1, out_len, stdout);
#ifdef NEWLINE
fwrite("\n", 1, 1, stdout);
#endif
last_len = out_len = 0;
}
else {
out_len = 0;
}
}
}
exit(EXIT_SUCCESS);
}
If you want to read more than one byte each time, this "at least 4 consecutive printable characters" will make it a little bit tricky:
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
int
main(int argc, char *argv[])
{
size_t str_min_len = 4;
size_t buf_len = 1024; /* Must greater than or equal to str_min_len */
char in_buf[buf_len], out_buf[buf_len];
size_t out_len, in_len, last_len;
last_len = out_len = 0;
while ((in_len = fread(in_buf, 1, buf_len, stdin)) > 0) {
assert(out_len == 0);
for (size_t i = 0; i < in_len; i++) {
char ch = in_buf[i];
if (isprint(ch)) {
out_buf[out_len++] = ch;
}
else {
if (out_len + last_len >= str_min_len) {
fwrite(out_buf, 1, out_len, stdout);
#ifdef NEWLINE
/* Write a newline between strings. */
fwrite("\n", 1, 1, stdout);
#endif
last_len = 0;
}
out_len = 0;
}
}
if (0 < out_len && out_len < str_min_len) {
size_t pad_len = str_min_len - out_len;
for (size_t i = 0; i < pad_len; i++) {
char ch;
if (fread(&ch, 1, 1, stdin) < 1) {
exit(EXIT_SUCCESS);
}
else if (isprint(ch)) {
out_buf[out_len++] = ch;
}
else {
break;
}
}
}
if (out_len >= str_min_len) {
fwrite(out_buf, 1, out_len, stdout);
last_len = out_len;
out_len = 0;
}
else {
last_len = out_len = 0;
}
}
exit(EXIT_SUCCESS);
}
I am pulling data from a bzip2 stream within a C application. As chunks of data come out of the decompressor, they can be written to stdout:
fwrite(buffer, 1, length, stdout);
This works great. I get all the data when it is sent to stdout.
Instead of writing to stdout, I would like to process the output from this statement internally in one-line-chunks: a string that is terminated with a newline character \n.
Do I write the output of the decompressor stream to another buffer, one character at a time, until I hit a newline, and then call the per-line processing function? Is this slow and is there a smarter approach? Thanks for your advice.
EDIT
Thanks for your suggestions. I ended up creating a pair of buffers that store the remainder (the "stub" at the end of an output buffer) at the beginning of a short line buffer, each time I pass through the output buffer's worth of data.
I loop through the output buffer character by character and process a newline-line's worth of data at a time. The newline-less remainder gets allocated and assigned, and copied to the next stream's line buffer. It seems like realloc is less expensive than repeated malloc-free statements.
Here's the code I came up with:
char bzBuf[BZBUFMAXLEN];
BZFILE *bzFp;
int bzError, bzNBuf;
char bzLineBuf[BZLINEBUFMAXLEN];
char *bzBufRemainder = NULL;
int bzBufPosition, bzLineBufPosition;
bzFp = BZ2_bzReadOpen(&bzError, *fp, 0, 0, NULL, 0); /* http://www.bzip.org/1.0.5/bzip2-manual-1.0.5.html#bzcompress-init */
if (bzError != BZ_OK) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be retrieved\n\n");
return -1;
}
bzError = BZ_OK;
bzLineBufPosition = 0;
while (bzError == BZ_OK) {
bzNBuf = BZ2_bzRead(&bzError, bzFp, bzBuf, sizeof(bzBuf));
if (bzError == BZ_OK || bzError == BZ_STREAM_END) {
if (bzBufRemainder != NULL) {
/* fprintf(stderr, "copying bzBufRemainder to bzLineBuf...\n"); */
strncpy(bzLineBuf, bzBufRemainder, strlen(bzBufRemainder)); /* leave out \0 */
bzLineBufPosition = strlen(bzBufRemainder);
}
for (bzBufPosition = 0; bzBufPosition < bzNBuf; bzBufPosition++) {
bzLineBuf[bzLineBufPosition++] = bzBuf[bzBufPosition];
if (bzBuf[bzBufPosition] == '\n') {
bzLineBuf[bzLineBufPosition] = '\0'; /* terminate bzLineBuf */
/* process the line buffer, e.g. print it out or transform it, etc. */
fprintf(stdout, "%s", bzLineBuf);
bzLineBufPosition = 0; /* reset line buffer position */
}
else if (bzBufPosition == (bzNBuf - 1)) {
bzLineBuf[bzLineBufPosition] = '\0';
if (bzBufRemainder != NULL)
bzBufRemainder = (char *)realloc(bzBufRemainder, bzLineBufPosition);
else
bzBufRemainder = (char *)malloc(bzLineBufPosition);
strncpy(bzBufRemainder, bzLineBuf, bzLineBufPosition);
}
}
}
}
if (bzError != BZ_STREAM_END) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be uncompressed\n\n");
return -1;
} else {
BZ2_bzReadGetUnused(&bzError, bzFp, 0, 0);
BZ2_bzReadClose(&bzError, bzFp);
}
free(bzBufRemainder);
bzBufRemainder = NULL;
I really appreciate everyone's help. This is working nicely.
I don't think there's a smarter approach (except finding an automata library that already does this for you). Be careful with allocating proper size for the "last line" buffer: if it cannot handle arbitrary length and the input comes from something accessible to third parties, it becomes a security risk.
I've also been working with processing bzip2 data per line, and I found that reading one byte at a time was too slow. This worked better for me:
#include <stdio.h>
#include <stdlib.h>
#include <bzlib.h>
/* gcc -o bz bz.c -lbz2 */
#define CHUNK 128
struct bzdata {
FILE *fp;
BZFILE *bzf;
int bzeof, bzlen, bzpos;
char bzbuf[4096];
};
static int bz2_open(struct bzdata *bz, char *file);
static void bz2_close(struct bzdata *bz);
static int bz2_read_line(struct bzdata *bz, char **line, int *li);
static int bz2_buf(struct bzdata *bz, char **line, int *li, int *ll);
static int
bz2_buf(struct bzdata *bz, char **line, int *li, int *ll)
{
int done = 0;
for (; bz->bzpos < bz->bzlen && done == 0; bz->bzpos++) {
if (*ll + 1 >= *li) {
*li += CHUNK;
*line = realloc(*line, (*li + 1) * sizeof(*(*line)));
}
if ( ((*line)[(*ll)++] = bz->bzbuf[bz->bzpos]) == '\n') {
done = 1;
}
}
if (bz->bzpos == bz->bzlen) {
bz->bzpos = bz->bzlen = 0;
}
(*line)[*ll] = '\0';
return done;
}
static int
bz2_read_line(struct bzdata *bz, char **line, int *li)
{
int bzerr = BZ_OK, done = 0, ll = 0;
if (bz->bzpos) {
done = bz2_buf(bz, line, li, &ll);
}
while (done == 0 && bz->bzeof == 0) {
bz->bzlen = BZ2_bzRead(&bzerr, bz->bzf, bz->bzbuf, sizeof(bz->bzbuf));
if (bzerr == BZ_OK || bzerr == BZ_STREAM_END) {
bz->bzpos = 0;
if (bzerr == BZ_STREAM_END) {
bz->bzeof = 1;
}
done = bz2_buf(bz, line, li, &ll);
} else {
done = -1;
}
}
/* Handle last lines that don't have a line feed */
if (done == 0 && ll > 0 && bz->bzeof) {
done = 1;
}
return done;
}
static int
bz2_open(struct bzdata *bz, char *file)
{
int bzerr = BZ_OK;
if ( (bz->fp = fopen(file, "rb")) &&
(bz->bzf = BZ2_bzReadOpen(&bzerr, bz->fp, 0, 0, NULL, 0)) &&
bzerr == BZ_OK) {
return 1;
}
return 0;
}
static void
bz2_close(struct bzdata *bz)
{
int bzerr;
if (bz->bzf) {
BZ2_bzReadClose(&bzerr, bz->bzf);
bz->bzf = NULL;
}
if (bz->fp) {
fclose(bz->fp);
bz->fp = NULL;
}
bz->bzpos = bz->bzlen = bz->bzeof = 0;
}
int main(int argc, char *argv[]) {
struct bzdata *bz = NULL;
int i, lc, li = 0;
char *line = NULL;
if (argc < 2) {
return fprintf(stderr, "usage: %s file [file ...]\n", argv[0]);
}
if ( (bz = calloc(1, sizeof(*bz))) ) {
for (i = 1; i < argc; i++) {
if (bz2_open(bz, argv[i])) {
for (lc = 0; bz2_read_line(bz, &line, &li) > 0; lc++) {
/* Process line here */
}
printf("%s: lines=%d\n", argv[i], lc);
}
bz2_close(bz);
}
free(bz);
}
if (line) {
free(line);
}
return 0;
}
This would be easy to do using C++'s std::string, but in C it takes some code if you want to do it efficiently (unless you use a dynamic string library).
char *bz_read_line(BZFILE *input)
{
size_t offset = 0;
size_t len = CHUNK; // arbitrary
char *output = (char *)xmalloc(len);
int bzerror;
while (BZ2_bzRead(&bzerror, input, output + offset, 1) == 1) {
if (offset+1 == len) {
len += CHUNK;
output = xrealloc(output, len);
}
if (output[offset] == '\n')
break;
offset++;
}
if (output[offset] == '\n')
output[offset] = '\0'; // strip trailing newline
else if (bzerror != BZ_STREAM_END) {
free(output);
return NULL;
}
return output;
}
(Where xmalloc and xrealloc handle errors internally. Don't forget to free the returned string.)
This is almost an order of magnitude slower than bzcat:
lars#zygmunt:/tmp$ wc foo
1193 5841 42868 foo
lars#zygmunt:/tmp$ bzip2 foo
lars#zygmunt:/tmp$ time bzcat foo.bz2 > /dev/null
real 0m0.010s
user 0m0.008s
sys 0m0.000s
lars#zygmunt:/tmp$ time ./a.out < foo.bz2 > /dev/null
real 0m0.093s
user 0m0.044s
sys 0m0.020s
Decide for yourself whether that's acceptable.
I think you should copy chunks of characters to another buffer until the latest chunk you write contains a new line character. Then you can work on the whole line.
You can save the rest of the buffer (after the '\n') into a temporary and then create a new line from it.