Split string into two variables in C - c

I have been givin an assignment which uses C to read a given file and input data into a binary tree. My current problem is splitting the line read from the file into two different variables.
The file that has been given contains two bits of data, an ID and some information. 2409, blah, blah, blah
Currently, the program is reading the file correctly and storing each line and then displaying it. I have tried to use token's, memmove and trying to simply select the characters manually however this needs to be dynamic. The ID is not a fixed amount of numbers so manually selecting it will not work. As mentioned, I have tried to use strtok using ", " as a delimited however it just doesn't change anything.
This is currently what I am using to display the information, I intent to split the string within the while loop for each line:
int main() {
struct node* root = NULL;
FILE *file;
char filename[15];
char buff[255];
char line[128];
strcpy(filename, "file.txt");
file = fopen(filename, "r");
if (file == NULL) {
printf("File could not be openned.\n");
exit(0);
}
while (line != NULL)
{
strcpy(line, fgets(buff, 255, file));
printf("%s", line);
}
fclose(file);
}
Is there any way that I am able to simply select the first characters up to the first occurance of "," and convert them into an integer. Then select the rest of the data removing the first "ID, " and insert that into a char variable.
Your help is greatly appreciated.

Like #LPs suggested, and assuming each line is like "2019, blah, blah, blah", you can get the ID for each line by calling:
int id = atoi(strtok(line, ","));

If one wants to parse files like,
2409, blah, blah, blah
0x10,foo, bar, baz, qux
# This is more difficult.
010 , a\
a, b b\#\\\,still b,c
one is probably better off just using a parser generator like lex and yacc or my favourite, re2c.
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <limits.h>
#include <assert.h>
/* Tokens. */
#define PARAM(A) A
#define STRINGISE(A) #A
#define TOKENS(X) X(ERROR), X(END), X(COMMA), X(NEWLINE), \
X(ESCAPE), X(WSP), X(NUMBER), X(WORD)
enum Token { TOKENS(PARAM) };
static const char *const tokens[] = { TOKENS(STRINGISE) };
struct Lexer { size_t line; char *marker, *from, *cursor; };
static enum Token lex(struct Lexer *lexer) {
assert(lexer);
/*!re2c
re2c:yyfill:enable = 0;
re2c:define:YYCTYPE = char;
re2c:define:YYCURSOR = lexer->cursor;
re2c:define:YYMARKER = lexer->marker; // Rules overlap.
newline = "\n" | ("\r" "\n"?);
oct = "0" [0-7]*;
dec = [1-9][0-9]*;
hex = '0x' [0-9a-fA-F]+;
num = oct | dec | hex;
word = [^\x00\\\n\r \t\v\f,0-9]+;
comment = "#" [^\x00\n\r]* newline;
*/
scan:
lexer->from = lexer->cursor;
/*!re2c
* { return ERROR; }
"\x00" { return END; }
[ \t\v\f]+ { return WSP; }
newline { lexer->line++; return NEWLINE; }
"\\\n" | comment { lexer->line++; goto scan; }
"\\\\" | "\\," | "\\ " | "\\n" | "\\#" { return ESCAPE; }
"," { return COMMA; }
word { return WORD; }
num { return NUMBER; }
*/
}
struct Buffer {
char *data;
size_t size, capacity;
};
static char *buffer_reserve(struct Buffer *const buf, const size_t reserve) {
const size_t min = buf->size + reserve;
size_t c = buf->capacity;
char *data;
assert(buf);
if(reserve > (size_t)-1 - buf->size || min > ((size_t)-1 >> 1) + 1)
{ errno = ERANGE; return 0; }
if(min > c) {
if(!c) c = 1;
while(min <= c) c <<= 1;
if(!(data = realloc(buf->data, c))) return 0;
buf->data = data;
buf->capacity = c;
}
return buf->data + buf->size;
}
struct Word { char *start, *end; };
struct Parser {
int id, id_set, first_comma;
size_t num_words;
struct Word words[64]; /* Lazy. */
char *start_words, *end_words;
};
static size_t parser_max_words = sizeof ((struct Parser *)0)->words
/ sizeof *((struct Parser *)0)->words;
static void clear_parser(struct Parser *const parser) {
assert(parser);
parser->id_set = 0;
parser->first_comma = 1;
parser->num_words = 0;
parser->start_words = parser->end_words = 0;
}
static void print_parser(const struct Parser *const parser) {
const struct Word *word = parser->words,
*word_end = parser->words + parser->num_words;
assert(parser && parser->id_set && parser->num_words <= parser_max_words);
printf("#%d: ", parser->id);
for( ; word < word_end; word++) {
if(word != parser->words) printf(", ");
if(!word->start) { printf("<null>"); continue; }
assert(word->start <= word->end);
if(word->start == word->end) { printf("<empty>"); continue; }
printf("<%.*s>", (int)(word->end - word->start), word->start);
}
fputc('\n', stdout);
}
static void expand_word(struct Parser *const parser,
const struct Lexer *const lexer) {
assert(parser && lexer && lexer->from < lexer->cursor);
if(!parser->start_words) {
assert(!parser->end_words);
parser->start_words = lexer->from;
}
parser->end_words = (lexer->from + INT_MAX >= lexer->cursor) ?
lexer->cursor : lexer->from + INT_MAX;
}
static int store_word(struct Parser *const parser) {
struct Word *word;
assert(parser);
if(parser->num_words >= parser_max_words) return errno = EILSEQ, 0;
word = parser->words + parser->num_words++;
word->start = parser->start_words;
word->end = parser->end_words;
parser->start_words = parser->end_words = 0;
return 1;
}
int main(int argc, char **argv) {
const size_t granularity = 1024;
struct Lexer lexer = { 1, 0, 0, 0 };
struct Parser parser;
size_t nread;
struct Buffer buf = { 0, 0, 0 };
char *b;
FILE *fp = 0;
int success = 0, end_of_buffer = 0;
/* Open. */
if(argc != 2) return fprintf(stderr, "Needs filename.\n"), EXIT_FAILURE;
if(!(fp = fopen(argv[1], "r"))) goto catch;
/* Read. */
do {
if(!(b = buffer_reserve(&buf, granularity))) goto catch;
nread = fread(b, 1, granularity, fp);
buf.size += nread;
} while(nread == granularity);
if(ferror(fp)) goto catch;
fclose(fp), fp = 0;
if(!(b = buffer_reserve(&buf, 1))) goto catch;
*b = '\0'; /* Make sure it's a string. */
/* Parse. */
lexer.cursor = buf.data;
clear_parser(&parser);
do {
enum Token tok;
switch((tok = lex(&lexer))) {
case ERROR: goto catch;
case END: end_of_buffer = 1; break;
case COMMA:
if(!parser.id_set) { errno = EILSEQ; goto catch; }
if(parser.first_comma) { parser.first_comma = 0; break; }
if(!store_word(&parser)) goto catch;
break;
case NEWLINE:
if(parser.id_set) {
/* We require at least key, data. */
if(!store_word(&parser)) goto catch;
print_parser(&parser);
clear_parser(&parser);
} else if(parser.start_words) {
errno = EILSEQ; goto catch;
}
break;
case ESCAPE:
if(!parser.id_set) { errno = EILSEQ; goto catch; }
expand_word(&parser, &lexer);
break;
case WSP: break;
case NUMBER:
if(parser.id_set) {
expand_word(&parser, &lexer);
} else {
char *end;
long i = strtol(lexer.from, &end, 0);
if(end != lexer.cursor || i < INT_MIN || i > INT_MAX)
{ errno = EDOM; goto catch; }
parser.id = (int)i;
parser.id_set = 1;
}
break;
case WORD:
expand_word(&parser, &lexer);
break;
}
} while(!end_of_buffer);
success = EXIT_SUCCESS;
goto finally;
catch:
fprintf(stderr, "While on line %lu.\n", (unsigned long)lexer.line);
perror("parsing");
assert(!lexer.from || (lexer.from < lexer.cursor
&& lexer.from + INT_MAX >= lexer.cursor));
if(lexer.from) fprintf(stderr, "While on %.*s.\n",
(int)(lexer.cursor - lexer.from), lexer.from);
finally:
free(buf.data);
if(fp) fclose(fp);
return success;
}
Prints,
#2409: <blah>, <blah>, <blah>
#16: <foo>, <bar>, <baz>, <qux>
#8: <a\
a>, <b b\#\\\,still b>, <c>
but that's probably overkill.

As #HAL9000 mentioned, I was able to complete this by using sscanf. Simply extracting the integer and string from the line using sscanf(line, "%d %[^\n]s", &ID, details);
I did try using strtok however, couldn't get my head around it as it wasn't working. sscanf was the easiest to do so this is what I am going to use, thanks.

Using sscanf
e.g
int main(int argc, char *argv[]) {
const char *str = "123, this, is, a test ;##";
char buff[128] = {0};
int num = 0;
if (2 == sscanf(str, "%d,%[^\r\n]s", &num, buff))
printf("== num: %d, string: '%s'\n", num, buff);
else
printf("== Wrong!\n");
return 0;
}
result: == num: 123, string: ' this, is, a test ;##'

Related

Counting the mose frequent char in a file

For my CS class I need to write a program that reads an entire file. I've researched a whole bunch of different ways to do this with a string (the two for loops inside the while loops) and I've combined it with the way I was taught to read through a whole file. The problem is you can't index the frequency list with a char variable type (line). Is there an easier way to read through the file and do this?
# define MAX 200
void replace_most_freq(const char *filename, char c, FILE *destination) {
// your code here
FILE *in_file = NULL;
in_file = fopen(filename, "r");
if (!in_file) {
fprintf(destination,
"Error(replace_most_freq): Could not open file %s\n", filename);
fclose(in_file);
return;
}
int i, max = -1, len;
int freq[256] = { 0 };
char line[MAX], result;
while (fgets(line, sizeof(line), in_file)) {
len = strlen(line);
for (i = 0; i < len; i++) {
freq[line[i]]++;
}
}
while (fgets(line, sizeof(line), in_file)) {
len = strlen(line);
for (i = 0; i < len; i++) {
if (max < freq[line[i]]) {
max = freq[line[i]];
result = line[i];
}
}
}
printf("Most frequent char = %c\n", result);
return;
}
Your initial loop is almost correct: you should convert the char to an unsigned char to avoid undefined behavior on negative char values on platforms where char is signed.
The second loop is incorrect: there is no need to read from the file, just iterate over the freq array to find the largest count.
Here is a modified version:
#include <limits.h>
#include <stdio.h>
void replace_most_freq(const char *filename, char newc, FILE *destination) {
FILE *in_file = fopen(filename, "r");
if (!in_file) {
fprintf(stderr,
"Error(replace_most_freq): Could not open file %s\n", filename);
return;
}
int c, max, maxc;
int freq[UCHAR_MAX] = { 0 };
while ((c = getc(in_file)) != EOF) {
freq[c]++;
}
max = freq[maxc = 0];
for (c = 1; c < UCHAR_MAX; c++) {
if (max < freq[c])
max = freq[maxc = c];
}
printf("Most frequent char = %c (%d)\n", max, max);
rewind(in_file);
while ((c = getc(in_file)) != EOF) {
if (c == maxc)
c = newc;
putc(c, destination);
}
}
You can read file in much larger chunks:
#define BUFFSIZE (4*1024*1024)
int findMax(const size_t *, size_t);
int replace_most_freq(const char *filename, char c, FILE *destination) {
int result = 1;
FILE *fi ;
size_t freq[256] = { 0 };
size_t dataChunkLength;
long fileLength;
unsigned char *databuff = malloc(BUFFSIZE);
if(!databuff)
{
result = -2;
goto function_exit;
}
fi = fopen(filename, "r");
if (!fi)
{
result = -1;
goto function_exit;
}
if (fseek(fi, 0, SEEK_END) == -1)
{
result = -3;
goto function_exit;
}
fileLength = ftell(fi);
if (fileLength == -1)
{
result = -4;
goto function_exit;
}
if (fseek(fi, 0, SEEK_SET) == -1)
{
result = -3;
goto function_exit;
}
while(fileLength)
{
if(fileLength <= BUFFSIZE) dataChunkLength = fileLength;
else dataChunkLength = BUFFSIZE;
size_t bytesRead = fread(databuff, 1, dataChunkLength, fi);
if(bytesRead != dataChunkLength)
{
if(feof(fi) || ferror(fi))
{
result = -4;
goto function_exit;
}
}
for(size_t index = 0; index < bytesRead; index++)
{
freq[databuff[index]]++;
}
fileLength -= bytesRead;
}
int mostFrequent;
printf("The most freq char is 0x%02x\n", mostFrequent = findMax(freq, 256));
function_exit:
free(databuff);
if (fi) fclose(fi);
return result;
}

How can I integrate a prefix checker to find complete words based on file read

ISSUE: I'm able to search through the file and print words based on numbers entered by the user (text-message number conversion), buy I should also be able to find full words based on only a portion of the numbers entered, so ... 72 would return pa, ra, sa, and sc... which would find words in my file such as party, radio, sandwich, and scanner.
I've tried utilizing prefix functions, but I'm unable to integrate them properly. The startsWith function is an example.
The words are based on a numbered keypad such as: Lettered Phone Keypad
CODE:
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
const char numbered_letters[10][5] = {"", "", "abc", "def", "ghi", "jkl",
"mno", "pqrs", "tuv", "wxyz"};
bool startsWith(const char *pre, const char *str) {
size_t lenpre = strlen(pre),
lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
}
void isEqual(char input[]) {
int i = 0;
//input[strlen(input)] = 0;
startsWith(input, input);
printf("check: %s\n", input);
//creating file
FILE *fp = fopen("words_alpha.txt", "r");
char str[32]; /* Handles largest possible string in the list which is: dichlorodiphenyltrichloroethane*/
//fseek(fp, 0, SEEK_SET);
if (fp == NULL) {
printf("Error! No file!\n");
} else {
//printf("Enter a number to be converted into a word-list");
//scanf("%s", str);
while (!feof(fp)) {
fscanf(fp, "%s", str);
i = strncmp(input, str, 32);
if (i == 0) {
printf("HIT: %s \n", input);
break;
} else {
printf("");
}
//if (strncmp(str, "hello", 32 ) == 0) { /*if strncmp finds the word and returns true, print */
// printf("%s\n", str);
}
//printf("%s\n", str);
//compareNums(num);
}
fclose(fp);
}
void printWordsUtil(int number[], int curr_digit, char output[], int n) {
// Base case, if current output word is prepared
int i;
if (curr_digit == n) {
//printf("%s ", output);
isEqual(output);
return;
}
// Try all possible characters for current digit in number[]
// and recur for remaining digits
for (i = 0; i < strlen(numbered_letters[number[curr_digit]]); i++) {
output[curr_digit] = numbered_letters[number[curr_digit]][i];
printWordsUtil(number, curr_digit + 1, output, n);/* recursive call */
if (number[curr_digit] == 0 || number[curr_digit] == 1)
return;
}
}
// A wrapper over printWordsUtil(). It creates an output array and
// calls printWordsUtil()
void printWords(int number[], int n) {
char result[n + 1];
result[n] = '\0';
printWordsUtil(number, 0, result, n);
}
//Driver program
int main(void) {
int number[] = {4, 3, 9};
int n = sizeof(number) / sizeof(number[0]);
printWords(number, n);
return 0;
}
FILE USED: Words Alpha (A file with over 420k words)
Thank you for any guidance you can provide!
There were a few bugs.
Your prefix function was okay. The strncmp you replaced it with was not.
You had a break on a hit that stopped on the first word match, so subsequent ones were not shown.
On a hit you were printing the prefix string instead of the word string.
I've annotated your code with the bugs and fixed them [please pardon the gratuitous style cleanup]:
#include <stdio.h>
#include <string.h>
#include <stdbool.h>
const char numbered_letters[10][5] = { "", "", "abc", "def", "ghi", "jkl",
"mno", "pqrs", "tuv", "wxyz"
};
bool
startsWith(const char *pre, const char *str)
{
size_t lenpre = strlen(pre),
lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
}
void
isEqual(char input[])
{
int i = 0;
int ilen = strlen(input);
// input[strlen(input)] = 0;
// NOTE/BUG: this is extraneous
#if 0
startsWith(input, input);
#endif
printf("check: '%s'\n", input);
// creating file
FILE *fp = fopen("words_alpha.txt", "r");
// NOTE/BUG: this needs to be one more to contain the nul terminater
#if 0
/* Handles largest possible string in the list which is:
dichlorodiphenyltrichloroethane */
char str[32];
#else
char str[33];
#endif
// fseek(fp, 0, SEEK_SET);
if (fp == NULL) {
printf("Error! No file!\n");
}
else {
// printf("Enter a number to be converted into a word-list");
// scanf("%s", str);
// NOTE/BUG: although feof works here, it is considered _bad_ practice
#if 0
while (!feof(fp)) {
fscanf(fp, "%s", str);
#else
while (1) {
if (fscanf(fp, "%s", str) != 1)
break;
#endif
// NOTE/BUG: this is broken
#if 0
i = strncmp(input, str, 32) == 0;
#endif
// NOTE: this works and is simpler than startsWith (which also works)
#if 0
i = strncmp(input, str, ilen) == 0;
#endif
#if 1
i = startsWith(input, str);
#endif
if (i) {
// NOTE/BUG: we want the actual word to be printed and not just the prefix
#if 0
printf("HIT: %s\n", input);
#else
printf("HIT: %s\n", str);
#endif
// NOTE/BUG: this break stops on the _first_ word match in the list but we
// want all of them
#if 0
break;
#endif
}
else {
//printf("");
}
// if (strncmp(str, "hello", 32 ) == 0) { /*if strncmp finds the word and returns true, print */
// printf("%s\n", str);
}
// printf("%s\n", str);
// compareNums(num);
}
fclose(fp);
}
void
printWordsUtil(int numbers[], int curr_digit, char output[], int n)
{
// Base case, if current output word is prepared
int i;
if (curr_digit == n) {
// printf("%s ", output);
isEqual(output);
return;
}
// NOTE: did some cleanup to understand what was going on -- [probably] not a
// bug
int numcur = numbers[curr_digit];
const char *letters = numbered_letters[numcur];
int letlen = strlen(letters);
// Try all possible characters for current digit in number[]
// and recur for remaining digits
for (i = 0; i < letlen; ++i) {
output[curr_digit] = letters[i];
printWordsUtil(numbers, curr_digit + 1, output, n); /* recursive call */
if ((numcur == 0) || (numcur == 1))
break;
}
}
// A wrapper over printWordsUtil(). It creates an output array and
// calls printWordsUtil()
void
printWords(int number[], int n)
{
char result[n + 1];
// NOTE/BUG: this will have garbage in elements 0 to (n - 1)
// NOTE/BUG: result is not used otherwise
result[n] = '\0';
printWordsUtil(number, 0, result, n);
}
//Driver program
int
main(void)
{
int number[] = { 4, 3, 9 };
int n = sizeof(number) / sizeof(number[0]);
printWords(number, n);
return 0;
}

Find longest comment line in a file in C

So I have this function to find the longest line in a file:
int LongestLine(FILE *filename) {
char buf[MAX_LINE_LENGTH] = {0};
char line_val[MAX_LINE_LENGTH] = {0};
int line_len = -1;
int line_num = -1;
int cur_line = 1;
filename = fopen(filename, "r");
while(fgets(buf, MAX_LINE_LENGTH, filename) != NULL) {
int len_tmp = strlen(buf) - 1;
if(buf[len_tmp] == '\n')
buf[len_tmp] = '\0';
if(line_len < len_tmp) {
strncpy(line_val, buf, len_tmp + 1);
line_len = len_tmp;
line_num = cur_line;
}
cur_line++;
}
return line_num;
}
and I was thinking of combining it with this one:
bool startsWith(const char *pre, const char *str)
{
size_t lenpre = strlen(pre),
lenstr = strlen(str);
return lenstr < lenpre ? false : strncmp(pre, str, lenpre) == 0;
}
But.. however, the LongestLine() function returns an integer. So how can I use both functions so that I may find the longest line starting with let's say //?
Add a call to startsWith (to see if it is a comment) in your if statement to decide if a line is the new longest:
if( startsWith("//",buf) && (line_len < len_tmp) ) {

C corruption or double free, why?

I have a C code (first C code I have ever written), and there is an error in it, but I dont know, where. When I try to free a variable (dinamically allocated, its name is out_html) I get double free or corruption. I have no idea why my program does this, I checked all my calls for free etc.
The code:
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <math.h>
#include "fcntl.h"
#include "errno.h"
#include "sys/types.h"
#include "sys/stat.h"
#include "unistd.h"
#define MAX_SIZE 512
typedef struct node
{
char* data;
struct node * nextnode;
} node;
int max(int a, int b)
{
if(a>b) return a;
else return b;
}
node* push(node * stackTop, char* data)
{
//static
node* newItem;
newItem = calloc(sizeof(node),1);
newItem->data = data;
newItem->nextnode = stackTop;
return newItem;
}
node* pop(node* stackTop)
{
if(stackTop != NULL)
{
free(stackTop->data);
node* P = stackTop;
return stackTop->nextnode;
free(P);
}else return NULL;
}
int isMajorTag(char* tag)
{
if(strcmp(tag, "<html>") == 0 || strcmp(tag, "</html>") == 0 ||
strcmp(tag, "<body>") == 0 || strcmp(tag, "</body>") == 0 ||
strcmp(tag, "<head>") == 0 || strcmp(tag, "</head>") == 0 ) { return 1; }
else { return 0; };
}
int isHTMLtag(char* tag, char* tags)
{
char* tag2;
if(strstr(tag," ") != NULL)
{
char* strptr = strstr(tag, " ");
int End = strptr - tag;
char* tag_ = strndup(tag, End);
tag2 = calloc((strlen(tag_) + strlen("*") + 2), sizeof(char));
strcpy(tag2, tag_);
strcat(tag2,"*");
free(tag_);
}
else tag2 = tag;
int ret;
if(strstr(tags, tag2) != NULL){ ret = 1; }
else { ret = 0; };
if(tag2 != tag ) free(tag2);
return ret;
}
int isCloserTagOf(char* cltag, char* tag)
{
int ret = 1;
if( cltag[1] != '/' ) ret = 0;
if( tag[1] == '/' ) ret = 0;
char* ntag;
char* ncltag;
if(strstr(tag," ") != NULL)
{
char* strptr = strstr(tag, " ");
int End = strptr - tag;
ntag = strndup(tag, End) + 1;
// ntag = calloc(strlen(ntag0) + 1 + 1, sizeof(char)); strcpy(ntag, ntag0); strcat(ntag, ">");
ncltag = strndup(cltag+2,strlen(cltag) - 3);
} else
{
ntag = tag + 1;
ncltag = cltag + 2;
}
// printf("|%s|%s| %i", ntag, ncltag, strcmp(ncltag, ntag));
if(strcmp(ncltag, ntag) != 0) ret = 0;
return ret;
}
int isIndividualTag(char* tag)
{
if(strcmp(tag,"</br>") == 0) return 1;
else if(strncmp(tag,"<!--#include file=",18) == 0) return 2;
else if(strncmp(tag,"<!--#echo var=",14) == 0) return 3;
else if(strncmp(tag,"<!--",4) == 0) return 4;
else return 0;
}
int main(int argc,char *argv[])
{
char* fname;
if(argc == 2)
{
fname = argv[1];
} else
{
printf("Give me a filename!");
fname = calloc( MAX_SIZE, sizeof(char));
scanf("%s", fname);
};
printf("Parameter: %s \n\n", fname);
// beolvasas
int f = open(fname, O_RDONLY);
long pos = lseek(f, 0, SEEK_END);
lseek(f, 0, SEEK_SET);
char *buff = calloc(pos,1);
read(f, buff, pos);
close(f);
f = open("valid-tags", O_RDONLY);
pos = lseek(f, 0, SEEK_END);
lseek(f, 0, SEEK_SET);
char *valids = calloc(pos,1);
read(f, valids, pos);
close(f);
// printf("File: %s %s %i ",buff, valids, isCloserTagOf("</html>","<html>")); printf("Igen? %i", isHTMLtag("</head>",valids));
node* Stack = NULL;
char *P = buff;
int is_valid = 1;
int bodyCnt = 0;
char* body[6];
int correct_body = 1;
char* out_html = calloc(strlen(buff), sizeof(char));
while(P[0] != '\0' )
{
if(P[0] == '<')
{
char* strptr = strstr(P, ">");
if(strptr != NULL)
{
int nextCloser = strptr - P + 1;
char* tag = strndup(P, nextCloser);
int IsIndividual = isIndividualTag(tag);
if(isHTMLtag(tag, valids) || IsIndividual)
{
if(IsIndividual)
{
if(IsIndividual == 2) // file inclusion
{
char* firstQ = strstr(tag, "\"");
char* secondQ;
if( firstQ ) secondQ = strstr(firstQ + 1, "\"");
if( firstQ && secondQ )
{
char* incl_filename = strndup((firstQ + 1), (secondQ - firstQ - 1));
f = open(incl_filename, O_RDONLY);
pos = lseek(f, 0, SEEK_END);
lseek(f, 0, SEEK_SET);
char *inclstr = calloc(pos,1);
read(f, inclstr, pos);
close(f);
char* new_out_html = calloc((max(strlen(buff),strlen(out_html)) + pos + 1 + 1 + 1), sizeof(char));
strcpy(new_out_html, out_html);
strcat(new_out_html, inclstr);
free(out_html); out_html = NULL; // free(inclstr);
out_html = new_out_html;
} else
{
printf("Invalid file inclusion! \n");
is_valid = 0; break;
};
} else if (IsIndividual == 3) // date time
{
time_t t = time(NULL);
// int nDigits = floor(log10(abs(t)) + 1; (strlen(out_html) + nDigits
char* timestring = ctime(&t);
char* new_out_html = calloc(1 + max(strlen(buff),strlen(out_html)) + strlen(timestring), sizeof(char));
strcpy(new_out_html, out_html);
strcat(new_out_html, timestring);
//printf("%s",new_out_html);
free(out_html); out_html = NULL; // free(timestring);
out_html = new_out_html;
} else
{
strcat(out_html, tag);
};
}else
{
strcat(out_html, tag);
if(Stack != NULL && isCloserTagOf(tag,Stack->data))
{
Stack = pop(Stack);
}else
{
Stack = push(Stack, tag);
};
}
if(isMajorTag(tag))
{
if(bodyCnt < 6)
{ body[bodyCnt] = calloc(strlen(tag), sizeof(char));
strcpy(body[bodyCnt],tag);
++bodyCnt;
}else
{
printf("Too much major html tag found...");
correct_body = 0;
}
}
}else
{
printf("Invalid html tag: %s \n", tag);
is_valid = 0;
break;
}
P = P + nextCloser;
}
else
{
printf("Unclosed tag\n");
is_valid = 0;
break;
}
} else
{ //printf("-%c",P[0]);
strncat(out_html, P,1);
// printf("{(%s)}",out_html);
P = P + 1;
};
};
int i;
char* correctBody[] = { "<html>", "<head>", "</head>", "<body>", "</body>", "</html>"};
for(i = 0; i < bodyCnt && correct_body; ++i) {
correct_body = (strcmp(body[i],correctBody[i]) == 0); }
if(is_valid && Stack == NULL &&
correct_body && bodyCnt == 6){ printf("\nValid HTML Code\n");
printf("\n\n%s\n",out_html);
}
else printf("\nInvalid.\n");
// printf("%i %i %i",bodyCnt,correct_body,is_valid);
/*****************************************************************/
for(i=0;i<bodyCnt;++i) free(body[i]);
free(buff); free(valids); //
if(out_html != NULL) free(out_html);
return 0;
}
At the end of the code:
if(out_html != NULL) free(out_html);
Without this, there is no crash.
I think the crash is caused somewhere near line 196.
(there must be a valig-html file with proper html tags - without this, the code is useless, I mean a file like this: )
The error message can be a bit confusing.
When allocation say 200 bytes with calloc, the routine internally allocates a tad more:
say 8 , 16 or 32 bytes to make linked lists and other bookkeeping (i.e. has it been freed).
If the strings that are appended or copied with strcpy/strcat do not fit the target array, it internally leads to possible corruption of the bookkeeping.
So the error doesn't necessarily do anything with freeing a pointer twice.
It is hard to figure out what's going on in your code, but some potentically nonsensical operations are visible at the first sight. For example, consider this sequence
int f = open(fname, O_RDONLY);
long pos = lseek(f, 0, SEEK_END);
lseek(f, 0, SEEK_SET);
char *buff = calloc(pos,1);
read(f, buff, pos);
close(f);
...
char* out_html = calloc(strlen(buff), sizeof(char));
You read contents of some file into an allocated buffer buff (the size of the buffer is exactly the size of the file). And later you treat buff as a null-terminated string: you use it as an argument of strlen.
But you never bothered to null-teriminate your buff! How is this supposed to work? If it is not null-terminated, it is not a string and it cannot be meaningfully used as an argument of strlen.
You program contains several instances of code that follows the same pattern: the entire contents of some file is read into a buffer of the exact size and then interpreted as a null-terminated string, while in reality it is not null-terminated (nobody bothered to ensure null-termination).
Is the terminating zero supposed to be present in the file itself? If so, then how are we supposed to know that? We are not telepathes here.
This looks wrong:
(BTW: I could not find a definition / declaration of stackTop)
node* pop(node* stackTop)
{
if(stackTop != NULL)
{
free(stackTop->data);
node* P = stackTop;
return stackTop->nextnode;
free(P);
}else return NULL;
}
Here is another calloc() that's too short. (the strcpy() will put its nul byte at a place that does not belong to the calloc()ed object. BTW: sizeof(char) is 1, by definition.
if(bodyCnt < 6)
{ body[bodyCnt] = calloc(strlen(tag), sizeof(char));
strcpy(body[bodyCnt],tag);
++bodyCnt;
}else
{
printf("Too much major html tag found...");
correct_body = 0;
}

How do I handle a stream of data internal to a C-based app?

I am pulling data from a bzip2 stream within a C application. As chunks of data come out of the decompressor, they can be written to stdout:
fwrite(buffer, 1, length, stdout);
This works great. I get all the data when it is sent to stdout.
Instead of writing to stdout, I would like to process the output from this statement internally in one-line-chunks: a string that is terminated with a newline character \n.
Do I write the output of the decompressor stream to another buffer, one character at a time, until I hit a newline, and then call the per-line processing function? Is this slow and is there a smarter approach? Thanks for your advice.
EDIT
Thanks for your suggestions. I ended up creating a pair of buffers that store the remainder (the "stub" at the end of an output buffer) at the beginning of a short line buffer, each time I pass through the output buffer's worth of data.
I loop through the output buffer character by character and process a newline-line's worth of data at a time. The newline-less remainder gets allocated and assigned, and copied to the next stream's line buffer. It seems like realloc is less expensive than repeated malloc-free statements.
Here's the code I came up with:
char bzBuf[BZBUFMAXLEN];
BZFILE *bzFp;
int bzError, bzNBuf;
char bzLineBuf[BZLINEBUFMAXLEN];
char *bzBufRemainder = NULL;
int bzBufPosition, bzLineBufPosition;
bzFp = BZ2_bzReadOpen(&bzError, *fp, 0, 0, NULL, 0); /* http://www.bzip.org/1.0.5/bzip2-manual-1.0.5.html#bzcompress-init */
if (bzError != BZ_OK) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be retrieved\n\n");
return -1;
}
bzError = BZ_OK;
bzLineBufPosition = 0;
while (bzError == BZ_OK) {
bzNBuf = BZ2_bzRead(&bzError, bzFp, bzBuf, sizeof(bzBuf));
if (bzError == BZ_OK || bzError == BZ_STREAM_END) {
if (bzBufRemainder != NULL) {
/* fprintf(stderr, "copying bzBufRemainder to bzLineBuf...\n"); */
strncpy(bzLineBuf, bzBufRemainder, strlen(bzBufRemainder)); /* leave out \0 */
bzLineBufPosition = strlen(bzBufRemainder);
}
for (bzBufPosition = 0; bzBufPosition < bzNBuf; bzBufPosition++) {
bzLineBuf[bzLineBufPosition++] = bzBuf[bzBufPosition];
if (bzBuf[bzBufPosition] == '\n') {
bzLineBuf[bzLineBufPosition] = '\0'; /* terminate bzLineBuf */
/* process the line buffer, e.g. print it out or transform it, etc. */
fprintf(stdout, "%s", bzLineBuf);
bzLineBufPosition = 0; /* reset line buffer position */
}
else if (bzBufPosition == (bzNBuf - 1)) {
bzLineBuf[bzLineBufPosition] = '\0';
if (bzBufRemainder != NULL)
bzBufRemainder = (char *)realloc(bzBufRemainder, bzLineBufPosition);
else
bzBufRemainder = (char *)malloc(bzLineBufPosition);
strncpy(bzBufRemainder, bzLineBuf, bzLineBufPosition);
}
}
}
}
if (bzError != BZ_STREAM_END) {
BZ2_bzReadClose(&bzError, bzFp);
fprintf(stderr, "\n\t[gchr2] - Error: Bzip2 data could not be uncompressed\n\n");
return -1;
} else {
BZ2_bzReadGetUnused(&bzError, bzFp, 0, 0);
BZ2_bzReadClose(&bzError, bzFp);
}
free(bzBufRemainder);
bzBufRemainder = NULL;
I really appreciate everyone's help. This is working nicely.
I don't think there's a smarter approach (except finding an automata library that already does this for you). Be careful with allocating proper size for the "last line" buffer: if it cannot handle arbitrary length and the input comes from something accessible to third parties, it becomes a security risk.
I've also been working with processing bzip2 data per line, and I found that reading one byte at a time was too slow. This worked better for me:
#include <stdio.h>
#include <stdlib.h>
#include <bzlib.h>
/* gcc -o bz bz.c -lbz2 */
#define CHUNK 128
struct bzdata {
FILE *fp;
BZFILE *bzf;
int bzeof, bzlen, bzpos;
char bzbuf[4096];
};
static int bz2_open(struct bzdata *bz, char *file);
static void bz2_close(struct bzdata *bz);
static int bz2_read_line(struct bzdata *bz, char **line, int *li);
static int bz2_buf(struct bzdata *bz, char **line, int *li, int *ll);
static int
bz2_buf(struct bzdata *bz, char **line, int *li, int *ll)
{
int done = 0;
for (; bz->bzpos < bz->bzlen && done == 0; bz->bzpos++) {
if (*ll + 1 >= *li) {
*li += CHUNK;
*line = realloc(*line, (*li + 1) * sizeof(*(*line)));
}
if ( ((*line)[(*ll)++] = bz->bzbuf[bz->bzpos]) == '\n') {
done = 1;
}
}
if (bz->bzpos == bz->bzlen) {
bz->bzpos = bz->bzlen = 0;
}
(*line)[*ll] = '\0';
return done;
}
static int
bz2_read_line(struct bzdata *bz, char **line, int *li)
{
int bzerr = BZ_OK, done = 0, ll = 0;
if (bz->bzpos) {
done = bz2_buf(bz, line, li, &ll);
}
while (done == 0 && bz->bzeof == 0) {
bz->bzlen = BZ2_bzRead(&bzerr, bz->bzf, bz->bzbuf, sizeof(bz->bzbuf));
if (bzerr == BZ_OK || bzerr == BZ_STREAM_END) {
bz->bzpos = 0;
if (bzerr == BZ_STREAM_END) {
bz->bzeof = 1;
}
done = bz2_buf(bz, line, li, &ll);
} else {
done = -1;
}
}
/* Handle last lines that don't have a line feed */
if (done == 0 && ll > 0 && bz->bzeof) {
done = 1;
}
return done;
}
static int
bz2_open(struct bzdata *bz, char *file)
{
int bzerr = BZ_OK;
if ( (bz->fp = fopen(file, "rb")) &&
(bz->bzf = BZ2_bzReadOpen(&bzerr, bz->fp, 0, 0, NULL, 0)) &&
bzerr == BZ_OK) {
return 1;
}
return 0;
}
static void
bz2_close(struct bzdata *bz)
{
int bzerr;
if (bz->bzf) {
BZ2_bzReadClose(&bzerr, bz->bzf);
bz->bzf = NULL;
}
if (bz->fp) {
fclose(bz->fp);
bz->fp = NULL;
}
bz->bzpos = bz->bzlen = bz->bzeof = 0;
}
int main(int argc, char *argv[]) {
struct bzdata *bz = NULL;
int i, lc, li = 0;
char *line = NULL;
if (argc < 2) {
return fprintf(stderr, "usage: %s file [file ...]\n", argv[0]);
}
if ( (bz = calloc(1, sizeof(*bz))) ) {
for (i = 1; i < argc; i++) {
if (bz2_open(bz, argv[i])) {
for (lc = 0; bz2_read_line(bz, &line, &li) > 0; lc++) {
/* Process line here */
}
printf("%s: lines=%d\n", argv[i], lc);
}
bz2_close(bz);
}
free(bz);
}
if (line) {
free(line);
}
return 0;
}
This would be easy to do using C++'s std::string, but in C it takes some code if you want to do it efficiently (unless you use a dynamic string library).
char *bz_read_line(BZFILE *input)
{
size_t offset = 0;
size_t len = CHUNK; // arbitrary
char *output = (char *)xmalloc(len);
int bzerror;
while (BZ2_bzRead(&bzerror, input, output + offset, 1) == 1) {
if (offset+1 == len) {
len += CHUNK;
output = xrealloc(output, len);
}
if (output[offset] == '\n')
break;
offset++;
}
if (output[offset] == '\n')
output[offset] = '\0'; // strip trailing newline
else if (bzerror != BZ_STREAM_END) {
free(output);
return NULL;
}
return output;
}
(Where xmalloc and xrealloc handle errors internally. Don't forget to free the returned string.)
This is almost an order of magnitude slower than bzcat:
lars#zygmunt:/tmp$ wc foo
1193 5841 42868 foo
lars#zygmunt:/tmp$ bzip2 foo
lars#zygmunt:/tmp$ time bzcat foo.bz2 > /dev/null
real 0m0.010s
user 0m0.008s
sys 0m0.000s
lars#zygmunt:/tmp$ time ./a.out < foo.bz2 > /dev/null
real 0m0.093s
user 0m0.044s
sys 0m0.020s
Decide for yourself whether that's acceptable.
I think you should copy chunks of characters to another buffer until the latest chunk you write contains a new line character. Then you can work on the whole line.
You can save the rest of the buffer (after the '\n') into a temporary and then create a new line from it.

Resources