How can I detect one line comments like // in Flex and skip those lines?
Also, for /* comments, will the following snippet be enough?
"/*" { comment(); }
%%
/* Called from the scanner action for the opening comment marker: echoes the
 * comment body with putchar() until the closing star-slash pair is read.
 * A value of 0 from input() is treated as end of input throughout. */
comment()
{
char c, c1;
loop:
/* Echo everything up to the next '*' (or end of input). */
while ((c = input()) != '*' && c != 0)
putchar(c);
/* A '*' not followed by '/' does not close the comment: push the
 * look-ahead character back and keep scanning. Note the '*' itself is
 * not echoed. */
if ((c1 = input()) != '/' && c != 0)
{
unput(c1);
goto loop;
}
/* Closing marker found (not end of input): echo the final '/'. */
if (c != 0)
putchar(c1);
}
Why don't you just use regular expressions to recognize the comments? The whole point of lex/flex is to save you from having to write lexical scanners by hand. The code you present should work (if you put the pattern /* at the beginning of the line), but it's a bit ugly, and it is not obvious that it will work.
Your question says that you want to skip comments, but the code you provide uses putchar() to print the comment, except for the /* at the beginning. Which is it that you want to do? If you want to echo the comments, you can use an ECHO action instead of doing nothing.
Here are the regular expressions:
Single line comment
This one is easy because in lex/flex, . won't match a newline. So the following will match from // to the end of the line, and then do nothing.
"//".* { /* DO NOTHING */ }
Multiline comment
This is a bit trickier, and the fact that * is a regular expression character as well as a key part of the comment marker makes the following regex a bit hard to read. I use [*] as a pattern which recognizes the character *; in flex/lex, you can use "*" instead. Use whichever you find more readable. Essentially, the regular expression matches sequences of characters ending with a (string of) * until it finds one where the next character is a /. In other words, it has the same logic as your C code.
[/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] { /* DO NOTHING */ }
The above requires the terminating */; an unterminated comment will force the lexer to back up to the beginning of the comment and accept some other token, usually a / division operator. That's likely not what you want, but it's not easy to recover from an unterminated comment since there's no really good way to know where the comment should have ended. Consequently, I recommend adding an error rule:
[/][*][^*]*[*]+([^*/][^*]*[*]+)*[/] { /* DO NOTHING */ }
[/][*] { fatal_error("Unterminated comment"); }
For // you can read until you find the end of the line (\n) or EOF — the latter in case the comment is at the very end of the file. For example:
/*
 * Discard the remainder of a single-line comment.
 *
 * Meant to be called from the scanner action that matched the comment
 * introducer. Characters are read with input() up to and including the
 * terminating newline; a comment that runs into the end of the input simply
 * ends there.
 */
static void
skip_single_line_comment(void)
{
    int c;

    /* Flex releases differ in how input() reports end of input (EOF in
     * older releases, 0 in the 2.6 series), so either value ends the
     * comment. Side effect: a NUL byte inside a comment also ends it. */
    do {
        c = input();
    } while (c != '\n' && c != EOF && c != 0);

    /* Nothing is pushed back at end of input: unput() stores its argument
     * as a character, so unput(EOF) would plant a stray byte in the buffer
     * for the scanner to tokenize. The next read hits end of input again
     * on its own. */
}
As for multi-line comments /* */, you can read until you see a * and then peek at the next character: if it is a /, this is the end of the comment; if not, just skip it like any other character. You should not encounter EOF inside the comment — that would mean the comment was never closed:
/*
 * Discard the body of a block comment, up to and including its closing
 * star-slash pair.
 *
 * Meant to be called from the scanner action that matched the opening
 * marker. If the input ends before the comment is closed, an error is
 * printed to stderr and the program exits with a failure status.
 */
static void
skip_multiple_line_comment(void)
{
    /* Remembering the previous character handles runs of stars before the
     * final slash without pushing anything back into the scanner. */
    int previous = 0;

    for (;;) {
        int c = input();

        /* Flex releases differ in how input() reports end of input (EOF in
         * older releases, 0 in the 2.6 series); accept both. */
        if (c == EOF || c == 0) {
            fprintf(stderr, "Error unclosed comment, expect */\n");
            exit(EXIT_FAILURE);
        }
        if (previous == '*' && c == '/')
            return;
        previous = c;
    }
}
Complete file:
%{
#include <stdio.h>
static void skip_single_line_comment(void);
static void skip_multiple_line_comment(void);
%}
%option noyywrap
%%
"//"        { /* line comment: the helper consumes the rest of the line */
              puts("short comment was skipped ");
              skip_single_line_comment(); }
"/*"        { /* block comment: the helper consumes up to the closing marker */
              puts("long comment begins ");
              skip_multiple_line_comment();
              puts("long comment ends"); }
" "         { /* empty */ }
[\n\r\t]    { /* empty: whitespace only; '|' inside a character class is a
                 literal character, so it must not be listed here */ }
.           { fprintf(stderr, "Tokenizing error: '%c'\n", *yytext);
              yyterminate(); }
%%
/*
 * Discard the remainder of a single-line comment.
 *
 * Called from the scanner action for the comment introducer. Characters are
 * read with input() up to and including the terminating newline; a comment
 * that runs into the end of the input simply ends there.
 */
static void
skip_single_line_comment(void)
{
    int c;

    /* Flex releases differ in how input() reports end of input (EOF in
     * older releases, 0 in the 2.6 series), so either value ends the
     * comment. Side effect: a NUL byte inside a comment also ends it. */
    do {
        c = input();
    } while (c != '\n' && c != EOF && c != 0);

    /* Nothing is pushed back at end of input: unput() stores its argument
     * as a character, so unput(EOF) would plant a stray byte in the buffer
     * and trigger the catch-all "Tokenizing error" rule. */
}
/*
 * Discard the body of a block comment, up to and including its closing
 * star-slash pair.
 *
 * Called from the scanner action for the opening marker. If the input ends
 * before the comment is closed, an error is printed to stderr and the
 * program exits with a failure status.
 */
static void
skip_multiple_line_comment(void)
{
    /* Remembering the previous character handles runs of stars before the
     * final slash without pushing anything back into the scanner. */
    int previous = 0;

    for (;;) {
        int c = input();

        /* Flex releases differ in how input() reports end of input (EOF in
         * older releases, 0 in the 2.6 series); accept both. */
        if (c == EOF || c == 0) {
            fprintf(stderr, "Error unclosed comment, expect */\n");
            exit(EXIT_FAILURE);
        }
        if (previous == '*' && c == '/')
            return;
        previous = c;
    }
}
/* Entry point: run the scanner once over its input; arguments are ignored. */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    yylex();
    return 0;
}
To detect single line comments :
^"//" printf("This is a comment line\n");
This says any line which starts with // will be considered as comment line.
To detect multi line comments :
^"/*"[^*]*|[*]*"*/" printf("This is a Multiline Comment\n");
*
Explanation :
*
^"/*" This says beginning should be /*.
[^*]* includes all characters including \n but excludes *.
[*]* says 0 or more number of stars.
[^*]|[*]* - "or" operator is applied to get any string.
"*/" specifies */ as end.
This will work perfectly in lex.
Below is the complete code of lex file :
%{
#include <stdio.h>
int v=0;
%}
%%
^"//" printf("This is a comment line\n");
^"/*"[^*]*|[*]*"*/" printf("This is a Multiline Comment\n");
.|\n {}
%%
/* End-of-input hook called by the generated scanner; always returns 1. */
int yywrap(void)
{
    return 1;
}
/* Entry point: run the scanner once over its input and report success.
 * The return type is spelled out because implicit int is not valid C99. */
int main(void)
{
    yylex();
    return 0;
}
Related
I am supposed to be "fixing" code given to me to make it display the correct number of visible characters in a file (spaces too). The correct number is supposed to be 977. I have never dealt with files before and I don't understand what I need to do to display the correct number.
* Driver Menu System for Homework
* Andrew Potter - Mar 5, 2019 <-- Please put your name/date here
*/
#include <stdio.h>//header file for input/output -
#include <stdlib.h>
#include <ctype.h>
// since you will place all your assigned functions (programs) in this file, you do not need to include stdio.h again!
int menu(void); //prototype definition section
void hello(void);
void countall(void);
/* Menu loop: keep asking for a selection and dispatch it until the user
 * enters 99. */
int main(void)
{
    int selection;

    for (selection = menu(); selection != 99; selection = menu()) {
        if (selection == 1) {
            hello();
        } else if (selection == 2) {
            countall();
        } else if (selection == 3 || selection == 4) {
            /* menu slots 3 and 4 are placeholders: nothing to run yet */
        } else {
            printf("Please enter a valid selection.\n");
        }
    }
    return 0;
}
/*
 * Print the menu and read the user's numeric selection from stdin.
 *
 * Returns the number entered. If the line does not start with a number,
 * returns 0, which is not a menu entry, so the caller reports an invalid
 * selection. If stdin is exhausted, returns 99 (the exit selection) so the
 * caller's loop terminates instead of spinning on end of input.
 */
int menu(void) {
    int choice = 0;
    int fields;
    int c;

    printf("***************************\n");
    printf(" 1. Hello \n");
    printf(" 2. Countall\n");
    printf(" 3. \n");
    printf(" 4. \n");
    printf("99. Exit\n");
    printf("Please select number and press enter:\n");
    printf("***************************\n");

    fields = scanf("%d", &choice);
    if (fields == EOF)
        return 99;
    if (fields != 1)
        choice = 0;   /* scanf stored nothing; do not return garbage */

    /* Drop the rest of the line so leftover characters cannot be mistaken
     * for the next selection. */
    do {
        c = getchar();
    } while (c != '\n' && c != EOF);

    return choice;
}
/* Menu entry 1: print the greeting followed by a newline. */
void hello(void) {
    puts("Hello, World!!!");
}
//*****Andrew 5/1/19*****
#define SLEN 81 /* from reverse.c */
/* original header: int count(argc, *argv[]) */
/* Prompts for a file name, echoes that file to stdout character by
 * character, and prints a character count at the end. */
void countall(void)
{
int ch; // place to store each character as read
FILE *fp; // "file pointer"
long unsigned count = 0;
char file[SLEN]; /* from reverse.c */
/*Checks whether a file name was included when run from the command prompt
* The argument count includes the program file name. A count of 2 indicates
* that an additional parameter was passed
if (argc != 2)
{
printf("Usage: %s filename\n", argv[0]);
exit(EXIT_FAILURE);
}
* The following uses the second parameter as the file name
* and attempts to open the file
if ((fp = fopen(argv[1], "r")) == NULL)
{
printf("Can't open %s\n", argv[1]);
exit(EXIT_FAILURE);
} */
/*************************************
Code from reverse.c included to make the program work from within our IDE
*************************************/
puts("Enter the name of the file to be processed:");
/* NOTE(review): "%s" has no field width, so a name longer than SLEN - 1
 * characters overruns file[]. */
scanf("%s", file);
if ((fp = fopen(file,"rb")) == NULL) /* read mode */
{
printf("count program can't open %s\n", file);
exit(EXIT_FAILURE);
}
/* EOF reached when C realizes it tried to reach beyond the end of the file! */
/* This is good design - see page 573 */
while ((ch = getc(fp)) != EOF)
{
/* NOTE(review): both branches test isprint(ch), so the else-if branch can
 * never run; a printable character is counted once here. */
if (isprint(ch)) {
count++;
}
else if (isprint(ch)) {
count++;
}
putc(ch,stdout); // same as putchar(ch);
/* NOTE(review): this increment runs for every character read, so printable
 * characters end up counted twice and all other characters once. */
count++;
}
fclose(fp);
printf("\nFile %s has %lu characters\n", file, count);
}
I expected I would get the correct number of visible characters using the combination of isprint and isspace but I usually get 2086.
The assignment directions are: "Word identifies 977 characters including spaces. Your current countall() believes there are 1043. Make the corrections necessary to your code to count only the visible characters and spaces! (Hint: check out 567 in your textbook.)" Before I edited any code the count was 1043, now i am getting 2020. I need 977.
isprint() returns a Boolean result - zero if the character is not "printable", and non-zero if it is. As such, isprint(ch) != '\n' makes no sense. Your complete expression in the question makes even less sense, but I'll come on to that at the end.
isprint() on its own returns true (non-zero) for all printable characters, so you need no other tests. Moreover you increment count unconditionally and in every conditional block, so you are counting every character and some twice.
You just need:
if( isprint(ch) )
{
count++;
}
putc( ch, stdout ) ;
While your code is clearly an incomplete fragment, it is not clear where or how you are reading ch. You need a getc() or equivalent in there somewhere.
while( (ch = getc(fp)) != EOF )
{
if( isprint(ch) )
{
count++;
}
putc( ch, stdout ) ;
}
It is not clear whether you need to count all whitespace (including space, tab and newline) or just "spaces" as you stated. If so be clear that isprint() will match space, but not control characters newline or tab. isspace() matches all these, but should not be counted separately to isprint() because 'space' is in both white-space and printable sets. If newline and tab are to be counted (and less likely; "vertical tab") then:
while( (ch = getc(fp)) != EOF )
{
if( isprint(ch) || isspace(ch) )
{
count++;
}
putc( ch, stdout ) ;
}
Another aspect of C that you seem to misunderstand is how Boolean expressions work. To test a single variable for multiple values you must write:
if( var == x || var == y || var == z )
You have written:
if( var == x || y || z )
which may make sense in English (or other natural language) when you read it out aloud, but in C it means:
if( var == (x || y || z ) )
evaluating (x || y || z ) as either true or false and comparing it to var.
It is probably worth considering the semantics of your existing solution to show why it actually compiles, but produces the erroneous result it does.
Firstly,
isprint(ch) != '\n' || '\t' || '\0'
is equivalent to isprint(ch) != true, for the reasons described earlier. So you increment the counter for all characters that are not printable.
Then here:
isspace(ch) == NULL
NULL is a macro representing an invalid pointer, and isspace() does not return a pointer. However NULL will implicitly cast to zero (or false). So here you increment the counter for all printable characters that are not spaces.
Finally, you unconditionally count every character here:
putc(ch,stdout); // same as putchar(ch);
count++;
So your result will be:
number-of-non-printing-characters +
number-of-printing-characters - number-of-spaces +
total-number-of-characters
which is I think (2 x file-length) - number-of-spaces
Finally note that if you open a text file that has CR+LF line ends (conventional for text files on Windows) in "binary" mode, isspace() will count two characters for every new-line. Be sure to open in "text" mode (regardless of the platform).
From isprint():
A printable character is a character that occupies a printing position on a display (this is the opposite of a control character, checked with iscntrl).
and
A value different from zero (i.e., true) if indeed c is a printable character. Zero (i.e., false) otherwise.
So that function should be sufficient. Please note that you have to make sure to feed all these is...() functions from <ctype.h> unsigned values. So if you use it with a value of uncertain origin, better cast to char unsigned.
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
/* Count the printable characters (as classified by isprint) in "test.txt"
 * and print the total. Exits with EXIT_FAILURE if the file cannot be opened. */
int main(void)
{
    char const *filename = "test.txt";
    long long unsigned count = 0;
    int ch;
    FILE *input = fopen(filename, "r");

    if (input == NULL) {
        fprintf(stderr, "Couldn't open \"%s\" for reading. :(\n\n", filename);
        return EXIT_FAILURE;
    }

    while ((ch = fgetc(input)) != EOF) {
        if (isprint(ch) != 0)
            count += 1;
    }

    fclose(input);
    printf("Count: %llu\n\n", count);
    return 0;
}
If I wasn't lucky enough to guess which characters you want to be counted, have a look at ctype.h, there is a table.
if ((ch == '\t') || isprint(ch))
count++;
If you want to handle tabs differently (maybe to count how many spaces they use):
if (ch == '\t') {
/* Do smth */
} else if (isprint(ch)) {
count++;
}
This should be enough.
I am working on a Lexical Analysis program , everything works fine when detecting a single line comment. This is my code for single line comment detection.
//Single Comment
if ((Current_Character == '/') && (fgetc(File_Input) == '/')){
printf("%c", Current_Character);
do{
printf ("%c", Current_Character);
Current_Character = fgetc (File_Input);
}while(Current_Character != '\n');
printf("\b \t | COMMENT\n", Current_Character);
i = -1;
Lexeme_Count++;
Comment_Count++;
}
But when i am trying to detect the Multi-Line comment it got a logical error which it cannot detect the opening asterisk. here is my code for Multi-Line comment detection:
//Multi-Line Comment
if((Current_Character == '/') && (fgetc(File_Input) == '*')){
printf ("%c", fgetc(File_Input));
do{
printf ("%c", Current_Character);
Current_Character = fgetc(File_Input);
}while(Current_Character != '/');
printf("\b | COMMENT\n", Current_Character);
i = -1;
Lexeme_Count++;
Comment_Count++;
}
Current_Character holds the first character of the multi-line comment opener, which is a forward slash, and the second character, fgetc(File_Input) (the next character read from the file), should be the opening asterisk.
This is the content of the file I inputted:
#include <conio.h>
{
int a[3],t1,t2;
t1=2; a[0]=1; a[1]=2; a[t1]=3;
t2=
-
(a[2]+t1*6)/(a[2]
-
t1);
if t2>5 then
print(t2);
else {
int t3;
t3=99;
t2=
-
25;
print(
-
t1+t2*t3); // this is a comment on 2 lines
} endif /* THIS IS A MUTLI-LINE COMMENT ON 2 LINES
*/ }
This is my current output
You have:
if((Current_Character == '/') && (fgetc(File_Input) == '*')){
printf ("%c", fgetc(File_Input));
do{
printf ("%c", Current_Character);
Current_Character = fgetc(File_Input);
}while(Current_Character != '/');
The first printf() should be printing the character returned by the fgetc(), which you know to be a *, so you could use putchar('*'); or (if you really insist) printf("%c", '*') or printf("*").
Note that you've got another problem lurking:
x = a/b;
It isn't clear which of your comment blocks executes first, but both of them lose the b after the division. There are numerous other subtleties in comment detection in C — I won't bore you with them all, but suffice to say "it is hard work removing comments in C" (and harder still in C++). One of the issues you're not addressing is unexpected EOF (end of file).
You probably need a peek() function to look at the next character without consuming it:
/* Return the next character of fp without consuming it, or EOF when no
 * character can be read (in which case nothing is pushed back). */
int peek(FILE *fp)
{
    const int next = fgetc(fp);

    if (next == EOF)
        return EOF;

    ungetc(next, fp);
    return next;
}
this code snippet skips all the chars until you detect the */, considering the special cases where you have endings like ***/:
int state = 0;
while((c = getchar()) != EOF) {
switch(state) {
case 0:
switch(c) {
case '*': state = 1; continue;
default: /* process as comment char, but ignore */
continue;
} /* NOTREACHED */
case 1:
switch(c) {
case '*': continue;
case '/': /* end comment processing and return */
state = 0;
return COMMENT; /* or continue, depending on scanner */
default: /* any other char returns to state 0 */
state = 0;
/* process comment char */
continue;
} /* NOTREACHED */
} /* switch */
} /* while */
I can't seem to get it right, tried everything, but..
/* Repeatedly asks for a file name, scans that file line by line for a
 * double-slash comment introducer, and prints the number of comment
 * characters found. p, dir and checker() are defined outside this snippet. */
int commentChars() {
char str[256], fileName[256];
FILE *fp;
int i;
do{
long commentCount=0;
fflush(stdin);
printf("%s\nEnter the name of the file in %s/", p, dir);
/* NOTE(review): gets() cannot limit how much it writes into fileName. */
gets(fileName);
if(!(fp=fopen(fileName, "r"))) {
printf("Error! File not found, try again");
return 0;
}
/* NOTE(review): feof() only becomes true after a read has failed, and the
 * fgets() result below is not checked, so the last line is scanned again
 * after the final failed read. */
while(!feof(fp)) {
fgets(str,sizeof str,fp);
/* NOTE(review): this inner i shadows the outer one, and the bound lets
 * str[i] and str[i+1] index past the end of the 256-byte array as well as
 * past the terminator of the line just read. */
for(int i=0;i<=sizeof str;i++) {
if(str[i] == '/' && str[i+1] == '/') {
/* Adds the whole line length minus the two introducer characters. */
commentCount += (strlen(str)-2);
}
}
}
fclose(fp);
printf("All the chars, contained in a comment: %ld\n", commentCount);
puts(p);
printf("Do you want to search for another file?<Y/N>: ");
/* Loop again while checker() returns non-zero. */
i=checker();
}while(i);}
The result is "All the chars, contained in a comment: 0", even though I have comments.
And my second question was.. Analogically, how can I do the same for comments, containing /* */, seems like an impossible job for me.
I think you best use regular expressions. They seem scary, but they're really not that bad for things like this. You can always try playing some regex golf to practice ;-)
I'd approach it as follows:
Build a regular expression that captures comments
Scan your file for it
Count the characters in the match
Using some regex code and a bit about matching comments in C, I hacked this together which should allow you to count all the bytes that are part of a block style comment /* */ - Including the delimiters. I only tested it on OS X. I suppose you can handle the rest?
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#define MAX_ERROR_MSG 0x1000

/*
 * Compile regex_text as an extended, newline-sensitive regular expression
 * into *r.
 *
 * Returns 0 on success. On failure prints the regerror() message and
 * returns 1; *r must not be used or freed in that case.
 *
 * REG_ENHANCED is an extension that not every libc provides, so it is only
 * requested where the header defines it. Patterns that depend on enhanced
 * syntax will behave differently where it is missing.
 */
int compile_regex(regex_t *r, const char *regex_text)
{
    int flags = REG_EXTENDED | REG_NEWLINE;
    int status;

#ifdef REG_ENHANCED
    flags |= REG_ENHANCED;
#endif

    status = regcomp(r, regex_text, flags);
    if (status != 0) {
        char error_message[MAX_ERROR_MSG];

        regerror(status, r, error_message, sizeof error_message);
        printf("Regex error compiling '%s': %s\n",
               regex_text, error_message);
        return 1;
    }
    return 0;
}
/*
 * Find every match of the compiled expression r in the NUL-terminated
 * string to_match, print each one, and add its length in bytes to *nbytes.
 *
 * Always returns a non-zero regexec() status once the search is over
 * (REG_NOMATCH in the normal case), as the original did.
 */
int match_regex(regex_t *r, const char * to_match, long long *nbytes)
{
    /* Start of the text not yet searched. */
    const char *p = to_match;
    /* Only the whole match is used; sub-expressions are ignored. */
    regmatch_t m[1];

    for (;;) {
        int nomatch = regexec(r, p, 1, m, 0);
        regoff_t length;

        if (nomatch) {
            printf("No more matches.\n");
            return nomatch;
        }

        length = m[0].rm_eo - m[0].rm_so;
        *nbytes += length;
        /* regoff_t is not long long everywhere; convert for the format. */
        printf("match length(bytes) : %lld\n", (long long)length);
        printf("Match: %.*s\n\n", (int)length, p + m[0].rm_so);

        if (m[0].rm_eo > 0) {
            p += m[0].rm_eo;
        } else if (*p != '\0') {
            /* Empty match at the current position: step over one byte so
             * the search cannot repeat the same empty match forever. */
            p++;
        } else {
            printf("No more matches.\n");
            return REG_NOMATCH;
        }
    }
}
/*
 * Read the file named on the command line into memory and report how many
 * bytes lie inside block comments, delimiters included.
 *
 * The file is scanned as one C string, so scanning stops at the first NUL
 * byte in the file, if there is one.
 */
int main(int argc, char *argv[])
{
    regex_t r;
    /* Block comment in plain POSIX extended syntax. REG_NEWLINE stops
     * negated bracket expressions from matching a newline, so the newline
     * is listed as an explicit alternative. The match ends at the first
     * closing marker without needing a non-greedy quantifier. */
    char regex_text[] = "/\\*([^*]|\n)*\\*+(([^*/]|\n)([^*]|\n)*\\*+)*/";
    long long comment_bytes = 0;
    char *file_contents;
    long input_file_size;
    size_t bytes_read;
    FILE *input_file;

    if (argc != 2) {
        fprintf(stderr, "Usage : %s <filename>\n", argc > 0 ? argv[0] : "program");
        return EXIT_FAILURE;
    }

    input_file = fopen(argv[1], "rb");
    if (input_file == NULL) {
        perror(argv[1]);
        return EXIT_FAILURE;
    }
    if (fseek(input_file, 0, SEEK_END) != 0
        || (input_file_size = ftell(input_file)) < 0) {
        perror(argv[1]);
        fclose(input_file);
        return EXIT_FAILURE;
    }
    rewind(input_file);

    /* One extra byte for the terminator regexec() relies on. */
    file_contents = malloc((size_t)input_file_size + 1);
    if (file_contents == NULL) {
        perror("malloc");
        fclose(input_file);
        return EXIT_FAILURE;
    }
    bytes_read = fread(file_contents, sizeof(char), (size_t)input_file_size, input_file);
    if (ferror(input_file)) {
        perror(argv[1]);
        free(file_contents);
        fclose(input_file);
        return EXIT_FAILURE;
    }
    fclose(input_file);
    file_contents[bytes_read] = '\0';

    if (compile_regex(&r, regex_text) != 0) {
        free(file_contents);
        return EXIT_FAILURE;
    }
    match_regex(&r, file_contents, &comment_bytes);
    regfree(&r);
    free(file_contents);

    printf("Found %lld bytes in comments\n", comment_bytes);
    return 0;
}
This basically trivial modification of your code deals with several problems in your code.
You should not use feof() like that — `while (!feof(file))` is always wrong.
You should not read data that is not part of the string just read.
I've also refactored your code so that the function takes a file name, opens, counts and closes it, and reports on what it found.
#include <stdio.h>
#include <string.h>
// Revised interface - process a given file name, reporting
/*
 * Count the characters that follow the first double-slash introducer on
 * each line of the named file, and print the total on standard output.
 *
 * The introducer itself is not counted; the newline kept by fgets() is.
 * Text that precedes the introducer on the same line is code, not comment,
 * and is excluded. If the file cannot be opened, an error goes to stderr
 * and nothing is counted.
 *
 * Limitations: lines longer than the buffer are examined in pieces, and an
 * introducer inside a string literal is still treated as a comment.
 */
static void commentChars(char const *file)
{
    char str[256];
    FILE *fp;
    long commentCount = 0;

    if (!(fp = fopen(file, "r")))
    {
        fprintf(stderr, "Error! File %s not found\n", file);
        return;
    }

    while (fgets(str, sizeof(str), fp) != 0)
    {
        size_t len = strlen(str);

        for (size_t i = 0; i + 1 < len; i++)
        {
            if (str[i] == '/' && str[i + 1] == '/')
            {
                /* Everything after the two introducer characters. */
                commentCount += (long)(len - i - 2);
                break;
            }
        }
    }

    fclose(fp);
    printf("%s: Number of characters contained in comments: %ld\n", file, commentCount);
}
/* Process every file named on the command line, or standard input (via
 * the /dev/stdin path) when no file is given. */
int main(int argc, char **argv)
{
    if (argc == 1)
    {
        commentChars("/dev/stdin");
        return 0;
    }

    for (char **arg = argv + 1; arg < argv + argc; arg++)
        commentChars(*arg);

    return 0;
}
When run on the source code (ccc.c), it yields:
ccc.c: Number of characters contained in comments: 58
The comment isn't really complete (oops), but it serves to show what goes on. It counts the newline which fgets() preserves as part of the comment, though the // introducer is not counted.
Dealing with /* comments is harder. You need to spot a slash followed by a star, and then read up to the next star slash character pair. This is probably more easily done using character by character input than line-by-line input; you will, at least, need to be able to interleave character analysis with line input.
When you're ready for it, you can try this torture test on your program. It's what I use to check my comment stripper, SCC (which doesn't handle trigraphs — by conscious decision; if the source contains trigraphs, I have a trigraph remover which I use on the source first).
/*
#(#)File: $RCSfile: scc.test,v $
#(#)Version: $Revision: 1.7 $
#(#)Last changed: $Date: 2013/09/09 14:06:33 $
#(#)Purpose: Test file for program SCC
#(#)Author: J Leffler
*/
/*TABSTOP=4*/
// -- C++ comment
/*
Multiline C-style comment
#ifndef lint
static const char sccs[] = "#(#)$Id: scc.test,v 1.7 2013/09/09 14:06:33 jleffler Exp $";
#endif
*/
/*
Multi-line C-style comment
with embedded /* in line %C% which should generate a warning
if scc is run with the -w option
Two comment starts /* embedded /* in line %C% should generate one warning
*/
/* Comment */ Non-comment /* Comment Again */ Non-Comment Again /*
Comment again on the next line */
// A C++ comment with a C-style comment marker /* in the middle
This is plain text under C++ (C99) commenting - but comment body otherwise
// A C++ comment with a C-style comment end marker */ in the middle
The following C-style comment end marker should generate a warning
if scc is run with the -w option
*/
Two of these */ generate */ one warning
It is possible to have both warnings on a single line.
Eg:
*/ /* /* */ */
SCC has been trained to handle 'q' single quotes in most of
the aberrant forms that can be used. '\0', '\\', '\'', '\\
n' (a valid variant on '\n'), because the backslash followed
by newline is elided by the token scanning code in CPP before
any other processing occurs.
This is a legitimate equivalent to '\n' too: '\
\n', again because the backslash/newline processing occurs early.
The non-portable 'ab', '/*', '*/', '//' forms are handled OK too.
The following quote should generate a warning from SCC; a
compiler would not accept it. '
\n'
" */ /* SCC has been trained to know about strings /* */ */"!
"\"Double quotes embedded in strings, \\\" too\'!"
"And \
newlines in them"
"And escaped double quotes at the end of a string\""
aa '\\
n' OK
aa "\""
aa "\
\n"
This is followed by C++/C99 comment number 1.
// C++/C99 comment with \
continuation character \
on three source lines (this should not be seen with the -C flag)
The C++/C99 comment number 1 has finished.
This is followed by C++/C99 comment number 2.
/\
/\
C++/C99 comment (this should not be seen with the -C flag)
The C++/C99 comment number 2 has finished.
This is followed by regular C comment number 1.
/\
*\
Regular
comment
*\
/
The regular C comment number 1 has finished.
/\
\/ This is not a C++/C99 comment!
This is followed by C++/C99 comment number 3.
/\
\
\
/ But this is a C++/C99 comment!
The C++/C99 comment number 3 has finished.
/\
\* This is not a C or C++ comment!
This is followed by regular C comment number 2.
/\
*/ This is a regular C comment *\
but this is just a routine continuation *\
and that was not the end either - but this is *\
\
/
The regular C comment number 2 has finished.
This is followed by regular C comment number 3.
/\
\
\
\
* C comment */
The regular C comment number 3 has finished.
Note that \u1234 and \U0010FFF0 are legitimate Unicode characters
(officially universal character names) that could appear in an
id\u0065ntifier, a '\u0065' character constant, or in a "char\u0061cter\
string". Since these are mapped long after comments are eliminated,
they cannot affect the interpretation of /* comments */. In particular,
none of \u0002A. \U0000002A, \u002F and \U0000002F ever constitute part
of a comment delimiter ('*' or '/').
More double quoted string stuff:
if (logtable_out)
{
sprintf(logtable_out,
"insert into %s (bld_id, err_operation, err_expected, err_sql_stmt, err_sql_state)"
" values (\"%s\", \"%s\", \"%s\", \"", str_logtable, blade, operation, expected);
/* watch out for embedded double quotes. */
}
/* Non-terminated C-style comment at the end of the file
#include <stdio.h>
/*
 * Count the characters that lie inside comments in the C source read
 * from fp.
 *
 * Comment delimiters themselves are not counted. The newline that ends a
 * line comment is counted. Comment markers inside string literals and
 * character constants are ignored.
 *
 * Inside a literal, a backslash escapes whatever character follows it. This
 * matters for a literal ending in an escaped backslash: the closing quote
 * right after it must still end the literal.
 */
size_t counter(FILE *fp){
    enum { none, in_line_comment, in_range_comment, in_string, in_char_constant } status = none;
    size_t count = 0;
    int ch, chn;

    while(EOF!=(ch=fgetc(fp))){
        switch(status){
        case in_line_comment :
            if(ch == '\n')
                status = none;
            ++count;
            break;
        case in_range_comment :
            if(ch == '*'){
                chn = fgetc(fp);
                if(chn == '/'){
                    status = none;  /* closing marker: not counted */
                    break;
                }
                /* Not the end: re-read chn next time (it may be another
                 * star). ungetc(EOF) is a no-op, so end of file is safe. */
                ungetc(chn, fp);
            }
            ++count;
            break;
        case in_string :
        case in_char_constant :
            if(ch == '\\'){
                /* Skip the escaped character, whatever it is. */
                (void)fgetc(fp);
            } else if(ch == (status == in_string ? '"' : '\'')){
                status = none;
            }
            break;
        case none :
            if(ch == '/'){
                chn = fgetc(fp);
                if(chn == '/')
                    status = in_line_comment;
                else if(chn == '*')
                    status = in_range_comment;
                else
                    ungetc(chn, fp);
            } else if(ch == '"'){
                status = in_string;
            } else if(ch == '\''){
                status = in_char_constant;
            }
            break;
        }
    }
    return count;
}
/* Count the comment characters in the C source read from standard input
 * and print the total. */
int main(void){
    FILE *fp = stdin;
    size_t c = counter(fp);

    printf("%zu\n", c);  /* %zu is the conversion that matches size_t */
    return 0;
}
I need to validate that a text file is in CSV format (i.e. that each digit is separated by a comma).
From reading online, it seems that people have conflicting views about it - but is Strtok() the best way to do this?
Any help would be great.
Your input seems so easy that I would probably just use a loop around fgetc(3); I'll sketch some pseudo-code here:
fd = fopen("file", "r");
int c;
while((c=fgetc(fd)) != EOF) {
switch(c) {
case '0':
case '1':
/* so on */
case '9':
handle_digit(c);
break;
case ',':
handle_comma();
break;
case '\n':
handle_newline();
break;
default:
fprintf(stderr, "mistaken input %c\n", c);
break;
}
}
fclose(fd);
You'll have to manage the input in the functions in a manner that may be a bit awkward if you're used to higher-level languages such as Ruby or Python where you'd just run line.split(',') to get a list of numbers, but that is pretty idiomatic C.
Of course, if this were a real problem, I'd probably prefer flex and bison, and write a tiny lexer and grammar, mostly because it would be a lot easier to extend in the future as needs change.
Update
With some additional criteria to check, the handle_{digit,comma,newline}() routines are easier to sketch. I'll sketch using global variables, but you could just as easily stuff these into a struct and pass them around from function to function:
/* Kind of the most recently consumed input character. */
enum seen {
    NEWLINE,
    COMMA,
    DIGIT,
};

/* Starts as NEWLINE so the first character is treated as the start of a
 * fresh line. */
enum seen last_seen = NEWLINE;

/* Record that a digit was read. The branches are placeholders for the
 * validation rules; c is the digit itself and is unused in this sketch. */
void handle_digit(int c) {
    (void)c;
    if (last_seen == DIGIT) {
        /* error if numbers cannot have multiple digits
           or construct a larger number if numbers can have
           multiple digits */
    } else if (last_seen == COMMA || last_seen == NEWLINE) {
        /* start a new entry */
    }
    last_seen = DIGIT;
}

/* Record that a comma was read. */
void handle_comma(void) {
    if (last_seen == COMMA) {
        /* error */
    } else if (last_seen == NEWLINE) {
        /* error */
    } else if (last_seen == DIGIT) {
        /* end previous field */
    }
    last_seen = COMMA;
}

/* Record that a newline was read. */
void handle_newline(void) {
    if (last_seen == NEWLINE) {
        /* error */
    } else if (last_seen == COMMA) {
        /* error */
    } else if (last_seen == DIGIT) {
        /* end previous field */
    }
    last_seen = NEWLINE;
}
Add whichever checks you need to validate the contents according to whichever rules you have. You might wish to standardize the order and contents of the tests to ensure that you never forget one, even if it means you write a /* nop */ comment once or twice to remind yourself that something is fine.
I want to split a C file into tokens, not for compiling but for analyzing. I feel like this should be pretty straight-forward, and tried looking online for a defined tokens.l (or something similar) file for flex with all the C grammar already defined, but couldn't find anything. I was wondering if there are any sort of defined grammars floating around, or if perhaps I'm going about this all wrong?
Yes, there's at least one around.
Edit:
Since there are a few issues that doesn't handle, perhaps it's worth looking at some (hand written) lexing code I wrote several years ago. This basically only handles phases 1, 2 and 3 of translation. If you define DIGRAPH, it also turns on some code to translate C++ digraphs. If memory serves, however, it's doing that earlier in translation than it should really happen, but you probably don't want it in any case. OTOH, this does not even attempt to recognize anywhere close to all tokens -- mostly it separates the source into comments, character literals, string literals, and pretty much everything else. OTOH, it does handle trigraphs, line splicing, etc.
I suppose I should also add that this leaves conversion of the platform's line-ending character into a new-line to the underlying implementation by opening the file in translated (text) mode. Under most circumstances, that's probably the right thing to do, but if you want to produce something like a cross-compiler where your source files have a different line-ending sequence than is normal for this host, you might have to change that.
First the header that defines the external interface to all this stuff:
/* get_src.h */
#ifndef GET_SRC_INCLUDED
#define GET_SRC_INCLUDED
#include <stdio.h>
#ifdef __cplusplus
extern "C" {
#endif
/* This is the size of the largest token we'll attempt to deal with. If
* you want to deal with bigger tokens, change this, and recompile
* get_src.c. Note that an entire comment is treated as a single token,
* so long comments could overflow this. In case of an overflow, the
* entire comment will be read as a single token, but the part larger
* than this will not be stored.
*/
#define MAX_TOKEN_SIZE 8192
/* `last_token' will contain the text of the most recently read token (comment,
* string literal, or character literal).
*/
extern char last_token[];
/* This is the maximum number of characters that can be put back into a
* file opened with parse_fopen or parse_ffopen.
*/
#define MAX_UNGETS 5
#include <limits.h>
#include <stdio.h>
/* A stream being parsed, together with a small stack of characters that
 * have been pushed back onto it. */
typedef struct {
FILE *file; /* underlying stdio stream */
char peeks[MAX_UNGETS]; /* pushed-back characters; the most recent is read first */
int last_peek; /* number of characters currently stored in peeks */
} PFILE;
/* Some codes we return to indicate having found various items in the
* source code. ERROR is returned to indicate a newline found in the
* middle of a character or string literal or if a file ends inside a
* comment, or if a character literal contains more than two characters.
*
* Note that this starts at INT_MIN, the most negative number available
* in an int. This keeps these symbols from conflicting with any
* characters read from the file. However, one of these could
* theoretically conflict with EOF. EOF usually -1, and these are far
* more negative than that. However, officially EOF can be any value
* less than 0...
*/
enum {
ERROR = INT_MIN,
COMMENT,
CHAR_LIT,
STR_LIT
};
/* Opens a file for parsing and returns a pointer to a structure which
* can be passed to the other functions in the parser/lexer to identify
* the file being worked with.
*/
PFILE *parse_fopen(char const *name);
/* This corresponds closely to fdopen - it takes a FILE * as its
* only parameter, creates a PFILE structure identifying that file, and
* returns a pointer to that structure.
*/
PFILE *parse_ffopen(FILE *stream);
/* Corresponds to fclose.
*/
int parse_fclose(PFILE *stream);
/* Returns characters from `stream' read as C source code. String
 * literals, character literals and comments are each returned as a
 * single code from those above, with their text left in last_token.
 * A run of whitespace is returned as a single space character, except
 * that a run ending in a newline is returned as '\n'.
 */
int get_source(PFILE *stream);
/* These two are loosely analogous to the standard getc and ungetc.
 * get_character returns the next character with whitespace collapsed and
 * escaped newlines removed; unget_character pushes a character back so
 * that it is read again.  Unlike ungetc, unget_character can put back
 * more than one character (up to MAX_UNGETS); pushes beyond that limit
 * are silently dropped.
 */
int get_character(PFILE *stream);
void unget_character(int ch, PFILE *stream);
#ifdef __cplusplus
}
#endif
#endif
And then the implementation of all that:
/* get_src.c */
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include <stdlib.h>
#define GET_SOURCE
#include "get_src.h"
/* Index of the next free slot in last_token; reset by clear() and advanced
 * by addchar().
 */
static size_t current = 0;
/* Text of the most recently read comment or literal (declared extern in
 * get_src.h).
 */
char last_token[MAX_TOKEN_SIZE];
/* Opens the file called `name' for reading and wraps it in a newly
 * allocated PFILE whose push-back stack is empty.
 *
 * Returns NULL if either the allocation or the fopen fails.  In the latter
 * case the partially built PFILE is released first, so a caller only has
 * to compare the result against NULL and never receives a PFILE whose
 * `file' member is NULL.  A successful result is released with
 * parse_fclose.
 */
PFILE *parse_fopen(char const *name) {
    PFILE *temp = malloc(sizeof *temp);

    if (NULL == temp)
        return NULL;

    temp->file = fopen(name, "r");
    if (NULL == temp->file) {
        /* Don't hand back a handle that every reader would crash on. */
        free(temp);
        return NULL;
    }

    memset(temp->peeks, 0, sizeof temp->peeks);
    temp->last_peek = 0;
    return temp;
}
/* Wraps an already-open stream in a newly allocated PFILE with an empty
 * push-back stack.  Returns NULL if the PFILE cannot be allocated; the
 * stream itself is stored as given.
 */
PFILE *parse_ffopen(FILE *file) {
    PFILE *wrapper = malloc(sizeof(PFILE));

    if (wrapper == NULL)
        return wrapper;

    wrapper->last_peek = 0;
    memset(wrapper->peeks, 0, sizeof(wrapper->peeks));
    wrapper->file = file;
    return wrapper;
}
/* Closes the stream held by `stream', frees the PFILE itself, and returns
 * whatever fclose reported.
 */
int parse_fclose(PFILE *stream) {
    FILE *underlying = stream->file;
    int status = fclose(underlying);

    free(stream);
    return status;
}
/* Appends `ch' to last_token.  Once the buffer holds sizeof(last_token)-1
 * characters further characters are discarded, and the final slot is set
 * to '\0' so that the stored text stays a valid string.
 */
static void addchar(int ch) {
    const size_t limit = sizeof(last_token) - 1;

    if (current < limit) {
        last_token[current] = (char)ch;
        current++;
    }
    if (current == limit) {
        last_token[limit] = '\0';
    }
}
/* Discards the previous token: the next addchar() writes at the start of
 * last_token again.  The old contents are not erased.
 */
static void clear(void)
{
    current = 0;
}
/* Returns the next raw character for `stream': the most recently pushed-back
 * character if there is one, otherwise the next character from the file
 * (or EOF, as reported by fgetc).
 */
static int read_char(PFILE *stream) {
    if (stream->last_peek <= 0)
        return fgetc(stream->file);

    stream->last_peek -= 1;
    return stream->peeks[stream->last_peek];
}
/* Pushes `ch' back onto `stream' so that it is the next character read.
 * If the push-back stack is already full the character is silently
 * dropped.
 */
void unget_character(int ch, PFILE * stream) {
    if (stream->last_peek >= sizeof(stream->peeks))
        return;

    stream->peeks[stream->last_peek] = ch;
    stream->last_peek++;
}
/* Checks for a trigraph and returns the equivalent character if there is
 * one.  Expects that the leading '?' of the trigraph has already been read
 * before this is called.
 *
 * If the input is not a trigraph, the characters that were looked at are
 * pushed back in their original order and the leading '?' is returned
 * unchanged.
 */
static int check_trigraph(PFILE *stream) {
    int ch = read_char(stream);

    if ('?' != ch) {
        unget_character(ch, stream);
        return '?';
    }

    ch = read_char(stream);
    switch (ch) {
    case '(':  return '[';
    case ')':  return ']';
    case '/':  return '\\';
    case '\'': return '^';
    case '<':  return '{';
    case '>':  return '}';
    case '!':  return '|';
    case '-':  return '~';
    case '=':  return '#';
    default:
        /* Not a trigraph.  The push-back store is last-in, first-out, so
         * the third character has to go back first and the second '?'
         * last; otherwise "??x" would be re-read as "?x?".
         */
        unget_character(ch, stream);
        unget_character('?', stream);
        return '?';
    }
}
#ifdef DIGRAPH
/* Checks for a digraph.  `first' is the character that has already been
 * read (several characters can start a digraph); the following character
 * is read here.  Returns the character the digraph stands for, or, when
 * the pair is not a digraph, pushes the second character back and returns
 * `first' unchanged.
 */
static int check_digraph(PFILE *stream, int first) {
    int second = read_char(stream);

    if ('<' == first) {
        if ('%' == second)
            return '{';
        if (':' == second)
            return '[';
    } else if (':' == first) {
        if ('>' == second)
            return ']';
    } else if ('%' == first) {
        if ('>' == second)
            return '}';
        if (':' == second)
            return '#';
    }

    /* Not one of the recognised pairs: leave both characters as they were. */
    unget_character(second, stream);
    return first;
}
#endif
/* Reads one character from the stream, replacing a trigraph (and, when
 * DIGRAPH is defined, a digraph) with the single character it represents.
 * Handling digraphs this early in translation isn't really correct (and
 * shouldn't happen in C at all), which is why it is optional.
 */
static int get_char(PFILE *stream) {
    int ch = read_char(stream);

    switch (ch) {
    case '?':
        return check_trigraph(stream);
#ifdef DIGRAPH
    case '<':
    case ':':
    case '%':
        return check_digraph(stream, ch);
#endif
    default:
        return ch;
    }
}
/* Gets a character from `stream'.
 *
 * - A backslash immediately followed by a newline (a line splice) is
 *   removed entirely and reading continues with the character after it.
 * - A backslash followed by anything else is returned as '\\'; the
 *   following character is pushed back and returned by the next call.
 * - A run of whitespace that ends in a newline is returned as '\n'; any
 *   other run of whitespace is returned as a single space.
 * - Everything else (including EOF) is returned unchanged.
 */
int get_character(PFILE *stream) {
    int ch;
    int next;

    /* Handle line splicing; there may be several splices in a row. */
    for (;;) {
        ch = get_char(stream);
        if (ch != '\\')
            break;
        next = get_char(stream);
        if (next != '\n') {
            /* An ordinary backslash: hand it back and keep what follows
             * for the next call.
             */
            unget_character(next, stream);
            return '\\';
        }
    }

    if (!isspace(ch))
        return ch;

    /* Skip over consecutive white-space, stopping at a newline. */
    while (isspace(ch) && ('\n' != ch))
        ch = get_char(stream);
    if ('\n' == ch)
        return ch;

    /* Put the non-whitespace character back and report a single space. */
    unget_character(ch, stream);
    return ' ';
}
/* Used internally by `get_source'.  Expects the opening quote of a
 * character literal to have already been read, and reads up to and
 * including the closing quote, storing the text in last_token.
 *
 * Returns CHAR_LIT on success.  Returns ERROR if a newline or the end of
 * the file is reached before the closing quote, or if the literal
 * contains more than two characters after escapes are taken into account
 * (an escape sequence such as \n counts as one character).  In every case
 * last_token is left NUL-terminated.
 */
static int read_char_lit(PFILE *stream) {
    int ch;
    int count = 0;

    clear();
    addchar('\'');

    /* Always read through to the closing quote, even for an over-long
     * literal, so the quote is not mistaken for the start of a new one.
     */
    while ('\'' != (ch = read_char(stream))) {
        if (ch == '\n' || ch == EOF) {
            addchar('\0');
            return ERROR;
        }
        addchar(ch);
        if (ch == '\\') {
            /* The escaped character belongs to the same literal character. */
            ch = get_char(stream);
            if (ch == EOF) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);
        }
        count++;
    }

    addchar('\'');
    addchar('\0');
    if (count > 2)
        return ERROR;
    return CHAR_LIT;
}
/* Used internally by get_source.  Expects the opening quote of a string
 * literal to have already been read, and reads up to and including the
 * closing quote, storing the text in last_token.
 *
 * Returns STR_LIT on success, or ERROR if an un-escaped newline or the end
 * of the file is found before the closing quote.  last_token is left
 * NUL-terminated on both paths, so the partial text of a broken literal
 * can still be printed safely.
 */
static int read_str_lit(PFILE *stream) {
    int ch;

    clear();
    addchar('"');

    while ('"' != (ch = get_char(stream))) {
        if ('\n' == ch || EOF == ch) {
            addchar('\0');
            return ERROR;
        }
        addchar(ch);
        if (ch == '\\') {
            /* Take the escaped character as-is so that an escaped quote
             * does not end the literal.
             */
            ch = read_char(stream);
            if (EOF == ch) {
                addchar('\0');
                return ERROR;
            }
            addchar(ch);
        }
    }

    addchar('"');
    addchar('\0');
    return STR_LIT;
}
/* Reads a comment from `stream' into last_token.  Assumes the leading '/'
 * has already been read.
 *
 * - If the next character is '/', the rest of the line is read as a
 *   single-line comment.  The terminating newline is consumed but not
 *   stored; reaching the end of the file also ends the comment.
 * - If the next character is '*', everything up to and including the
 *   closing star-slash pair is read as a block comment.
 * - Otherwise the character is pushed back and '/' is returned, since the
 *   slash was not the start of a comment.
 *
 * Returns COMMENT for a complete comment, or ERROR if the file ends inside
 * a block comment.  last_token is NUL-terminated in both cases.
 */
static int read_comment(PFILE *stream) {
    int ch;
    int prev;

    clear();
    ch = get_char(stream);

    /* Handle a single line comment. */
    if ('/' == ch) {
        addchar('/');
        addchar('/');
        /* Stop at EOF as well as newline: a file whose last line is a
         * comment without a trailing newline must not loop forever.
         */
        while ('\n' != (ch = get_char(stream)) && EOF != ch)
            addchar(ch);
        addchar('\0');
        return COMMENT;
    }

    if ('*' != ch) {
        unget_character(ch, stream);
        return '/';
    }

    addchar('/');
    addchar('*');

    /* Remember the previous character so that any '*' directly followed by
     * '/' closes the comment, including the last star of a run of stars.
     * `prev' starts as 0 so the opening star cannot pair with a '/' that
     * immediately follows it.
     */
    prev = 0;
    for (;;) {
        ch = get_char(stream);
        if (EOF == ch) {
            addchar('\0');
            return ERROR;
        }
        addchar(ch);
        if ('*' == prev && '/' == ch)
            break;
        prev = ch;
    }

    addchar('\0');
    return COMMENT;
}
/* Reads and returns a single "item" from the stream.  An item is a comment
 * or a literal (reported through the codes returned by the read_* helpers)
 * or a single character after trigraph and possible digraph substitution
 * has taken place.
 */
int get_source(PFILE *stream) {
    int ch = get_character(stream);

    if ('\'' == ch)
        return read_char_lit(stream);
    if ('"' == ch)
        return read_str_lit(stream);
    if ('/' == ch)
        return read_comment(stream);
    return ch;
}
#ifdef TEST
/* Test driver: scans the file named on the command line.  Ordinary
 * characters are echoed as they are read; for every negative item code
 * (comment, literal or error) the current contents of last_token are
 * printed on a line of their own.
 */
int main(int argc, char **argv) {
    PFILE *input;
    int item;

    if (2 != argc) {
        fprintf(stderr, "Usage: get_src <filename>\n");
        return EXIT_FAILURE;
    }

    input = parse_fopen(argv[1]);
    if (NULL == input) {
        fprintf(stderr, "Unable to open: %s\n", argv[1]);
        return EXIT_FAILURE;
    }

    for (item = get_source(input); EOF != item; item = get_source(input)) {
        if (item < 0) {
            printf("\n%s\n", last_token);
        } else {
            printf("%c", item);
        }
    }

    parse_fclose(input);
    return 0;
}
#endif
I'm not sure how easy or difficult it would be to integrate that into a Flex-based lexer, though. I seem to recall that Flex has some sort of hook for defining what it uses to read a character, but I've never tried to use it, so I can't say much more about it (and ultimately can't even say with anything approaching certainty that it exists).