c - Removing comments (/* and //) and strings - c

Can anyone tell me whats wrong with the following code that's suppose to remove comments and strings from an input (but not comments that's why it recognizes comments)? This is related to a prior question of mine: Removing comments with a sliding window without nested while loops
#include <stdio.h>
int main()
{
int c, c1 = 0, c2 = 0 ,state = 0, next = 0;
while(1)
{
switch(state)
{
case 0: next = ((c2 == '*' && c1 == '/') ? 1 : (c2 == '\"') ? 2 : (c2 == '/' && c1 == '/') ? 3 : (c2 == '\'') ? 4: 0); break;
case 1: next = ((c2 == '/' && c1 == '*') ? 0 : 1); break;
case 2: next = ((c2 == '\"' && c1 != '\\') ? 0 : 2); break;
case 3: next = ((c2 == '\n') ? 0 : 3); break;
case 4: next = ((c2 == '\'' && c1 != '\\') ? 0 : 4); break;
default: next = state;
}
c = getchar(); if( c < 0) break;
c1 = c2; c2 = c; // slide window
if(state == 1)
{
if(c2 == '*')
{
c = getchar();
c1 = c2; c2 = c;
if(c2 != '/')
putchar(c1);
}
else
putchar(c2);
}
else if(state == 2)
{
if(c2 != '"' || (c2 == '\"' && c1 != '\\'))
putchar(c2);
}
else if(state == 3)
{
putchar(c2);
}
else
state = next;
// c2 is the current input byte and c1 is the previous input byte
}
return 0;
}

I don't think you actually need a sliding window for your task of removing C and C++ comments. You can expand your state machine to include a few additions states for tracking escapes, etc... With more states the code gets a bit bigger, but it might make it conceptually simpler since you only have one state to track. So converting the spirit of your code to the new state machine formula I'd suggest, you get the code below (and I also agree with Basile's suggestion of using enums and included it).
#include <stdio.h>
int main()
{
enum {
START, SLASH,
STRING, CHAR, STRING_ESCAPE, CHAR_ESCAPE,
SINGLE_LINE_COMMENT, MULTI_LINE_COMMENT, MULTI_LINE_END,
} state = START;
int c;
while ((c = getchar()) != EOF) {
switch (state) {
case START:
state_START:
if (c == '/') { state = SLASH; break; }
putchar(c);
if (c == '\"') state = STRING;
else if (c == '\'') state = CHAR;
break;
case SLASH:
if (c == '/') state = SINGLE_LINE_COMMENT;
else if (c == '*') state = MULTI_LINE_COMMENT;
else { state = START; goto state_START; }
break;
case STRING:
putchar(c);
if (c == '"') state = START;
else if (c == '\\') state = STRING_ESCAPE;
break;
case CHAR:
putchar(c);
if (c == '\'') state = START;
else if (c == '\\') state = CHAR_ESCAPE;
break;
case SINGLE_LINE_COMMENT:
if (c == '\n') state = START;
break;
case MULTI_LINE_COMMENT:
state_MULTI_LINE_COMMENT:
if (c == '*') state = MULTI_LINE_END;
break;
case STRING_ESCAPE:
putchar(c);
state = STRING;
break;
case CHAR_ESCAPE:
putchar(c);
state = CHAR;
break;
case MULTI_LINE_END:
if (c == '/') state = START;
else { state = MULTI_LINE_COMMENT; goto state_MULTI_LINE_COMMENT; }
break;
}
}
return 0;
}

just to give you an advice without knowing your purposes... Did you thought about regular expressions to solve your problem? It is may faster and your code will be cleaner, assuming you understand regular expressions.
I found a neat site for your problem by the way... It explains how to get those comments out of the code...
How get comments with regex
And here is a library for regex in C.

Related

program to count commented characters and words in a C file

I have to count characters and word in comment of a C file, for both single line comments and blocked comment. Here's what I have:
#include <stdio.h>
#define IN = 1
#define OUT = 0
main() {
int c, nc;
nc = 0;
while ((c = getchar()) != EOF) {
if (c == '/') {
if (getchar() == '/')
while (getchar() != '\n')
++nc;
}
}
if (c == '/') {
if (getchar() == '*')
while (getchar() != '/')
++nc;
}
printf("Character Counts: %d\n", nc);
}
It works for every single line comment (//), but it skips the blocked comments (/*...*/). I feel like it never enter the if block for the blocked comment. Much appreciate!
There are multiple problems in your code:
You must specify int as the return type of the main function. The syntax in the question is obsolete.
The definitions of IN and OUT are incorrect. You should either use
#define IN 1
#define OUT 0
or
enum { IN = 1, OUT = 0 };
The first loop consumes all the bytes in standard input, you are at the end of file, so the tests for /*...*/ comments never produce anything.
loops such as while (getchar() != '\n') can run forever if the byte tested is not found before the end of file.
You cannot test // and /*...*/ comments separately as one can hide the other:
//* this is a line comment that does not start a C style one
/* this comment contains a // but stops here */ return 0;
Note also that you should parse C strings and character constants as they may contain // and or /* sequences that do not start a comment.
For a complete solution, you should also handle escaped newlines. Here are some pathological examples:
// this is a line comment that extends \
on multiple \
lines (and tricks the colorizer)
/\
* this is a C comment, but the colorizer missed it *\
/
This problem is non-trivial to solve in the general case, but you can start with simple cases.
Here is a modified version:
#include <stdio.h>
int main() {
int c, cc, nc = 0;
while ((c = getchar()) != EOF) {
if (c == '/') {
if ((cc = getchar()) == '/') {
while ((c = getchar()) != '\n')
nc++;
} else
if (cc == '*') {
while ((cc = getchar()) != EOF) {
if (cc == '*') {
if ((cc = getchar()) == '/')
break;
ungetc(cc, stdin);
}
nc++;
}
}
}
}
printf("Character Counts: %d\n", nc);
return 0;
}
I added code to count the words. It works on few occasions, but it behaves weird when I have space after the slash. For example, // comment... Most of the time, the word count is off by 1.
#include<stdio.h>
#define IN 1
#define OUT 0
int main() {
int c, cc, nc = 0;
int state;
int nw = 0;
state = OUT;
while ((c = getchar()) != EOF) {
if (c == '/') {
if ((cc = getchar()) == '/') {
while ((c = getchar()) != '\n'){
nc++;
if (c == ' ' || c == '\t')
state = OUT;
else if (state == OUT){
state = IN;
nw++;
}
}
}
else if (cc == '*') {
while ((cc = getchar()) != EOF) {
if (cc == ' ' || cc == '\t')
state = OUT;
else if (state == OUT){
state = IN;
nw++;
}
if (cc == '*') {
if ((cc = getchar()) == '/')
break;
ungetc(cc, stdin);
}
nc++;
}
}
}
}
printf("Character Counts: %d\n", nc);
printf("Word Counts: %d\n", nw);
return 0;
}
program to count commented characters and words in a C file
it skips the blocked comments (/.../)
I recommend, at a minimum, to parse code and look for 5 states: normal, in a // comment, in a /* comment, in a "" string literal, in a '' character constant.
// pseudo code
while ((ch = getchar()) != EOF) {
if ch == '/' and next == '/', process `//` comment until end-of-line
elseif ch '/' and next == '*', process `/*` comment until `*/`
elseif ch '"', process string until " (but not \")
elseif ch ''', process constant until ' (but not \')
else process normally
}
To look at the next character, call getchar() and then ungetc() if not as expected.

Switch Case to count amount of words and characters

the output window commandI am trying to use switch case in C to figure out the amount of characters, words, newlines in a user input. The code seems legit, no errors raised, however, the output does not work as expected. Please take a look and tell me what I did wrong. Thanks in advance! Here is the code:
#include <stdio.h>
int main()
{
char a, words = 1, characters = 0, newlines = 0;
printf("What do you have in mind? ");
a = getchar();
while ((a=getchar()) && a != EOF)
{
switch (a)
{
case '1':
if (a >= 'a' && a <= 'z' || a >= 'A' && a <= 'Z')
characters++;
printf("The amount of character is %c ", characters);
case '2':
if (a == ' ' || a == '\t')
words++;
printf("The amount of word is %c ", words);
case '3':
if (a == '\t')
newlines++;
printf("The amount of newlines is %c ", newlines);
default:
if (a == EOF)
break;
}
}
return 0;
}
you misunderstand what switch /case means. It does not means 'first case','second case' .. of some conditions, it means (in your specific situation), if the user just typed '1' then do this, it the users just typed '2' then do this,... well you are not typing 1 or 2 or 3.
Simply do this
while ((a=getchar()) && a != EOF)
{
if (a >= 'a' && a <= 'z' || a >= 'A' && a <= 'Z')
characters++;
printf("The amount of character is %c ", characters);
if (a == ' ' || a == '\t')
words++;
printf("The amount of word is %c ", words);
if (a == '\t')
newlines++;
printf("The amount of newlines is %c ", newlines);
}
Also you must change a to be an int
#include<stdio.h>
int
main ()
{
int ch_count = 0, line_count = 0, word_count = 0, choice;
char ch;
FILE *fp;
fp = fopen ("wordcount.txt", "r"); //make a seperate file "wordcount.txt" and Read the Test content from it.
if (fp == NULL) // Show error if previous step is not done i.e wordcount.txt file is not made.
{
perror ("FILE NOT FOUND.");
return (-1);
}
else // If file is opened properly then ask for NO. of counts.
{
printf ("Select the Following Option-------\n"
"1 - Number of Characters\n"
"2 - Number of Words\n"
"3 - Number of Lines\n");
scanf ("%d", &choice);
}
{
switch (choice) // switch to desired case as per choice of user.
{
case 1: // CASE 1 - To count the characters in the file.
{
while ((ch = fgetc (fp)) && ch != EOF)
if (ch >= 'a' && ch <= 'z' || ch >= 'A' && ch <= 'Z')
ch_count++;
}
printf ("Number of Characters : %d\n", ch_count);
break;
case 2: // CASE 2: To count total number of words
while ((ch = fgetc (fp)) && ch != EOF)
if ((ch == ' ') || (ch == '\t') || (ch == '\n') || (ch == '\0'))
{
word_count++;
}
printf ("Numbers of Words : %d\n", word_count);
break;
case 3: // CASE 3: To count total number of lines
while ((ch = fgetc (fp)) && ch != EOF)
if ((ch == '\n') || (ch == '\0'))
{
line_count++;
}
printf ("Numbers of lines : %d\n", line_count);
break;
} //switch closed
}
fclose (fp); //file closed
return 0;
}

How to check if a character is inside the scope of a comment in C [closed]

Closed. This question needs to be more focused. It is not currently accepting answers.
Want to improve this question? Update the question so it focuses on one problem only by editing this post.
Closed 5 years ago.
Improve this question
I have to write a program that counts the number of times an operator that returns the address of a variable (&) is found inside a file.
I use this simple loop to do so (do not mind the !feof(p) that raises some questions):
while (!feof(p)){
c = fgetc(p);
if (c=='&') n++; }
However, this does not satisfy my needs. For instance, if an AND operator (&&) is found, my loop will increase the variable "n" twice but it mustn't even once. Another thing is that if the & operator is found in the scope of a single or multi-line comment it should not be counted.
My question is how can I make sure the given character/string (in my case "&" operator) is in a comment or not? And how to make sure it is indeed a "&" operator and not a part of a "&&" or a string?
As been mentioned in the comments, this is not a trivial task that could be written with a few lines of code. What you need is a parser. That parser needs to handle many different cases. Here is a (probably non-exhaustive) list:
One line comments: // This is a comment
Multiline comments: /* This is a comment */
Characters: char c='&'
String literals: strcmp(str, "A string with a & in it")
The bitwise operator: int a = mask & b
You would also need to decide how to handle incorrect input. Should the program be able to detect incorrect c code, or should it assume all input is correct? Another thing to consider is how to handle #include. Do you want to count the number of occurrences in the included files too? (I assume not, but this demonstrates a problem)
If you want it to 100% accurate in finding only the address operator, then it is way above your knowledge. (OP wrote "This is a problem is designed to be solved by 1st-semester students with only basic knowledge." in comment below)
If you're allowed to cut some corners there are easier ways.
Here is a complete example that cut some corners. It handles comments and strings, including escaped characters. However, it does not handle the bitwise operator.
#include <stdio.h>
#include <stdlib.h>
#define INPUT "input.c"
int main()
{
FILE *f;
if ((f = fopen(INPUT, "r")) == NULL)
{
perror (INPUT);
return (EXIT_FAILURE);
}
char c, p=0;
int n=0;
while((c = fgetc(f)) != EOF)
{
if(c == '/' && p == '/') {
while((c = fgetc(f)) != EOF) {
// If we read // then we throw away the rest of the line
if( c == '\n' ) {
break;
}
}
if( c == EOF) {
goto end;
}
}
else if(c == '*' && p == '/') {
// If we read /* then we throw away everything until we have read */
while((c = getc(f)) != EOF) {
if( c == '*' ) {
if((c = getc(f)) != EOF)
if( c == '/')
break;
}
} if ( c == EOF) {
goto end;
}
}
else if(c == '"') {
// Read until end of string
while((c = getc(f)) != EOF) {
if(c == '\\') {
if((c = getc(f)) == EOF)
goto end;
}
else if(c == '"')
break;
}
}
else if(c == '\'') {
while((c = getc(f)) != EOF) {
if(c == '\\') {
if((c = getc(f)) == EOF)
goto end;
}
else if(c == '\'')
break;
} if ( c == EOF)
goto end;
}
else if(c == '&') {
printf("hej");
if(p == '&')
n--;
else
n++;
}
p=c;
}
end:
printf("\n\nExited at pos %ld\n", ftell(f));
printf("Number of address operators: %d\n", n);
}
It works a bit like this: When it sees a start of a comment, it reads and throws away everything until the comment is finished or EOF. It does the same for strings.
On this input:
// Test &
/* Also
&
test */
// "
int main()
{
/* " //
*/
// /*
char str[]="hej&\"";
char c='&';
char k='\'';
int a, b;
int * p;
p=&a;
int c=a&b;
int q=a&&b;
}
// Test &
/* Also
&
test */
It reports the expected result 2. It would be better if it printed 1, but as I mentioned, it cannot handle the bitwise operator, thus counting it as an address operator. Fixing this issue would make things a lot more complicated.
And yes, I'm using goto since it is extremely convenient in a situation like this. In C++, I'd use exceptions, but that's not an option in C.
To cover all the cases in the C language would be pretty hard and you would need a proper parser probably, but if you only intend to use this for excersise - to make in work in the cases described in the question, you could implement something like this:
char previous = 0;
int single_line_comment = 0;
int multi_line_comment = 0;
int in_string = 0;
int in_char = 0;
while (!feof(p)){
c = fgetc(p);
if (c == '&' && !single_line_comment && !multi_line_comment && !in_string && !in_char)
{
if(previous == '&')
n--;
else
n++;
}
else if(c == '/' && prev == '/' && !multi_line_comment && !in_string && !in_char)
single_line_comment = 1;
else if(prev == '/' && c == '*' && !single_line_comment && !in_string && !in_char)
multi_line_comment = 1;
else if(c == '\n' && !multi_line_comment && !in_string && !in_char)
single_line_comment = 0;
else if(prev == '*' && c == '/' && !single_line_comment && !in_string && !in_char)
multi_line_comment = 0;
else if(c = '"' && !single_line_comment && !multi_line_comment && !in_char)
in_string = !in_string;
else if(c = '\'' && !single_line_comment && !multi_line_comment && !in_string)
in_char = !in_char;
previous = c;
}
Of course this is not a prefect solution, but could give an idea of how to overcome some of the problems.

c how to check the first char of an array

I am trying to take in 10 characters over a serial console and add them to an array called buffer. The first character needs to be 'L' or 'S' and the next characters either '1' or '0'.
The code passes the first if statement ok. But the line if((buffer[0] != 'L') || (buffer[0] != 'S')) doesn't seem to work even when I enter 'L' OR 'S'.
Is there anything wrong with using the buffer[0] != notation?
int main(void)
{
char ch;
char buffer[10] = "";
putstring("Enter 9 characters beginning with 'L' or 'S' and 8 digits\r\n");
for (int i = 0; i < 9; i++) {
ch = getcharacter();
if ((ch == '0') || (ch == '1') || (ch == 'L') || (ch == 'S')) {
buffer[i] = ch;
//check first character
if((buffer[0] != 'L') || (buffer[0] != 'S')) {
printf("First letter must be L or S\r\n");
goto error;
}
error:
return -1;
}
}
}
int getcharacter(void) {
char c = 0;
const uint32_t recieve_ready = 1 << 7;
//disable interrupt for a read ready
*uart_control_reg = 0;
while (1) {
//check if RRDY bit is set
if ((*uart_status_reg) & recieve_ready) {
c = *uart_rxdata_reg;
break;
}
}
return ((char) c);
}
if((buffer[0] != 'L') || (buffer[0] != 'S'))
is wrong, you need
if((buffer[0] != 'L') && (buffer[0] != 'S'))
or
if (!(buffer[0] == 'L' || buffer[0] == 'S'))
Your original code was "if the char is not L or the char is not S" which is always true. If the char is L, then the second part was true, making the whole if statement true.
Just noticed Chris Turner's comment above. The return -1 is always executed, move it to replace the line that says goto error.
Try using
if((buffer[0] != 'L') && (buffer[0] != 'S'))
instead of
if((buffer[0] != 'L') || (buffer[0] != 'S'))
Just some logic problem here. According to your code, the char needs to be equal to 'L' AND 'S' to avoid the condition, which is never the case !

Brainfuck interpreter in c printing trouble

I'm trying to code a very simple brainfuck interpreter in C, and I run into problems while trying to outprint certain characters by what I understand.
This is all my code:
#include <stdio.h>
int bla(char tabukaz[30000], int ukaz, int num) {
int sum = 0;
int index = ukaz;
while (sum > -1) {
index -= num;
if (tabukaz[index] == ']')
sum += num;
else if (tabukaz[index] == '[')
sum -= num;
}
return index;
}
int main () {
int tab[30000];
int tabukaz[30000];
int c;
int i = 0; int ukaz = 0;
unsigned char ch;
for (int i = 0; i < 30000; i++) {
tab[i] = 0;
tabukaz[i] = 0;
}
while ((c=getchar()) != EOF) {
ch = (unsigned char)c;
if (ch == '>' || ch == '<' || ch == '+' || ch == '-' || ch == '.' || ch == '[' || ch == ']')
{
tabukaz[ukaz] = ch;
}
switch (ch) {
case '>': i++; break;
case '<': i--; break;
case '+': tab[i]++;break;
case '-': tab[i]--; break;
case '.': putchar(tab[i]); break;
case '[':
if (tab[i]==0) {
ukaz = bla(tabukaz, ukaz, -1);
}
break;
case ']':
if (tab[i]!=0) {
ukaz = bla(tabukaz, ukaz, 1);
}
break;
default:
break;
}
ukaz++;
}
return 0;
}
This is the input in question (I tried to avoid the other text in the actual input (keep in mind everything down here is part of the input, even the unnecessary text) We were provided with a make file which will write the output into a text file, and compare it to a predefined text, the problem is my text file comes out as a binary file and I cant figure out why. The problem may be hidden in how I handle [ and ] as I didn't have that problem in the earlier tests without them
+++++ +++++ initialize counter (cell #0) to 10
[ use loop to set 70/100/30/10
> +++++ ++ add 7 to cell #1
> +++++ +++++ add 10 to cell #2
> +++ add 3 to cell #3
> + add 1 to cell #4
<<<< - decrement counter (cell #0)
]
> ++ . print 'H'
> + . print 'e'
+++++ ++ . print 'l'
. print 'l'
+++ . print 'o'
> ++ . print ' '
<< +++++ +++++ +++++ . print 'W'
> . print 'o'
+++ . print 'r'
----- - . print 'l'
----- --- . print 'd'
> + . print '!'
> . print '\n'
As a suggestion made by somebody I did this:
while ((c=getchar())!=EOF) {
ch = (unsigned char)c;
if (ch == '>' || ch == '<' || ch == '+' || ch == '-' || ch == '.' || ch == '[' || ch == ']')
{
tabukaz[ukaz]=ch;
stukaz++;
}
}
while (stukaz>0) {
switch (tabukaz[ukaz]) {
case '>': i++; break;
case '<': i--; break;
case '+': if(tab[i]==255) tab[i] = 0;
else tab[i]++;
break;
case '-': if (tab[i]==0) tab[i] = 255;
else tab[i]--;
break;
case '.': printf ("%c", tab[i]); break;
case '[':
if (tab[i]==0) {
ukaz = bla(tabukaz, ukaz, -1);
}
break;
case ']':
if (tab[i]!=0) {
ukaz = bla(tabukaz, ukaz, 1);
}
break;
default: break;
}
stukaz--;
ukaz++;
}
However the problem now extends to the tests before that, as it even outputs those as binary files, I'm thinking there's something wrong with the [ and ] code and thus it doesn't increment the fields properly printing unwanted characters, how this extended to tests without them only when putting another loop around it I have no idea.
EDIT: the problem with the above loop is not the while loop not going trough, the problem is that it will never get into the switch, any solution to that?
The test is wrong while scanning for a matching bracket.
while (sum > -1) {
index -= num;
if (tabukaz[index] == ']')
sum += num;
else if (tabukaz[index] == '[')
sum -= num;
}
You set num to 1 for a backwards scan, but to -1 for a forward scan.
case '[':
if (tab[i]==0) {
ukaz = bla(tabukaz, ukaz, -1);
}
break;
case ']':
if (tab[i]!=0) {
ukaz = bla(tabukaz, ukaz, 1);
}
break;
So the test should be (sum != 0) so it stops when the brackets are balanced from either direction. Of course, sum is initialized to 0, so I recommend a do { ... } while(); loop so the test is at the end.
Also, remember that you're incrementing the instruction pointer at the end of the loop.
ukaz++;
So you may want to set the pointer to bla(...) - 1, so the increment puts the pointer in the correct place.
You might also like to look at my own brainfuck interpreter, which is very similar to yours. One of my reviewers there gives an excellent explanation of optimizing the loop execution with a jump table.

Resources