how can i parse this file in c - c

how can split the word from its meaning
1. mammoth: large
My code:
void ReadFromFile(){
FILE *dictionary = fopen("dictionary.txt", "r");
char word[20];
char meaning[50];
while(fscanf(dictionary, "%[^:]:%[^\t]\t", word, meaning) == 2){
printf("%s %s\n", word, meaning);
}
fclose(dictionary);

Assuming the word and the meaning do not contain digits and dots,
my approach is the following:
First, split the input line on the digits and dots into the tokens which
have the form as word: meaning.
Next separate each token on the colon character.
As a finish up, remove the leading and trailing blank characters.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define INFILE "dictionary.txt"
void split(char *str);
void separate(char *str);
char *trim(char *str);
/*
* split line on serial number into "word" and "meaning" pairs
* WARNING: the array of "str" is modified
*/
void
split(char *str)
{
char *tk; // pointer to each token
char delim[] = "0123456789."; // characters used in the serial number
tk = strtok(str, delim); // get the first token
while (tk != NULL) {
separate(tk); // separate each token
tk = strtok(NULL, delim); // get the next token
}
}
/*
* separate the pair into "word" and "meaning" and print them
*/
void
separate(char *str)
{
char *p;
if (NULL == (p = index(str, ':'))) {
// search a colon character in "str"
fprintf(stderr, "Illegal format: %s\n", str);
exit(1);
}
*p++ = '\0'; // terminate the "word" string
// now "p" points to the start of "meaning"
printf("%s %s\n", trim(str), trim(p));
}
/*
* remove leading and trailing whitespaces
* WARNING: the array of "str" is modified
*/
char *
trim(char *str)
{
char *p;
for (p = str; *p != '\0'; p++); // jump to the end of "str"
for (; p > str && (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n' || *p == '\0'); p--);
// rewind the pointer skipping blanks
*++p = '\0'; // chop the trailing blanks off
for (p = str; *p != '\0' && (*p == ' ' || *p == '\t' || *p == '\r' || *p == '\n'); p++);
// skip leading blanks
return p;
}
int
main()
{
FILE *fp;
char str[BUFSIZ];
if (NULL == (fp = fopen(INFILE, "r"))) {
perror(INFILE);
exit(1);
}
while (NULL != fgets(str, BUFSIZ, fp)) {
split(trim(str));
}
fclose(fp);
return 0;
}
Output:
foe enemy
vast huge
purchase buy
drowsy sleepy
absent missing
prank trick
[snip]
[Alternative]
I suppose C may not be a suitable language for this kind of string manipulations. High-level languages such as python, perl or ruby will solve it with much fewer codes. Here is an example with python which will produce the same results:
import re
with open("dictionary.txt") as f:
s = f.read()
for m in re.finditer(r'\d+\.\s*(.+?):\s*(\S+)', s):
print(m.group(1) + " " + m.group(2))

Related

How do I strcmp specifically for newline in C?

#include <stdio.h>
#include <string.h>
int main() {
int counter1, counter2;
char line[200] = ""; //store all words that don't need to be deleted
char deleteWord[100]; //word that needs to be deleted
char space;
char word[100];
scanf("%s", deleteWord);
while (1) {
scanf("%s", word);
if (feof(stdin))
break;
// increment counter of total words
++counter1;
if (strcmp(word, deleteWord) == 0) {
// see if the word read in == delete word
// increment counter of deleted words
++counter2;
} else
if (strcmp(word, " ") == 0) { // space is an actual space
strcat(line, word);
strcat(line, " ");
} else
if (strcmp(word, "\n")) { // space a new line \n
strcat(line, word);
strcat(line, "\n");
}
}
printf("--NEW TEXT--\n%s", line);
return 0;
}
In summary, my code is supposed to remove a user input string (one or more words) from another user input string (containing or not containing the word(s)) and produce the output. The code removes the word but it adds a newline per word for each iteration. I believe it is doing this because the expression for the second else if is always true. However, when I properly add the strcmp function for the second else if statement, the code does not produce an output at all (no compiler errors - just missing input). Why is this happening and how do I do a strcmp function for a newline?
Your read the words with scanf("%s", word), which poses these problems:
all white space is ignored, so you cannot test for spaces nor newlines as you try to do in the loop, and you cannot keep track of line breaks.
you should tell scanf() the maximum number of bytes to store into the destination array word, otherwise any word longer than 99 characters will cause a buffer overflow and invoke undefined behavior.
you should test the return value of scanf() instead of callin feof() which might be true after the last word has been successfully read. You should simply write the loop as
while (scanf("%99s", word) == 1) {
// increment counter of total words
++counter1;
...
you do not test if the words fit in the line array either, causing a buffer overflow if the words kept amount to more than 199 characters including separators.
To delete a specific word from a stream, you could read one line at a time and delete the matching words from the line:
#include <ctype.h>
#include <stdio.h>
#include <string.h>
int main() {
char deleteWord[100]; //word that needs to be deleted
char line[1000]; //store all words that don't need to be deleted
printf("Enter the word to remove: ");
if (scanf("%99s", deleteWord) != 1)
return 1;
// read and discard the rest of the input line
int c;
while ((c = getchar()) != EOF && c != '\n')
continue;
size_t len = strlen(deleteWord);
printf("Enter the text: ");
while (fgets(line, sizeof line, stdin)) {
char *p = line;
char *q;
while ((p = strstr(p, deleteWord)) != NULL) {
if ((p == line || isspace((unsigned char)p[-1]))
&& (p[len] == '\0' || isspace((unsigned char)p[len]))) {
/* remove the word */
memmove(p, p + len, strlen(p + len) + 1);
} else {
p += len;
}
}
/* squeeze sequences of spaces as a single space */
for (p = q = line + 1; *p; p++) {
if (*p != ' ' || p[-1] != ' ')
*q++ = *p;
}
*q = '\0';
fputs(line, stdout);
}
return 0;
}

String parsing in C... strtok() doesn't quite get it done

OK... first question so please forgive me if it isn't quite understandable the first go.
I am attempting to parse a string input to stdin through a couple of different conditions.
Example input string: move this into "tokens that I need" \n
I would like to parse this into tokens as:
Token 1 = move
Token 2 = this
Token 3 = into
Token 4 = tokens that I need
Where the tokens are by whitespace (easy enough) until a quote is encountered, then everything inside of the open and close quotes is treated as a single token.
I've tried several different methods, but I unfortunately feel that I may be in over my head here so any help would be greatly appreciated.
My latest attempt:
fgets(input, BUFLEN, stdin); //gets the input
input[strlen(input)-1] = '\0';//removes the new line
printf("Input string = %s\n",input);//Just prints it out for me to see
char *token = strtok(input,delim);//Tokenizes the input, which unfortunately does not do what I need. delim is just my string of delimiters which currently only has a " " in it.
I tried to scan through the string one character at a time and then place those characters into arrays so that I could have them as I wanted, but that failed miserably.
The ultimate solution with customized version of my_strtok_r is here. This solution has advantage over solution with non re-entrant: strtok.
my_strtok_r is re-entrant: you can call them from multiple threads simultaneously, or in nested loops, et cetera.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
char * my_strtok_r(char *s, const char *delim1, const char *delim2, char **save_ptr)
{
char *end;
size_t s1;
size_t s2;
int delim2found = 0;
if (s == NULL)
s = *save_ptr;
if (*s == '\0'){
*save_ptr = s;
return NULL;
}
s1 = strspn (s, delim1);
s2 = strspn (s, delim2);
if(s2 > s1){
s += s2;
delim2found = 1;
}
else{
s += s1;
}
if (*s == '\0'){
*save_ptr = s;
return NULL;
}
/* Find the end of the token. */
if(delim2found)
end = s + strcspn (s, delim2);
else
end = s + strcspn (s, delim1);
if (*end == '\0') {
*save_ptr = end;
return s;
}
/* Terminate the token and make *save_ptr point past it. */
*end = '\0';
*save_ptr = end + 1;
return s;
}
int main (void)
{
char str[] = " 123 abc \"SPLITTING WORKS\" yes! \"GREAT WE HAVE A SOLUTION\" ! ";
char *d1 = " ";
char *d2 = "\"";
char *token;
char *rest = str;
char array[20][80];
printf ("Splitting string \"%s\" into tokens:\n",str);
size_t nr_of_tokens = 0;
while ((token = my_strtok_r(rest, d1, d2, &rest)))
{
strcpy (array[nr_of_tokens], token);
nr_of_tokens++;
}
for(int i=0; i < nr_of_tokens; i++)
printf ("%s\n",array[i]);
return 0;
}
Test:
Splitting string " 123 abc "SPLITING WORKS" yes! "GREAT WE HAVE A SOLUTION" ! " into tokens:
123
abc
SPLITTING WORKS
yes!
GREAT WE HAVE A SOLUTION
!
This is another solution (fully tested) which you can use. You can mix any number of tokens delimited by white spaces and '\"'. It can be configured to your needs. Extensive explanations are given in the code itself.
#include<stdio.h>
#include<stdlib.h>
#include<string.h>
#include <ctype.h>
char *get_str_segment(char *output_str, char *input_str, char extDel)
{
/*
Purpose :
To copy to output first segment.
To extract the segment two types of delimiters are used:
1. white space delimiter
2. 'extDel' -
do not put here white space or '\0'!
(typicaly '"' = quote!)
'extDel' allows us to put white spaces inside the segment.
Notice that 'extDel' cannot be embedded inside the segment!
It makes 'extDel' special character which will not be encountered
in the 'output_string'! First appearance of 'extDel' starts new
segment!
Notice that unbalanced 'extDel' will cause coping whole string to
destination from that point!
Return:
Pointer to the first character after the segment
or NULL !!!
we will not allow **empty** segments with unbalanced 'extDel'
if ('extDel' is unbalanced) it has to have at list one character!
It can be white space!
Notice!
"get_str_segment()" on strings filed with white spaces
and empty strings will return *** NULL *** to indicate that
no conclusive segment has been found!
Example:
input_str = " qwerty"123 45" "samuel" G7 "
output_str = ""
// Call:
char *ptr = get_str_segment(output_str,input_str,'"');
Result:
input_str = " qwerty"123 45" "samuel" G7 "
^
|
ptr----------------------.
output_str = "qwerty"
*/
char *s = input_str;
char *d = output_str;
char i = 0;
if(!s) return NULL; // rule #1 our code never brakes!
if(!d) return NULL;
// eliminate white spaces from front of the block
while(1)
{
if ( *s == '\0')
{
*d = '\0' ; // end the output string
return (NULL) ; // return NULL to indicate that no
// coping has been done.
//
//
// "get_str_segment()" on
// strings filed with white spaces
// and empty strings
// will return NULL to indicate that
// no conclusive segment has been found
//
}
if (isspace(*s)) ++s; // move pointer to next char
else break; // break the loop!
}
// we found first non white character!
if( *s != extDel)
{
// copy block up to end of string first white space or extDel
while( ((*s) != '\0') && (!isspace(*s)) && ((*s) != extDel) )
{
*d = *s; // copy segment characters
++s;
++d;
}
*d = '\0' ; // end the destination string
return (s); // return pointer to end of the string ||
// trailing white space ||
// 'extDel' char
}
else // It is 'extDel' character !
{
++s; // skip opening 'extDel'
while( ((*s) != '\0') && ((*s) != extDel) )
{
i=1; // we executed loop at list one time
*d = *s; // copy segment characters till '\0' or extDel
++s;
++d;
}
*d = '\0' ; // end the destination string
if( *s == extDel ) ++s; // skip *closing* 'extDel'
else
{
// unbalanced 'extDel'!
printf("WARNING:get_str_segment: unbalanced '%c' encountered!\n",extDel);
if (i==0) return NULL; // we will not allow
// **empty** unbalanced segments 'extDel'
// if ('extDel' is unbalanced) it has to have at list one character!
// It can be white space!
}
return (s); // return pointer to next char after 'extDel'
// ( It can be '\0')
// if it is '\0' next pass will return 'NULL'!
}
}
int parse_line_to_table(int firstDim, int secondDim, char *table, char * line, char separator)
{
// Purpose:
// Parse 'line' to 'table'
// Function returns: number of segments
// 'table' has to be passed from outside
char* p;
int i;
if(!table) return (-1);
// parse segments to 'table':
if(line)
{
p = line; // A necessary initialization!
for(i=0; i<firstDim; i++)
{
p = get_str_segment( table+i*secondDim , p , separator );
if(p==NULL) break;
}
}
else
return (-1);
// debug only
// for(int j=0; j<i; j++) { printf(" i=%d %s",j, table+j*secondDim ); }
// printf("\n");
return (i); // notice that i is post incremented
}
int main(void)
{
char table[20][80];
char *line = "move this into \"tokens that I need\"";
int ret = parse_line_to_table(20, 80, table, line, '\"');
for(int i = 0; i < ret; i++ )
printf("%s\n",table[i]);
return 0;
}
Output:
move
this
into
tokens that I need

Parsing array of chars into ***char

I've a array of chars as a result of gets() (which is a command inputed to my shell), for example "ls -l \ | sort". Now what I want to have is a char*** that would hold pointers to particular commands (so split by |). With my example:
*1[] = {"ls", "-l", "\", null}
*2[] = {"sort", null}
and my char*** would be {1,2}. The thing is, I don't know how many strings will be given to me in this array of characters, so I can't predefine that. What I have now is just splitting the array of chars into words by whitespaces and I can't figure out how to do what I actually need.
Also in my input/output above the function should react the same to "ls -l \|sort" and "ls -l \ | sort"
My code so far:
int parse(char *line, char **argv)
{
while (*line != '\0') {
while (*line == ' ' || *line == '\t' || *line == '\n')
*line++ = '\0';
*argv++ = line;
while (*line != '\0' && *line != ' ' && *line != '\t' && *line != '\n'){
line++;
}
}
*argv = '\0';
return 0;
}
To split a line, use strtok(). You can use this function with | as delimiter and then with :
#include <string.h>
int parse(char *line, char **argv, const char *delim)
{
int word_counter = 0
/* get the first word */
char *word = strtok(line, delim);
/* walk through other words */
while (word != NULL)
{
printf(" %s\n", word);
word_counter++;
/* save word somewhere */
word = strtok(NULL, delim);
}
printf("This string contains %d words separated with %s\n",word_counter,delim);
}

Seg fault while parsing a line into ***char

My code is supposed to parse an array of chars into ***char, so that it splits it first by '|' char and then by whitespaces, newline characters etc into words. Sample i/o:
I = ls -l | sort | unique
O =
*cmds[1] = {"ls", "-l", NULL};
*cmds[2] = {"sort", NULL};
*cmds[3] = {"unique", NULL};
above are pointers to char arrays, so split by words and then below is ***char with pointers to above pointers
char **cmds[] = {1, 2, 3, NULL};
Now, I don't see my mistake (probably because I am not so skilled in C), but program gives segfault the second I call parse(..) function from inside parsePipe(). Can anyone please help?
void parse(char *line, char **argv)
{
while (*line != '\0') {
while (*line == ' ' || *line == '\t' || *line == '\n')
*line++ = '\0';
*argv++ = line;
while (*line != '\0' && *line != ' ' && *line != '\t' && *line != '\n'){
line++;
}
}
*argv = '\0';
}
void parsePipe(char *line, char ***cmds)
{
char *cmd = strtok(line, "|");
int word_counter = 0;
while (cmd != NULL)
{
printf("Printing word -> %s\n", cmd);
word_counter++;
parse(cmd, *cmds++);
cmd = strtok(NULL, "|");
}
printf("This string contains %d words separated with |\n",word_counter);
}
void main(void)
{
char line[1024];
char **cmds[64];
while (1) {
printf("lsh -> ");
gets(line);
printf("\n");
parsePipe(line, cmds);
}
}
[too long for a comment]
This line
*argv++ = line; /* with char ** argv */
refers to invalid memory, as the code does *argv[n] (with char **argv[64]) which refers nothing.
The namings you use do not make live easier.
Try the following naming:
void parse(char *line, char **cmd)
{
while (*line != '\0') {
while (*line == ' ' || *line == '\t' || *line == '\n')
*line++ = '\0';
*cmd++ = line;
while (*line != '\0' && *line != ' ' && *line != '\t' && *line != '\n'){
line++;
}
}
*argv = '\0';
}
void parsePipe(char *line, char ***cmdline)
{
char *cmd = strtok(line, "|");
int word_counter = 0;
while (cmd != NULL)
{
printf("Printing word -> %s\n", cmd);
word_counter++;
parse(cmd, *cmdline++);
cmd = strtok(NULL, "|");
}
printf("This string contains %d words separated with |\n",word_counter);
}
void main(void)
{
char line[1024];
char **cmdline[64];
while (1) {
printf("lsh -> ");
gets(line);
printf("\n");
parsePipe(line, cmdline);
}
}
For none of the cmds used memory had been allocated.
So
*cmd++ = line;
fails, as cmd points nowhere, but gets dereferenced and the code tries to write to where it's pointing, which is nowhere, that is invalid memory.
Fixing this can be done by passing char*** to parse(char *** pcmd) and counting the tokens found
size_t nlines = 0;
...
++nlines.
and the doing a
*pcmd = realloc(*pcmd, nlines + 1); /* Allocate one more as needed to later find the end of the array. */
(*pcmd)[nlines -1] = line;
(*pcmd)[nlines] = NULL; /* Initialise the stopper, marking the end of the array. */
for each token found.
Obviously you need to call it like this:
parse(cmd, cmdline++);
To have all this work the inital array needs to initialised properly (as you should have done anyway):
char **cmdline[64] = {0};

Parsing command line statements as a list of tokens

#include <stdio.h>
#include <string.h> /* needed for strtok */
#include <unistd.h>
#include <stdlib.h>
int main(int argc, char **argv) {
char text[10000];
fgets(text, sizeof(text), stdin);
char *t;
int i;
t = strtok(text, "\"\'| ");
for (i=0; t != NULL; i++) {
printf("token %d is \"%s\"\n", i, t);
t = strtok(NULL, "\"\'| ");
}
}
This is part of the code that im trying to make it is supposed to separate tokens
Let's say the input is 'abc' "de f'g" hij| k "lm | no"
The output should be
token 1: "abc"
token 2: "de f'g"
token 3: "hij"
token 4: "|"
token 5: "k"
token 6: "lm | no"
I get something different but close anyway I can change it to this format?
What you're trying to do is essentially a parser. strtok isn't a very good tool for this, and you may have better luck writing your own. strtok works on the presumption that whatever delimits your tokens is unimportant and so can be overwritten with '\0'. But you DO care what the delimiter is.
The only problem you'll have is that | syntax. The fact that you want to use it as a token delimiter and a token is likely to make your code more complicated (but not too much). Here, you have the issue that hij is followed immediately by |. If you terminate hij to get the token, you will have to overwrite the |. You either have to store the overwritten character and restore it, or copy the string out somewhere else.
You basically have three cases:
The bar | is a special delimiter that is also a token;
Quoted delimiters " and ' match everything until the next quote of the same kind;
Otherwise, tokens are delimited by whitespace.
#include <stdio.h>
#include <string.h>
char *getToken(char **sp){
static const char *sep = " \t\n";
static char vb[] = "|", vbf;
char *p, *s;
if(vbf){
vbf = 0;
return vb;
}
if (sp == NULL || *sp == NULL || **sp == '\0') return(NULL);
s = *sp;
if(*s == '"')
p = strchr(++s, '"');
else if(*s == '\'')
p = strchr(++s, '\'');
else
p = s + strcspn(s, "| \t\n");
if(*p != '\0'){
if(*p == '|'){
*vb = vbf = '|';
}
*p++ = '\0';
p += strspn(p, sep);
}
*sp = p;
if(!*s){
vbf = 0;
return vb;
}
return s;
}
int main(int argc, char **argv) {
char text[10000];
fgets(text, sizeof(text), stdin);
char *t, *p = text;
int i;
t = getToken(&p);
for (i=1; t != NULL; i++) {
printf("token %d is \"%s\"\n", i, t);
t = getToken(&p);
}
return 0;
}

Resources