Extract string pattern in C (no libs) - c

I'm reading a file with the following format:
/* ...more text above */
[Text=WORKING CharacterOffsetBegin=73516 CharacterOffsetEnd=73523 PartOfSpeech=VBG
Lemma=work] [Text=MEN CharacterOffsetBegin=73524 CharacterOffsetEnd=73527
PartOfSpeech=NNS Lemma=man] [Text=OF CharacterOffsetBegin=73528
CharacterOffsetEnd=73530 PartOfSpeech=IN Lemma=of] [Text=ALL
CharacterOffsetBegin=73531 CharacterOffsetEnd=73534 PartOfSpeech=NN Lemma=all]
[Text=COUNTRIES CharacterOffsetBegin=73535 CharacterOffsetEnd=73544 PartOfSpeech=NNS
Lemma=country] [Text=, CharacterOffsetBegin=73544 CharacterOffsetEnd=73545
PartOfSpeech=, Lemma=,] [Text=UNITE CharacterOffsetBegin=73546
CharacterOffsetEnd=73551 PartOfSpeech=VB Lemma=unite] [Text=!
CharacterOffsetBegin=73551 CharacterOffsetEnd=73552 PartOfSpeech=. Lemma=!]
/* ...more text below */
What I want to do is extract the strings given by Text= and Lemma= into an array. For example, for the text above the output would be:
WORKING
work
MEN
man
OF
of
And so on. What I've tried:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define MAX_LINE 4096
/*
 * Asker's attempt: read "moby.txt.out" line by line and print the word that
 * follows "Lemma=" whenever a line can be scanned with that pattern.
 */
int main()
{
FILE *fp;
fp = fopen("moby.txt.out", "r");
/* NOTE(review): neither the fopen() nor the malloc() result is checked for NULL. */
char *linha = malloc(MAX_LINE);
int s, t;
char lemma[100];
/* fgets() reads at most MAX_LINE - 1 characters per call into linha. */
while(fgets(linha, MAX_LINE, fp))
{
/* Any non-zero sscanf() result takes this branch; its body is intentionally empty. */
if(sscanf(linha, "Sentence #%d (%d tokens):", &s, &t))
{
/*printf("%d\n", t);*/
}
/*
 * sscanf() matches its format from the first character of linha, so this
 * only succeeds when the line itself begins with "Lemma=".
 * NOTE(review): "%s" carries no field width, so nothing bounds the write
 * into the 100-byte lemma buffer.
 */
else if(sscanf(linha, "Lemma=%s", lemma))
{
printf("%s", lemma);
}
}
/* NOTE(review): linha is not freed before returning. */
fclose(fp);
return 0;
}
I can't use external libs.
I know regex is not part of C language, so any help is welcome.
Thanks!

And anyways you don't even need regexes. Nor scanf(). A simple solution is using strstr().
/*
 * Walk a list of "[Text=... Lemma=...]" records and print, for each record,
 * the Text value followed by the Lemma value, one per line.
 * Scanning stops at the first record that is incomplete (missing the space
 * after the Text value, the "Lemma=" key, or the closing ']').
 */
const char *s = "[Text=WORKING CharacterOffsetBegin=73516 CharacterOffsetEnd=73523 PartOfSpeech=VBG \
Lemma=work] [Text=MEN CharacterOffsetBegin=73524 CharacterOffsetEnd=73527 \
PartOfSpeech=NNS Lemma=man]";
const char *p = s;
while ((p = strstr(p, "Text=")) != NULL) {
    p += 5;                                 /* skip past "Text=" */
    const char *t = strchr(p, ' ');         /* the Text value ends at the next space */
    if (t == NULL)
        break;                              /* truncated record: no space after the value */
    printf("%.*s\n", (int)(t - p), p);

    p = strstr(t + 1, "Lemma=");
    if (p == NULL)
        break;                              /* record has no Lemma field */
    p += 6;                                 /* skip past "Lemma=" */
    t = strchr(p, ']');                     /* the Lemma value ends at the closing bracket */
    if (t == NULL)
        break;                              /* unterminated record */
    printf("%.*s\n", (int)(t - p), p);

    p = t + 1;                              /* resume the search after this record */
}

Related

How to split with multiple delimiters in C

I have this line of text:
32+-#3#2-#3#3
I need to separate numbers from each other. So basically the result would be like this:
3
2+-
3
2-
3
3
This is my code but it's not working properly because I have numbers with two digits:
#include <stdio.h>
#include <string.h>
/*
 * Asker's attempt: split the input on '#' with strtok() and print every
 * token on its own line, prefixed by a space.
 * Because '#' is the only delimiter passed to strtok(), characters that are
 * not separated by '#' (such as the leading "32-") stay together in one token.
 */
int main(void) {
char string[50] = "32-#3#2-#3#3";
// Extract the first token
char *token = strtok(string, "#");
// loop through the string to extract all other tokens
while (token != NULL) {
printf(" %s\n", token); //printing each token
// Passing NULL makes strtok() continue in the string given on the first call
token = strtok(NULL, "#");
}
return 0;
}
You can't do it with strtok (alone), because there is no delimiter between the numbers you want to split. It's easier without strtok, just print what you want printed and add a separator unless a character which belongs to the token follows:
#include <stdio.h>
/*
 * Print each digit of the input together with any '+'/'-' characters that
 * directly follow it, one group per line. All other characters are skipped.
 */
int main()
{
    char string[] = "32+-#3#2-#3#3";
    size_t i = 0;

    while (string[i] != '\0') {
        const char current = string[i];
        const char following = string[i + 1];
        const int is_digit = (current >= '0' && current <= '9');
        const int is_sign = (current == '+' || current == '-');

        if (is_digit || is_sign) {
            putchar(current);
            /* The group ends unless another sign character comes next. */
            if (!(following == '+' || following == '-')) {
                putchar('\n');
            }
        }
        i++;
    }
}
If you consider this too easy, you can use a regular expression to match the tokens:
#include <stdio.h>
#include <regex.h>
/*
 * Tokenise the input with a POSIX basic regular expression: every match is
 * one digit followed by zero or more '+'/'-' characters, printed on its own
 * line.
 *
 * Returns 0 on success, 1 if the pattern cannot be compiled.
 */
int main()
{
    /* const: the pointer only ever walks over a string literal. */
    const char *string = "32+-#3#2-#3#3";
    regex_t reg;

    if (regcomp(&reg, "[0-9][+-]*", 0) != 0) {
        fputs("regcomp failed\n", stderr);
        return 1;
    }

    /* rm_eo starts at 0, so the first search begins at the start of the string. */
    regmatch_t match = {0};
    /* Each iteration resumes right after the end of the previous match. */
    while (regexec(&reg, string += match.rm_eo, 1, &match, 0) == 0) {
        printf("%.*s\n", (int)(match.rm_eo-match.rm_so), string+match.rm_so);
    }

    regfree(&reg);  /* release the storage allocated by regcomp() */
    return 0;
}
There is a simple way to achieve this, although it is a bit more complicated in C because there is no vector type as in C++. Here is a pure C implementation, which can still be improved:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Split src at every occurrence of the string pattern.
 *
 * src     - NUL-terminated string to split; it is not modified.
 * pattern - NUL-terminated separator string. An empty pattern never matches,
 *           so src is returned as a single piece.
 * outvec  - array that receives one malloc()ed, NUL-terminated copy per
 *           piece. The caller must supply enough slots and free() every
 *           piece. A slot is set to NULL if its allocation fails.
 * outsize - incremented once per piece stored; the caller must initialise it
 *           (normally to 0) before the call.
 */
void split_ss(const char* src,const char* pattern, char** outvec, size_t* outsize)
{
    const size_t pat_len = strlen(pattern);
    const char *cursor = src;

    for (;;) {
        /* An empty pattern would match forever without making progress. */
        const char *hit = (pat_len != 0) ? strstr(cursor, pattern) : NULL;
        const size_t size = (hit != NULL) ? (size_t)(hit - cursor) : strlen(cursor);

        /* One extra byte so the piece can be used as an ordinary C string. */
        char *piece = malloc(size + 1);
        if (piece != NULL) {
            memcpy(piece, cursor, size);
            piece[size] = '\0';
        }
        *outvec++ = piece;
        (*outsize) += 1;

        if (hit == NULL) {
            break;              /* that was the final piece */
        }
        cursor = hit + pat_len; /* continue after the separator */
    }
}
/*
 * Demo driver: split a sample string on "#", print every piece in brackets
 * and release the pieces afterwards.
 */
int main()
{
    char* outdata[64] = {0};
    /* split_ss() only increments the count, so it has to start at zero. */
    size_t size = 0;

    split_ss("32+-#3#2-#3#3", "#", outdata, &size);
    for (size_t i = 0; i < size; i++) {
        if (outdata[i] != NULL) {
            printf("[%s]\r\n", outdata[i]);
        }
        free(outdata[i]);   /* each piece was allocated by split_ss() */
    }
    return 0;
}
strstr is used so that the input can be split by a string rather than by a single character. The output is a poor man's 2D array, with an out-parameter holding the number of entries so that you can iterate over it; don't forget to free it.
strtok() is not the right tool for your purpose... As a matter of fact, strtok() is rarely the right tool for any purpose because of its tricky semantics and side effects.
A simple loop will do:
#include <stdio.h>
/*
 * Print every non-'#' character of the input on its own line, keeping any
 * '+'/'-' characters that immediately follow it on the same line.
 */
int main(void) {
    char string[50] = "32+-#3#2-#3#3";
    size_t pos = 0;

    while (string[pos] != '\0') {
        if (string[pos] != '#') {
            putchar(string[pos]);
            /* Absorb the run of sign characters that trails this character. */
            for (char sign = string[pos + 1];
                 sign == '+' || sign == '-';
                 sign = string[pos + 1]) {
                putchar(sign);
                pos++;
            }
            putchar('\n');
        }
        pos++;
    }
    return 0;
}

how to reverse the words in string with dot delimiter in c maintaining the dot in output

I want to reverse the words of a string that has dots between them, in C.
I tried using for loops in C.
#include <stdio.h>
#include <string.h>
int main()
{
char str[100];
scanf("%s",str);
printf("%s",str);
int length = strlen(str);
// Traverse string from end
int i;
for (i = length - 1; i >= 0; i--) {
if (str[i] == '.') {
// putting the NULL character at the
// position of space characters for
// next iteration.
str[i] = '.';
// Start from next charatcer
printf("%s ", &(str[i]) + 1);
}
}
// printing the last word
printf("%s", str);
return 0;
}
Example input
i.love.you
Example output
you.love.i
This is how I would do it (omitting error checking):
#include <stdio.h>
#include <string.h>
/*
 * Read one whitespace-delimited word from stdin and print its '.'-separated
 * fields in reverse order, still joined by '.'.
 *
 * Returns 0 on success, 1 if no word could be read.
 */
int main(int argc, char *argv[])
{
    char str[100];
    char *c;

    (void)argc;     /* command-line arguments are not used */
    (void)argv;

    /* The field width leaves room for the terminating NUL in str[100]. */
    if (scanf("%99s", str) != 1) {
        return 1;
    }

    /* Repeatedly cut off and print the last field until no '.' is left. */
    while ((c = strrchr(str, '.')) != NULL) {
        *c = '\0';
        printf("%s.", c + 1);
    }
    /* Whatever remains is the original first field. */
    printf("%s\n", str);
    return 0;
}
Output:
$ ./a.out
this.is.a.test.for.stackoverflow
stackoverflow.for.test.a.is.this
Well, there's two glaring errors:
You never actually NUL-terminate (not NULL) the string you're going to pass to printf. I suspect you copied that part from somewhere or somebody.
// putting the NULL character at the
// position of space characters for
// next iteration.
str[i] = '.';
Actually printing spaces instead of whatever delimiter:
printf("%s ", &(str[i]) + 1);
/* ^ */
What I'd do instead:
#include <stdio.h>
#include <string.h>
#include <assert.h>
#include <stdlib.h>
/*
 * Table of test vectors. Each entry pairs an input string with the result
 * expected once its '.'-separated fields have been reversed. The entries
 * include leading, trailing and consecutive dots, a field containing a
 * space, and the empty string.
 */
struct {
/* String to be reversed; stored in a writable array rather than as a pointer to a literal. */
char input[100];
/* Expected output for the corresponding input. */
char result[100];
}testcases[] = {
{.input = "i.love.you",
.result = "you.love.i"},
{.input = ".hi.you",
.result = "you.hi."},
{.input = "there.is.no spoon",
.result = "no spoon.is.there"},
{.input = "..",
.result = ".."},
{.input = "oh.no.",
.result = ".no.oh"},
{.input = "",
.result = ""}
};
/*
 * Return a newly allocated copy of s with its '.'-separated fields in
 * reverse order (for example "i.love.you" becomes "you.love.i").
 *
 * s is only read, never modified. The result has exactly the same length as
 * s and must be released with free(). Returns NULL if the allocation fails.
 */
char *function(char * s) {
    const size_t length = strlen(s);
    char * scratch = malloc(length + 1);
    if (scratch == NULL) {
        return NULL;
    }

    /* Offset at which the next field is appended to the output. */
    size_t offset = 0;
    /* Index one past the last character of the field not yet emitted. */
    size_t field_end = length;

    /* Scan backwards; every '.' closes the field that lies to its right. */
    for (size_t i = length; i-- > 0;) {
        if (s[i] != '.') {
            continue;
        }
        const size_t field_len = field_end - (i + 1);
        memcpy(scratch + offset, s + i + 1, field_len);
        offset += field_len;
        scratch[offset++] = '.';
        field_end = i;
    }

    /* Whatever precedes the first '.' (possibly nothing) goes last. */
    memcpy(scratch + offset, s, field_end);
    offset += field_end;
    scratch[offset] = '\0';
    return scratch;
}
/*
 * Run function() over every entry of testcases and assert that the output
 * equals the expected result, reporting each passing case.
 */
int main(void) {
    const size_t count = sizeof testcases / sizeof *testcases;

    for (size_t idx = 0; idx < count; ++idx) {
        char * reversed = function(testcases[idx].input);
        assert(reversed != NULL);
        assert(!strcmp(reversed, testcases[idx].result));
        /* %zu is the conversion specifier that matches size_t. */
        printf("Testcase #%zu passed\n", idx);
        free (reversed);
    }
    return EXIT_SUCCESS;
}
Hope you have time to understand how it works before your homework's deadline is up. :)

Get substring of string using C

Say I have a const char* string like this:
../products/product_code1233213/image.jpg
I want to retrieve the second last part of this path string, which is the parent folder name of the jpg file, how would I do that?
You can use strtok.
#include <string.h>
#include <stdio.h>
/*
 * Split a path on '/' with strtok() and print every component on its own
 * line, prefixed by a space.
 */
int main()
{
    char str[] = "/products/product_code1233213/image.jpg";
    char s[2] = "/";

    /* The first call receives the buffer; later calls pass NULL to continue in it. */
    for (char *piece = strtok(str, s); piece != NULL; piece = strtok(NULL, s)) {
        printf( " %s\n", piece );
    }
    return(0);
}
Output:
products
product_code1233213
image.jpg
This version works with a const char *:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Print the path component that sits between the last two '/' characters of
 * a constant path (the parent folder of the file name).
 *
 * Returns 0 on success, EXIT_FAILURE if memory cannot be allocated.
 */
int main()
{
    const char *s = "/products/product_code1233213/image.jpg";
    const char *p = s, *begin = s, *end = s;

    /*
     * After the loop, end points just past the last '/' and begin points
     * just past the one before it (or at s when there is only one '/').
     */
    while ((p = strchr(p, '/')) != NULL) {
        begin = end;
        end = ++p;
    }

    /* begin == end means the path contains no '/' at all. */
    if (begin != end) {
        /* Exclude the '/' that terminates the component. */
        const size_t len = (size_t)(end - begin) - 1;
        char *result = malloc(len + 1);
        if (result == NULL) {
            perror("malloc");
            return EXIT_FAILURE;
        }
        memcpy(result, begin, len);
        result[len] = '\0';
        printf("%s\n", result);
        free(result);
    }
    return 0;
}
Using only strchr() and no backtracking. Fast and const-safe.
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/* Character that separates the components of a path. */
#define SEPARATOR '/'

/* Sample path whose second-to-last component is printed by main(). */
const char *path = "../products/product_code1233213/image.jpg";

/*
 * Print the component of path that lies directly before the last separator.
 * Exits with status 1 when path contains no separator at all.
 */
int main(void) {
    const char *start = path;
    const char *stop = strchr(path, SEPARATOR);

    if (stop == NULL) {
        exit(1); /* no separators */
    }

    /* Slide the [start, stop) window forward one component per separator found. */
    for (;;) {
        const char *next = strchr(stop + 1, SEPARATOR);
        if (next == NULL) {
            break;
        }
        start = stop + 1;
        stop = next;
    }

    const int width = (int) (stop - start);
    (void) printf("%.*s\n", width, start);
    return 0;
}

c - sprintf() behaviour with argv

This is a very simple question, and the mistake is probably mine, but can anyone explain how I could use sprintf to add to an argv value?
On the command line, I have a file name say data.new.txt (but in my scenario I don't know the name of the file) how can I write to a new file that is named data.new.output.txt
I don't want to create a new file with a different name or update the new file, because this program will output about 100+ files and that's the specification.
/*
 * Asker's snippet: format "<argv[1]>.decoded" into a fixed 100-byte buffer
 * and print it followed by a newline.
 */
int main(int argc, char * argv[]){
char buffer[100];
/* NOTE(review): argv[1] is used without consulting argc, and sprintf() does not limit the write to sizeof buffer. */
sprintf(buffer, "%s.decoded", argv[1]);
printf("%s\n", buffer);
}
Cheers!
Here is a C99 (it uses snprintf()) program for you. Have fun!
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Insert newpart, followed by a '.', in front of the last '.'-separated
 * component of old: ("data.new.txt", "output") gives "data.new.output.txt".
 *
 * Returns a malloc()ed string that the caller must free(), or NULL when old
 * contains no '.' or the allocation fails.
 */
char *buildname(const char *old, const char *newpart) {
    const char *last_dot = strrchr(old, '.');
    if (last_dot == NULL) {
        return NULL;
    }

    /* Number of leading characters of old to keep, including the last dot. */
    const int prefix_len = (int)(last_dot - old) + 1;
    const char *suffix = last_dot + 1;

    /* First pass with a NULL buffer only measures the formatted length. */
    size_t needed = snprintf(NULL, 0, "%.*s%s.%s",
                             prefix_len, old, newpart, suffix);

    char *result = malloc(needed + 1);
    if (result == NULL) {
        return NULL;
    }

    snprintf(result, needed + 1, "%.*s%s.%s",
             prefix_len, old, newpart, suffix);
    return result;
}
int main(void) {
char *p, *q;
q = "data.new.txt";
p = buildname(q, "output");
if (p) printf("%s ==> %s\n", q, p);
else printf("%s ==> (no dots)\n", q);
free(p);
q = "rrrrrrrrrrrr";
p = buildname(q, "aaa");
if (p) printf("%s ==> %s\n", q, p);
else printf("%s ==> (no dots)\n", q);
free(p);
q = "a.b.c.d.e.f.g.h.i.j.k.l";
p = buildname(q, "Q.Q");
if (p) printf("%s ==> %s\n", q, p);
else printf("%s ==> (no dots)\n", q);
free(p);
return 0;
}

Erase last members of line from text file

I have a text file as data.txt and I want to delete the last members of each line:
Here's the text file:
2031,2,0,0,0,0,0,0,54,0,
2027,2,0,0,0,0,0,0,209,0,
2029,2,0,0,0,0,0,0,65,0,
2036,2,0,0,0,0,0,0,165,0,
I would like to delete so it becomes:
2031,2,0,0,0,0,0,0,
2027,2,0,0,0,0,0,0,
2029,2,0,0,0,0,0,0,
2036,2,0,0,0,0,0,0,
I'm working in C but as the numbers can have two or three digits, I'm not sure how to do this.
A couple of uses of strrchr() can do the job:
#include <string.h>
/*
 * Truncate line in place so that it ends right after its second-to-last
 * comma. A line with exactly one comma is cut at that comma (the comma is
 * removed too); a line without any comma is left unchanged.
 */
void zap_last_field(char *line)
{
    char *cut = strrchr(line, ',');
    if (cut == NULL)
        return;

    /* Drop the final comma and everything after it ... */
    *cut = '\0';

    /* ... then keep the comma that is now last and end the string after it. */
    cut = strrchr(line, ',');
    if (cut == NULL)
        return;
    cut[1] = '\0';
}
Compiled code that seems to work. Note that given a string containing a single comma, it will zap that comma. If you don't want that to happen, then you have to work a little harder.
Test code for zap_last_field()
#include <string.h>
extern void zap_last_field(char *line);

/*
 * Shorten line in place: the last comma and everything after it are removed,
 * then the string is ended directly after the comma that has become the last
 * one. With a single comma only the first step applies; with no comma the
 * line is left as it is.
 */
void zap_last_field(char *line)
{
    char *comma;

    if ((comma = strrchr(line, ',')) != NULL) {
        comma[0] = '\0';
        if ((comma = strrchr(line, ',')) != NULL) {
            comma[1] = '\0';
        }
    }
}
#include <stdio.h>
#include <stdlib.h>
/*
 * Test driver: for every line read from stdin, print the line as read and
 * then the same line after zap_last_field() has been applied to it.
 */
int main(void)
{
    enum { LINE_CAPACITY = 4096 };
    char *buffer = malloc(LINE_CAPACITY);

    /* Nothing is processed when the buffer cannot be allocated. */
    if (buffer == NULL)
        return(0);

    while (fgets(buffer, LINE_CAPACITY, stdin) != NULL) {
        printf("Line: %s", buffer);
        zap_last_field(buffer);
        printf("Zap1: %s\n", buffer);
    }

    free(buffer);
    return(0);
}
This has been vetted with valgrind and is OK on both the original data file and the mangled data file listed below. The dynamic memory allocation is there to give valgrind the maximum chance of spotting any problems.
I strongly suspect that the core dump reported in a comment happens because the alternative test code tried to pass a literal string to the function, which won't work because literal strings are not generally modifiable and this code modifies the string in situ.
Test code for zap_last_n_fields()
If you want to zap the last couple of fields (a controlled number of fields), then you'll probably want to pass in a count of the number of fields to be zapped and add a loop. Note that this code uses a VLA so it requires a C99 compiler.
#include <string.h>
extern void zap_last_n_fields(char *line, size_t nfields);

/*
 * Truncate line in place so that it ends right after its (nfields + 1)-th
 * comma counted from the end.
 *
 * line    - NUL-terminated, writable string.
 * nfields - number of trailing comma-terminated fields to remove.
 *
 * If line contains fewer than nfields + 1 commas it is left untouched.
 * The scan uses no auxiliary storage, so the cost does not depend on
 * nfields and no variable-length array is required.
 */
void zap_last_n_fields(char *line, size_t nfields)
{
    size_t commas_seen = 0;

    /* Walk backwards so that commas are counted from the end of the line. */
    for (size_t i = strlen(line); i-- > 0;) {
        if (line[i] != ',')
            continue;
        if (commas_seen == nfields)
        {
            /* This comma is kept; everything after it is cut off. */
            line[i + 1] = '\0';
            return;
        }
        commas_seen++;
    }
    /* Not enough commas: nothing was written, so there is nothing to undo. */
}
#include <stdio.h>
/*
 * Test driver: for every line read from stdin, print the line and then the
 * result of removing its last 1, 2 and 3 fields. Each removal starts from a
 * fresh copy of the line.
 */
int main(void)
{
    char line1[4096];
    while (fgets(line1, sizeof(line1), stdin) != 0)
    {
        printf("Line: %s", line1);
        char line2[4096];
        for (size_t i = 1; i <= 3; i++)
        {
            /* line2 is as large as line1, so the copy always fits. */
            strcpy(line2, line1);
            zap_last_n_fields(line2, i);
            /* %zu is the conversion specifier that matches size_t. */
            printf("Zap%zu: %s\n", i, line2);
        }
    }
    return(0);
}
Example run — using your data.txt as input:
Line: 2031,2,0,0,0,0,0,0,54,0,
Zap1: 2031,2,0,0,0,0,0,0,54,
Zap2: 2031,2,0,0,0,0,0,0,
Zap3: 2031,2,0,0,0,0,0,
Line: 2027,2,0,0,0,0,0,0,209,0,
Zap1: 2027,2,0,0,0,0,0,0,209,
Zap2: 2027,2,0,0,0,0,0,0,
Zap3: 2027,2,0,0,0,0,0,
Line: 2029,2,0,0,0,0,0,0,65,0,
Zap1: 2029,2,0,0,0,0,0,0,65,
Zap2: 2029,2,0,0,0,0,0,0,
Zap3: 2029,2,0,0,0,0,0,
Line: 2036,2,0,0,0,0,0,0,165,0,
Zap1: 2036,2,0,0,0,0,0,0,165,
Zap2: 2036,2,0,0,0,0,0,0,
Zap3: 2036,2,0,0,0,0,0,
It also correctly handles a file such as:
2031,0,0,
2031,0,
2031,
2031
,

Resources