I'm after some simple examples and best practices of how to use regular expressions in ANSI C. man regex.h does not provide that much help.
Regular expressions actually aren't part of ANSI C. It sounds like you might be talking about the POSIX regular expression library, which comes with most (all?) *nixes. Here's an example of using POSIX regexes in C (based on this):
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Minimal POSIX regex example: compile a pattern, test one string against
 * it, report the outcome, and release the compiled pattern.
 */
int main(void)
{
    regex_t regex;
    int reti;
    char msgbuf[100];

    /* Compile regular expression (cflags 0 selects basic POSIX syntax). */
    reti = regcomp(&regex, "^a[[:alnum:]]", 0);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        exit(1);
    }

    /* Execute regular expression */
    reti = regexec(&regex, "abc", 0, NULL, 0);
    if (!reti) {
        puts("Match");
    } else if (reti == REG_NOMATCH) {
        puts("No match");
    } else {
        /* Any other value is an error code; turn it into readable text. */
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(1);
    }

    /* Free memory allocated to the pattern buffer by regcomp() */
    regfree(&regex);
    return 0;
}
Alternatively, you may want to check out PCRE, a library for Perl-compatible regular expressions in C. The Perl syntax is pretty much the same syntax used in Java, Python, and a number of other languages. The POSIX syntax is the syntax used by grep, sed, vi, etc.
This is an example of using REG_EXTENDED.
This regular expression
"^(-)?([0-9]+)((,|.)([0-9]+))?\n$"
It lets you match decimal numbers written in either the Spanish style (comma as the decimal separator) or the international style (period as the decimal separator). :)
#include <regex.h>
#include <stdlib.h>
#include <stdio.h>
regex_t regex;
int reti;
char msgbuf[100];   /* holds the current input line, and the error text on failure */

/*
 * Reads lines from standard input until end of input and prints, for each
 * line, whether it is an optionally signed decimal number whose fractional
 * part is introduced by ',' or '.'.
 *
 * Returns 0 at end of input; exits with status 1 if the pattern cannot be
 * compiled or regexec() reports an error.
 */
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    /*
     * Compile once, before the loop: the pattern never changes.  The dot is
     * escaped so that only a literal '.' or ',' is accepted as separator;
     * an unescaped '.' would match any character.  The trailing "\n" is the
     * newline that fgets() leaves in the buffer.
     */
    reti = regcomp(&regex, "^(-)?([0-9]+)((,|\\.)([0-9]+))?\n$", REG_EXTENDED);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        exit(1);
    }

    /* Stop at EOF or read error instead of re-testing a stale buffer forever. */
    while (fgets(msgbuf, sizeof(msgbuf), stdin) != NULL) {
        /* Execute regular expression */
        printf("%s\n", msgbuf);
        reti = regexec(&regex, msgbuf, 0, NULL, 0);
        if (!reti) {
            puts("Match");
        } else if (reti == REG_NOMATCH) {
            puts("No match");
        } else {
            regerror(reti, &regex, msgbuf, sizeof(msgbuf));
            fprintf(stderr, "Regex match failed: %s\n", msgbuf);
            regfree(&regex);
            exit(1);
        }
    }

    /* Free memory allocated to the pattern buffer by regcomp() */
    regfree(&regex);
    return 0;
}
It's probably not what you want, but a tool like re2c can compile POSIX(-ish) regular expressions to ANSI C. It's written as a replacement for lex, but this approach allows you to sacrifice flexibility and legibility for the last bit of speed, if you really need it.
man regex.h doesn't show any manual entry for regex.h, but man 3 regex shows a page explaining the POSIX functions for pattern matching.
The same functions are described in The GNU C Library: Regular Expression Matching, which explains that the GNU C Library supports both the POSIX.2 interface and the interface the GNU C Library has had for many years.
For example, for a hypothetical program that prints which of the strings passed as arguments match the pattern passed as the first argument, you could use code similar to the following.
#include <errno.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void print_regerror (int errcode, size_t length, regex_t *compiled);

/*
 * Usage: prog PATTERN STRING...
 *
 * Compiles PATTERN as a POSIX extended regular expression and reports, for
 * every following argument, whether it matches.
 *
 * Returns EXIT_SUCCESS when all strings were tested, EXIT_FAILURE when
 * arguments are missing, the pattern does not compile, or regexec() fails.
 */
int
main (int argc, char *argv[])
{
  regex_t regex;
  int result;

  if (argc < 3)
    {
      // The number of passed arguments is lower than the number of
      // expected arguments.
      fputs ("Missing command line arguments\n", stderr);
      return EXIT_FAILURE;
    }

  result = regcomp (&regex, argv[1], REG_EXTENDED);
  if (result)
    {
      // Any value different from 0 means it was not possible to
      // compile the regular expression, either for memory problems
      // or problems with the regular expression syntax.
      if (result == REG_ESPACE)
        fprintf (stderr, "%s\n", strerror(ENOMEM));
      else
        fputs ("Syntax error in the regular expression passed as first argument\n", stderr);
      return EXIT_FAILURE;
    }

  for (int i = 2; i < argc; i++)
    {
      result = regexec (&regex, argv[i], 0, NULL, 0);
      if (!result)
        {
          printf ("'%s' matches the regular expression\n", argv[i]);
        }
      else if (result == REG_NOMATCH)
        {
          printf ("'%s' doesn't match the regular expression\n", argv[i]);
        }
      else
        {
          // The function returned an error; print the string
          // describing it.
          // Get the size of the buffer required for the error message.
          size_t length = regerror (result, &regex, NULL, 0);
          print_regerror (result, length, &regex);
          // Release the compiled pattern on this exit path too.
          regfree (&regex);
          return EXIT_FAILURE;
        }
    }

  /* Free the memory allocated from regcomp(). */
  regfree (&regex);
  return EXIT_SUCCESS;
}
/*
 * Prints the message for a regcomp()/regexec() error code to stderr.
 *
 * errcode:  value returned by regcomp() or regexec().
 * length:   size of the message buffer to use, normally the value returned
 *           by regerror (errcode, compiled, NULL, 0).  When 0 is passed the
 *           required size is queried here instead.
 * compiled: the pattern buffer the error code relates to.
 */
void
print_regerror (int errcode, size_t length, regex_t *compiled)
{
  // A zero-length variable-length array is undefined behaviour, so never
  // size the buffer with 0; ask regerror() how much room it needs.
  if (length == 0)
    length = regerror (errcode, compiled, NULL, 0);

  // Heap storage keeps a caller-supplied size off the stack.
  char *buffer = malloc (length);
  if (buffer == NULL)
    {
      fprintf (stderr, "Regex match failed: error code %d\n", errcode);
      return;
    }

  (void) regerror (errcode, compiled, buffer, length);
  fprintf (stderr, "Regex match failed: %s\n", buffer);
  free (buffer);
}
The last argument of regcomp() needs to be at least REG_EXTENDED, or the functions will use basic regular expressions, which means that (for example) you would need to use a\{3\} instead of a{3} used from extended regular expressions, which is probably what you expect to use.
POSIX.2 also has another function for wildcard matching: fnmatch(). It does not let you compile the pattern or retrieve the substrings matching a sub-expression, but it is designed specifically for checking whether a filename matches a wildcard (e.g. it supports the FNM_PATHNAME flag).
While the answer above is good, I recommend using PCRE2. This means you can literally use all the regex examples out there now and not have to translate from some ancient regex.
I made an answer for this already, but I think it can help here too..
Regex In C To Search For Credit Card Numbers
// YOU MUST SPECIFY THE UNIT WIDTH BEFORE THE INCLUDE OF pcre2.h
#define PCRE2_CODE_UNIT_WIDTH 8
#include <stdio.h>
#include <string.h>
#include <pcre2.h>
#include <stdbool.h>
int main(){
bool Debug = true;
bool Found = false;
pcre2_code *re;
PCRE2_SPTR pattern;
PCRE2_SPTR subject;
int errornumber;
int i;
int rc;
PCRE2_SIZE erroroffset;
PCRE2_SIZE *ovector;
size_t subject_length;
pcre2_match_data *match_data;
char * RegexStr = "(?:\\D|^)(5[1-5][0-9]{2}(?:\\ |\\-|)[0-9]{4}(?:\\ |\\-|)[0-9]{4}(?:\\ |\\-|)[0-9]{4})(?:\\D|$)";
char * source = "5111 2222 3333 4444";
pattern = (PCRE2_SPTR)RegexStr;// <<<<< This is where you pass your REGEX
subject = (PCRE2_SPTR)source;// <<<<< This is where you pass your bufer that will be checked.
subject_length = strlen((char *)subject);
re = pcre2_compile(
pattern, /* the pattern */
PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
0, /* default options */
&errornumber, /* for error number */
&erroroffset, /* for error offset */
NULL); /* use default compile context */
/* Compilation failed: print the error message and exit. */
if (re == NULL)
{
PCRE2_UCHAR buffer[256];
pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,buffer);
return 1;
}
match_data = pcre2_match_data_create_from_pattern(re, NULL);
rc = pcre2_match(
re,
subject, /* the subject string */
subject_length, /* the length of the subject */
0, /* start at offset 0 in the subject */
0, /* default options */
match_data, /* block for storing the result */
NULL);
if (rc < 0)
{
switch(rc)
{
case PCRE2_ERROR_NOMATCH: //printf("No match\n"); //
pcre2_match_data_free(match_data);
pcre2_code_free(re);
Found = 0;
return Found;
// break;
/*
Handle other special cases if you like
*/
default: printf("Matching error %d\n", rc); //break;
}
pcre2_match_data_free(match_data); /* Release memory used for the match */
pcre2_code_free(re);
Found = 0; /* data and the compiled pattern. */
return Found;
}
if (Debug){
ovector = pcre2_get_ovector_pointer(match_data);
printf("Match succeeded at offset %d\n", (int)ovector[0]);
if (rc == 0)
printf("ovector was not big enough for all the captured substrings\n");
if (ovector[0] > ovector[1])
{
printf("\\K was used in an assertion to set the match start after its end.\n"
"From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
(char *)(subject + ovector[1]));
printf("Run abandoned\n");
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return 0;
}
for (i = 0; i < rc; i++)
{
PCRE2_SPTR substring_start = subject + ovector[2*i];
size_t substring_length = ovector[2*i+1] - ovector[2*i];
printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
}
}
else{
if(rc > 0){
Found = true;
}
}
pcre2_match_data_free(match_data);
pcre2_code_free(re);
return Found;
}
Install PCRE using:
wget https://ftp.pcre.org/pub/pcre/pcre2-10.31.zip
make
sudo make install
sudo ldconfig
Compile using :
gcc foo.c -lpcre2-8 -o foo
Check my answer for more details.
Related
The code below consists of read_files(), which reads a bunch of text files, and a match() function that does string matching against a pattern using the GNU regex library.
Inside read_files() I use getline() with the size argument set to 0, so that getline() starts with its default buffer size of 120 and then grows the buffer as needed.
#include <limits.h> // for PATH_MAX
#include <regex.h> // for regcomp, regerror, regexec, regfree, size_t, REG...
#include <stdio.h> // for printf, fprintf, NULL, fclose, fopen, getline
#include <stdlib.h> // for exit, free, EXIT_FAILURE
/*
 * Tests str against the POSIX extended regular expression regex_str.
 *
 * Returns 1 when str matches and 0 when it does not.  Terminates the
 * process with exit status 1 if the pattern cannot be compiled or
 * regexec() reports an error.
 */
int match(const char *regex_str, const char *str) {
    regex_t regex;
    int reti;

    /* Compile regular expression */
    reti = regcomp(&regex, regex_str, REG_EXTENDED);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        exit(1);
    }

    /* Execute regular expression */
    reti = regexec(&regex, str, 0, NULL, 0);
    if (reti != 0 && reti != REG_NOMATCH) {
        char msgbuf[100];
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(1);
    }

    /* Free the pattern buffer before returning; returning straight from the
       match/no-match branches would leak it on every call. */
    regfree(&regex);
    return reti == 0;
}
/*
 * Prints every line of two hard-coded files, each prefixed with its line
 * number, followed by the buffer capacity that getline() reported.
 * Exits with EXIT_FAILURE if a file cannot be opened.
 *
 * NOTE(review): the getline() buffer is freed inside the per-file loop but
 * the pointer and its recorded capacity are kept, so the second file is
 * read through a pointer to freed memory (see the comments at the free).
 */
void read_files() {
size_t path_count = 2;
char pathnames[2][PATH_MAX] = {"./tmp/test0.conf", "./tmp/test1.conf"};
FILE *fp;
/* line/len are the buffer and capacity that getline() allocates and grows. */
char *line = NULL;
size_t len = 0;
ssize_t read_count;
for (int i = 0; i < path_count; i++) {
printf("opening file %s\n", pathnames[i]);
fp = fopen(pathnames[i], "r");
if (fp == NULL) {
printf("internal error,couldn't open file %s\"}", pathnames[i]);
exit(EXIT_FAILURE);
}
int linenum=1;
/* getline() reuses *line when it is non-NULL and len is non-zero. */
while ((read_count = getline(&line, &len, fp)) != -1) {
printf("%d: %s",linenum,line);
linenum++;
}
printf("len: %zu\n", len);
fclose(fp);
// len=0; // this is the line that fixes the bug, if i reset len to 0 after reading the first file then everything works as expected, if i don't reset it then regex matching fails
/* BUG: line is freed here on every iteration but neither line nor len is
   reset.  On the next iteration getline() receives the stale pointer with a
   non-zero capacity, may write into the freed block, and this free() then
   runs on the same pointer again.  Free once after the loop, or set
   line = NULL and len = 0 here. */
if (line)
free(line);
}
}
/*
 * Entry point: dumps the configured files, then checks that "jack" is
 * accepted by an alphanumeric-only pattern and reports when it is not.
 */
int main(int argc, char *argv[]) {
    read_files();

    int accepted = match("^[a-zA-Z0-9]+$", "jack");
    if (accepted == 0) {
        printf("input don't match\n");
    }
    return 0;
}
the content of test0.conf
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
the content of test1.conf
testing123
when running the above code i get this output:
opening file ./tmp/test0.conf
1: AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA
len: 240
opening file ./tmp/test1.conf
1: testing123
len: 240
input don't match
so the pattern matching is failing with the string "jack" which in reality matches.
You can see that after finishing reading the first file that len is set to 240 so when getline gets executed again for the second file it will read the file with 240 buffer size, but this for some reason causes the regex matching to fail.
If i reset the len to 0 argument after reading the first file then the code works as expected(the regex matching works fine).
So why does the getline() len parameter affect the behavior of the gnu regex?
So why does the getline() len parameter affect the behavior of the gnu regex?
As Marian commented, you are using getline incorrectly, causing it to corrupt the heap. You can observe this by compiling the program with the -fsanitize=address flag and running it. See the Address Sanitizer manual to understand the error.
This is undefined behavior, and your program can do anything. Here it just happens to cause the GNU regex library to stop working correctly. A SIGSEGV is another likely outcome.
To fix the problem, you should move the free call out of the loop and only free the memory after you are done reading the lines.
Setting line = NULL in the loop after you free it is another possible (but less efficient) fix.
I have the following character pointer
char message[100] = "START_MESSAGE hello world \r\n\r\n";
I am trying to use regex.h to parse the above message. I want to get anything between START_MESSAGE and \r\n\r\n
So, I tried the following code (by following answer of THIS SO post)
#include <stdio.h>
#include <regex.h>
#include <stdlib.h>
/*
 * Compiles a basic POSIX regex and tests it against a fixed message that
 * ends in CRLF CRLF, printing "Match" or "No match".
 *
 * NOTE(review): in this pattern '*' applies to the preceding 'E' only, so
 * it means "START_MESSAG" followed by zero or more 'E' and then CRLF CRLF;
 * it does not skip the text in between.  ".*" is the usual way to match
 * arbitrary characters.  The pattern is left exactly as posted.
 */
int main(void) {
    regex_t regex;
    int reti;
    char msgbuf[100];

    reti = regcomp(&regex, "START_MESSAGE*\r\n\r\n", 0);
    if (reti) {
        /* Do not run regexec() on a pattern buffer that failed to compile. */
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Could not compile regex: %s\n", msgbuf);
        exit(1);
    }

    reti = regexec(&regex, "START_MESSAGE hello world\r\n\r\n", 0, NULL, 0);
    if (!reti) {
        puts("Match");
    } else if (reti == REG_NOMATCH) {
        puts("No match");
    } else {
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(1);
    }

    /* Free memory allocated to the pattern buffer by regcomp() */
    regfree(&regex);
    return 0;
}
But, I get no match. I thought, maybe its because of the escape sequence. So, I put \\r\\n\\r\\n and still get no match. I looked for raw string literal (like r before the string in python). But, I get
error: stray ‘R’ in program
I tried removing \r\n\r\n and looked for only START_MESSAGE pattern, I get a match. How can I get \r\n\r\n to be matched or get the text between START_MESSAGE and \r\n\r\n.
I am trying to create a script to detect "not correct" characters on a username. I think the best solution is to apply regex. I have created a sample script for this purpose but I can not find the correct combination for detecting the "bad/faulty" characters.
Update: The regex expression that I am using I found it on Wikipedia Regular expressions
Sample of code:
#include <stdio.h> /* stderr, stdout */
#include <string.h> /* stncpy, strncat etc. */
#include <stdlib.h> /* memory allocation, process control etc. */
#include <sys/types.h>
#include <regex.h>
#define tofind "[^A-Za-z0-9_] $" /* Non word characters */
#define MAX_USERS 4
#define MAX_CHARACTERS 20
typedef struct rec {
char users[MAX_USERS][MAX_CHARACTERS];
}TEMPORARY;
int main (void) {
regex_t regex;
int reti , i;
TEMPORARY *ptr_record;
ptr_record = (TEMPORARY *) malloc (sizeof(TEMPORARY));
if (ptr_record == NULL) {
printf("Out of memmory!\nExit!\n");
exit(0);
}
printf("Sizeof users: %li\n",sizeof(*ptr_record).users);
/* Compile regular expression */
reti = regcomp(®ex, tofind, REG_EXTENDED);
if( reti ){ fprintf(stderr, "Could not compile regex\n"); exit(1); }
for(i = 0; i < MAX_USERS; i++) {
printf("Enter username[%i]:\n",i);
scanf( " %[^\n]" , (*ptr_record).users[i] );
/* Execute regular expression */
reti = regexec(®ex, (*ptr_record).users[i], 0, NULL, 0);
if( !reti ){
puts("Match");
}
else if( reti == REG_NOMATCH ){
puts("No match");
}
else{
regerror(reti, ®ex, (*ptr_record).users[i], sizeof((*ptr_record).users));
fprintf(stderr, "Regex match failed: %s\n", (*ptr_record).users[i]);
exit(1);
}
}
/* Free compiled regular expression if you want to use the regex_t again */
regfree(®ex);
for(i = 0; i < MAX_USERS; i++) {
printf("Username[%i][%s]:\n",i,(*ptr_record).users[i]);
}
return 0;
}
Update 2: Input and Output of the code:
Enter username[0]:
Th#nos
No match
Enter username[1]:
t#est
No match
Enter username[2]:
!anotherT$est
No match
Enter username[3]:
S%mple
No match
Username[0][Th#nos]:
Username[1][t#est]:
Username[2][!anotherT$est]:
Username[3][S%mple]:
I thought that all these inputs would be detected by the regular expression and produce a non match.
Answer to the specific question: Finally after some experimentation I understood what I was doing wrong. My regex was not correctly defined. So at the specific task that I want to apply, the correct regex would be "[^A-Za-z0-9_]" instead of "[^A-Za-z0-9_] $". It works perfectly sample of output:
Enter username[0]:
thanos
No match
Enter username[1]:
thanos test
Match
Enter username[2]:
th#no
Match
Enter username[3]:
test
No match
Username[0][thanos]:
Username[1][thanos test]:
Username[2][th#no]:
Username[3][test]:
I am trying to write a program to find whether a given string is hex or not. So the given string must contain only characters in the ranges 0-9, A-F and a-f. How can I accomplish this using C?
The program I tried is given below, but the regex pattern is not working well. What is the error in this pattern?
#include <sys/types.h>
#include <regex.h>
#include <stdio.h>
/*
 * Tests a fixed string against a hexadecimal-digits pattern and prints
 * "Match" or "No match".
 *
 * NOTE(review): cflags is 0, so the pattern is compiled with basic POSIX
 * syntax; it is left exactly as posted.
 *
 * Returns 0 after a completed test, 1 if compiling or matching fails.
 */
int main(int argc, char *argv[]){
    regex_t regex;
    int reti;
    char msgbuf[100];

    (void)argc;
    (void)argv;

    /* Compile regular expression */
    reti = regcomp(&regex, "^[a-fA-F0-9]+$", 0);
    if (reti) {
        fprintf(stderr, "Could not compile regex\n");
        /* Stop here: the pattern buffer is not usable by regexec(). */
        return 1;
    }

    /* Execute regular expression */
    reti = regexec(&regex, "ABC123defG", 0, NULL, 0);
    if (!reti) {
        puts("Match");
    } else if (reti == REG_NOMATCH) {
        puts("No match");
    } else {
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        fprintf(stderr, "Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        return 1;
    }

    /* Free compiled regular expression if you want to use the regex_t again */
    regfree(&regex);
    return 0;
}
You need to specify REG_EXTENDED in the flags argument to regcomp. If you don't, you end up with "basic" regular expression syntax, which doesn't include the + operator, amongst other things.
It's slightly surprising that "basic" regular expressions still exist, never mind being the default. But that's backwards-compatibility for you.
I'm trying to create a collection of regexes in C, without much success.
Currently I'm trying to find include statements with the following regex:
(#include <.+>)|(#include \".+\")
here is my code:
#include <stdio.h>
#include <stdlib.h>
#include <regex.h>
/* Pattern and subject are string literals, hence read-only. */
const char *regex_str = "(#include <.+>)|(#include \".+\")";
const char *str = "#include <stdio.h>";
regex_t regex;
int reti;

/*
 * Compiles regex_str and tests str against it, printing "Match" or
 * "No Match".
 *
 * NOTE(review): cflags is 0, so the pattern is compiled with basic POSIX
 * syntax; pattern and flags are left exactly as posted.
 *
 * Returns 0 after a completed test; exits with status 1 if compiling or
 * matching fails.
 */
int main(void) {
    /* Compile Regex */
    reti = regcomp(&regex, regex_str, 0);
    if (reti) {
        printf("Could not compile regex.\n");
        exit(1);
    }

    /* Exec Regex */
    reti = regexec(&regex, str, 0, NULL, 0);
    if (!reti) {
        printf("Match\n");
    } else if (reti == REG_NOMATCH) {
        printf("No Match\n");
    } else {
        /* The error text needs its own writable buffer: str points at a
           string literal, and sizeof(str) is only the size of a pointer. */
        char msgbuf[100];
        regerror(reti, &regex, msgbuf, sizeof(msgbuf));
        printf("Regex match failed: %s\n", msgbuf);
        regfree(&regex);
        exit(1);
    }

    /* Free compiled regular expression if you want to use the regex_t again */
    regfree(&regex);
    return 0;
}
The result I get is: No Match
What am I doing wrong?
You might need to escape your match group:
/* The match group is written with escaped parentheses, \( and \), which
   appear as \\( and \\) inside a C string literal; the bracket expressions
   accept either a quote or an angle bracket around the header name. */
char *regex_str = "\\(#include [\"<].*[\">]\\)";
Which could likely be rolled into one pattern.