PCRE is not matching utf8 characters - c

I'm compiling a PCRE pattern with utf8 flag enabled and am trying to match a utf8 char* string against it, but it is not matching and pcre_exec returns negative. I'm passing the subject length as 65 to pcre_exec which is the number of characters in the string. I believe it expects the number of bytes so I have tried with increasing the argument till 70 but still get the same result. I don't know what else is making the match fail. Please help before I shoot myself.
(If I try without the flag PCRE_UTF8 however, it matches but the offset vector[1] is 30 which is index of the character just before a unicode character in my input string)
#include "stdafx.h"
#include "pcre.h"
#include <pcre.h> /* PCRE lib NONE */
#include <stdio.h> /* I/O lib C89 */
#include <stdlib.h> /* Standard Lib C89 */
#include <string.h> /* Strings C89 */
#include <iostream>
int main(int argc, char *argv[])
{
pcre *reCompiled;
int pcreExecRet;
int subStrVec[30];
const char *pcreErrorStr;
int pcreErrorOffset;
char* aStrRegex = "(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
// params can be an empty pair of parenthesis or have parameters inside them as well.
"\\(\\s*(?<params>[?\\w,]+)\\s*\\)"
// paramList along with its parenthesis is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
")?";
reCompiled = pcre_compile(aStrRegex, 0, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
exit(1);
}
char* line = "?rt?=call SqlTxFunctionTesting(?înFîéld?,?outField?,?inOutField?)";
pcreExecRet = pcre_exec(reCompiled,
NULL,
line,
65, // length of string
0, // Start looking at this point
0, // OPTIONS
subStrVec,
30); // Length of subStrVec
printf("\nret=%d",pcreExecRet);
//int substrLen = pcre_get_substring(line, subStrVec, pcreExecRet, 1, &mantissa);
}

1)
char * q= "î";
printf("%d, %s", q[0], q);
Output:
63, ?
2) You must rebuild PCRE with PCRE_BUILD_PCRE16 (or 32) and PCRE_SUPPORT_UTF. And use pcre16.lib and/or pcre16.dll. Then you can try this code:
pcre16 *reCompiled;
int pcreExecRet;
int subStrVec[30];
const char *pcreErrorStr;
int pcreErrorOffset;
wchar_t* aStrRegex = L"(\\?\\w+\\?\\s*=)?\\s*(call|exec|execute)\\s+(?<spName>\\w+)("
// params can be an empty pair of paranthesis or have parameters inside them as well.
L"\\(\\s*(?<params>[?,\\w\\p{L}]+)\\s*\\)"
// paramList along with its paranthesis is optional below so a SP call can be just "exec sp_name" for a stored proc call without any parameters.
L")?";
reCompiled = pcre16_compile((PCRE_SPTR16)aStrRegex, PCRE_UTF8, &pcreErrorStr, &pcreErrorOffset, NULL);
if(reCompiled == NULL) {
printf("ERROR: Could not compile '%s': %s\n", aStrRegex, pcreErrorStr);
exit(1);
}
const wchar_t* line = L"?rt?=call SqlTxFunctionTesting( ?inField?,?outField?,?inOutField?,?fd? )";
const wchar_t* mantissa=new wchar_t[wcslen(line)];
pcreExecRet = pcre16_exec(reCompiled,
NULL,
(PCRE_SPTR16)line,
wcslen(line), // length of string
0, // Start looking at this point
0, // OPTIONS
subStrVec,
30); // Length of subStrVec
printf("\nret=%d",pcreExecRet);
for (int i=0;i<pcreExecRet;i++){
int substrLen = pcre16_get_substring((PCRE_SPTR16)line, subStrVec, pcreExecRet, i, (PCRE_SPTR16 *)&mantissa);
wprintf(L"\nret string=%s, length=%i\n",mantissa,substrLen);
}
3) \w = [0-9A-Z_a-z]. It doesn't contains unicode symbols.
4) This can really help: http://answers.oreilly.com/topic/215-how-to-use-unicode-code-points-properties-blocks-and-scripts-in-regular-expressions/
5) from PCRE 8.33 source (pcre_exec.c:2251)
/* Find out if the previous and current characters are "word" characters.
It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
be "non-word" characters. Remember the earliest consulted character for
partial matching. */

Related

C regex extraction

Please consider this C code:
#include <stdio.h>
#include <regex.h>
#include <string.h>
int main(){
char * our_string = "/var/www/html/cameras/cam7/2020-01/15/cam7-2020-01-15-17-45-20-1037-03.h264";
regex_t re;
//int regex_int = regcomp(&re, "cam[:digit:]", 0);
int regex_int = regcomp(&re, "cam", 0);
if (regex_int) {
fprintf(stderr, "regex failed to compile!");
return 1;
}
regmatch_t rm[2];
if ((regexec(&re, our_string, 2, rm,0)) ){
fprintf(stderr, "regex failed to exec!");
return 1;
}
char temp[8192] = {0};
memcpy(temp, our_string + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
printf("We got: %s\n", temp);
puts("Bye!");
return 0;
}
I am trying to extract camX out of our_string, and need help. In its current form, above code is turning blank:
$ ./a.out
We got:
Bye!
C regex is not my forte, Please help!
You have a couple of problems:
//int regex_int = regcomp(&re, "cam[:digit:]", 0)
If you want to match cam followed by a digit, you need (Besides uncommenting this line, of course, and commenting out the one beneath it), to put [:digit:] inside a bracket expression:
int regex_int = regcomp(&re, "cam[[:digit:]]", 0)
The second issue:
memcpy(temp, our_string + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
Neither of your regular expressions have any groups; the second element of the rm array is not going to have anything useful in it. You need to use the first element, which has the offsets of the complete match:
memcpy(temp, our_string + rm[0].rm_so, rm[0].rm_eo - rm[0].rm_so);
You also have a memory leak because you don't have a regfree(&re); to free up memory allocated for the regular expression. Not a big deal in a simple demo program like this, but in something bigger or longer running or that does the matching in a loop, it'll become an issue.

ssscanf unexpected behavior

When I use sscanf on url http://www.website.com:30081/text.txt, like this:
int parse_url(char *url, requesthdrs *hdrs, char
*uri, char *domain_name) {
int request_port;
int host_hdr_len;
char temp[MAXLINE];
size_t uri_len = 1;
/* case 1: request_port specified */
/* format: http://domain_name:request_port/... */
if(sscanf(url, "http://%s:%d%s", domain_name,
&request_port, uri) == 3) {
fprintf(stderr, "case1\n");
host_hdr_len = build_hosthdr(hdrs->host,
domain_name, request_port);
}
The program fails to enter case 1, why is that the case?
The %s reads until the first white space, or end of string. That isn't what you want or need. Use %[^:] (a scan set that matches anything except a colon)
instead.
Beware of buffer overflows. You don't know how big the strings can be.

How to compare my string, which is stored in an array, to function names from a complete library in c

After I enter a string in c and store it in for example char s[100], how can I compare that string to all function names in a math.h? For example, I enter pow and the result will look like this in stored form.
s[0]='p'
s[1]='o'
s[2]='w'
s[3]='\0'
Since my string is the equivalent of pow(), I want my program to recognise that and then call pow() during execution of my program. I know it is not that hard to do string comparison within the code, but that would mean that I would have to do string comparison for every function name in the library. I don't want to do that. How is it possible to compare my string against all names in the library without hard coding every comparison?
Thank you :)
You can't, not without doing work yourself. There are no names of functions present at runtime in general, and certainly not of functions you haven't called.
C is not a dynamic language, names are only used when compiling/linking.
Regular expressions in C
Try parsing the header files using FILE and use aforementioned link as a guide to check whether the function exists or not.
I tried to make a little sample about what I assume the questioner is looking for (eval.c):
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <assert.h>
/* mapping function names to function pointers and number of parameters */
struct Entry {
const char *name; /* function name */
double (*pFunc)(); /* function pointer */
int nArgs; /* number of arguments */
} table[] = {
#define REGISTER(FUNC, N_ARGS) { #FUNC, &FUNC, N_ARGS }
REGISTER(atan2, 2),
REGISTER(pow, 2),
REGISTER(modf, 2),
REGISTER(sin, 1),
REGISTER(cos, 1)
#undef REGISTER
};
/* let compiler count the number of entries */
enum { sizeTable = sizeof table / sizeof *table };
void printUsage(const char *argv0)
{
int i;
printf(
"Usage:\n"
" %s FUNC\n"
" where FUNC must be one of:\n", argv0);
for (i = 0; i < sizeTable; ++i) printf(" - %s\n", table[i].name);
}
int main(int argc, char **argv)
{
int i;
char *func;
struct Entry *pEntry;
/* read command line argument */
if (argc <= 1) {
fprintf(stderr, "ERROR: Missing function argument!\n");
printUsage(argv[0]);
return -1;
}
func = argv[1];
/* find function by name */
for (i = 0; i < sizeTable && strcmp(func, table[i].name) != 0; ++i);
if (i >= sizeTable) {
fprintf(stderr, "ERROR! Unknown function '%s'!\n", func);
printUsage(argv[0]);
return -1;
}
/* perform found function on all (standard) input */
pEntry = table + i;
for (;;) { /* endless loop (bail out at EOF or error) */
switch (pEntry->nArgs) {
case 1: {
double arg1, result;
/* get one argument */
if (scanf("%lf", &arg1) != 1) {
int error;
if (error = !feof(stdin)) fprintf(stderr, "Input ERROR!\n");
return error; /* bail out at EOF or error */
}
/* compute */
result = (*pEntry->pFunc)(arg1);
/* output */
printf("%s(%f): %f\n", pEntry->name, arg1, result);
} break;
case 2: {
double arg1, arg2, result;
/* get two arguments */
if (scanf("%lf %lf", &arg1, &arg2) != 2) {
int error;
if (error = !feof(stdin)) fprintf(stderr, "Input ERROR!\n");
return error; /* bail out at EOF or error */
}
/* compute */
result = (*pEntry->pFunc)(arg1, arg2);
/* output */
printf("%s(%f, %f): %f\n", pEntry->name, arg1, arg2, result);
} break;
default: /* should never happen */
fprintf(stderr,
"ERROR! Functions with %d arguments not yet implemented!\n",
pEntry->nArgs);
assert(0);
return -1; /* bail out at error */
}
}
}
I compiled and tested this with gcc in cygwin on Windows (64 bit):
$ gcc -std=c11 -o eval eval.c
$ ./eval
ERROR: Missing function argument!
Usage:
./eval FUNC
where FUNC must be one of:
- atan2
- pow
- modf
- sin
- cos
$ echo "1 2 3 4 5 6 7 8 9 10" | ./eval pow
pow(1.000000, 2.000000): 1.000000
pow(3.000000, 4.000000): 81.000000
pow(5.000000, 6.000000): 15625.000000
pow(7.000000, 8.000000): 5764801.000000
pow(9.000000, 10.000000): 3486784401.000000
$ echo "1 2 3 4 5 6 7 8 9 10" | ./eval sin
sin(1.000000): 0.841471
sin(2.000000): 0.909297
sin(3.000000): 0.141120
sin(4.000000): -0.756802
sin(5.000000): -0.958924
sin(6.000000): -0.279415
sin(7.000000): 0.656987
sin(8.000000): 0.989358
sin(9.000000): 0.412118
sin(10.000000): -0.544021
The usage of this application: the name of the function to apply is provided as command line argument. The values (to apply function to) are provided via standard input. In the sample session, I used echo and a pipe (|) to redirect the output of echo to the input of eval. (If eval is called stand-alone the numbers may be typed in by keyboard.)
Notes:
The table does the actual mapping of strings to function pointers. To solve that issue about the number of parameters, I considered this in struct Entry also.
The REGISTER macro is a trick to use the identifier as string constant also. The #FUNC is a stringize macro-operation (a typical C trick to prevent errors due to typos).
The sizeTable is another trick to prevent redundant definitions. I let the compiler count the number of entries. Thus, new entries may be added and it still will work without any other editing.
The actual trick is to provide a function pointer where the arguments are "left out". When it is called, the correct number of arguments is used and it works. (assuming, of course, the table initialization has been implemented carefully.) However, it would be a pain to do this in C++ because the functions with distinct number of arguments would need an appropriate function pointer with matching signature - horrible casts would be necessary. (Try to compile this with g++ -std=c++11 -c eval.c to see what I mean.)
For a productive solution, I would sort the entries by names (lexicographically) and apply a binary search (or even use hashing to be faster and more sophisticated). For this sample, I wanted to keep it simple.
math.h provides a lot of functions in "float flavor" also. These may not be added to this sample without additional effort. To support other than double arguments
some type info had to been added to the table entries
the type info has to be considered somehow in the switch statement of evaluation.
...not to mention functions where argument types are distinct to each other (or return type). (I cannot remember whether math.h even provides such functions.)
Btw. this will work for non-math.h functions also...

pattern matching / extracting in c using regex.h

I need help extracting a substring from a string using regex.h in C.
In this example, I am trying to extract all occurrences of character 'e' from a string 'telephone'. Unfortunately, I get stuck identifying the offsets of those characters. I am listing code below:
#include <stdio.h>
#include <regex.h>
int main(void) {
const int size=10;
regex_t regex;
regmatch_t matchStruct[size];
char pattern[] = "(e)";
char str[] = "telephone";
int failure = regcomp(&regex, pattern, REG_EXTENDED);
if (failure) {
printf("Cannot compile");
}
int matchFailure = regexec(&regex, pattern, size, matchStruct, 0);
if (!matchFailure) {
printf("\nMatch!!");
} else {
printf("NO Match!!");
}
return 0;
}
So per GNU's manual, I should get all of the occurrences of 'e' when a character is parenthesized. However, I always get only the first occurrence.
Essentially, I want to be able to see something like:
matchStruct[1].rm_so = 1;
matchStruct[1].rm_so = 2;
matchStruct[2].rm_so = 4;
matchStruct[2].rm_so = 5;
matchStruct[3].rm_so = 7;
matchStruct[3].rm_so = 8;
or something along these lines. Any advice?
Please note that you are in fact not comparing your compiled regex against str ("telephone") but rather to your plain-text pattern. Check your second attribute to regexec. That fixed, proceed for instance to "regex in C language using functions regcomp and regexec toggles between first and second match" where the answer to your question is already given.

syscall read acting weird

c lang, ubuntu
so i have a task - write a menu with these 3 options:
1. close program
2. show user id
3. show current working directory
i can only use 3 libraries - unistd.h, sys/syscall.h, sys/sysinfo.h.
so no printf/scanf
i need to use an array of a struct im given, that has a function pointer,
to call the function the user wants to use.
problem is on options 2 & 3;
when i pick 2, on the first time it works fine (i think)
second time i pick 2, it works, but then when going to the third iteration,
it doesn't wait for an input, it takes '\n' as an input for some reason, then it says invalid input. (i checked what it takes as input with printf, i printed index after recalculating it and it because -39, so it means selection[0] = 10 = '\n')
that's the first problem, that i just cant find the solution for.
second problem is on the current working directory function;
the SYS_getcwd returns -1 for some reason, which means there's an error, but i cant figure it out.
any explanations for these things?
(also - slen and __itoa are functions i am given - slen returns the length of a string,
__itoa returns a char*, that was the string representation of an integer)
helper.h:
typedef struct func_desc {
char *name;
void (*func)(void);
} fun_desc;
main.c:
#include <unistd.h>
#include "helper.h"
#include <sys/syscall.h>
#include <sys/sysinfo.h>
void exitProgram();
void printID();
void currDir();
int main() {
fun_desc arrFuncs[3];
arrFuncs[0].name = "exitProgram";
arrFuncs[0].func = &exitProgram;
arrFuncs[1].name = "printID";
arrFuncs[1].func = &printID;
arrFuncs[2].name = "currDir";
arrFuncs[2].func = &currDir;
char selection[2];
int index;
const char* menu = "Welcome to the menu. Please pick one of the following actions:\n1.Close the program\n2.Print the current user's id\n3.Print the current directory's id\n";
while(1 == 1) {
syscall(SYS_write, 0, menu, slen(menu));
syscall(SYS_write, 0, "Your selection: ", slen("Your selection: "));
syscall(SYS_read, 1, selection, slen(selection)); //might be a problem
selection[1] = '\0';
index = selection[0] - '0' - 1;
if(index > 2)
syscall(SYS_write, 0, "Invalid input\n", slen("Invalid input\n"));
else
arrFuncs[index].func();
}
return(0);
}
void exitProgram() {
syscall(SYS_write, 0, "The program will close\n", slen("The program will close\n"));
syscall(SYS_exit);
}
void printID() { //problem
char* uid = __itoa(syscall(SYS_getuid));
syscall(SYS_write, 0, uid, slen(uid));
syscall(SYS_write, 0, "\n", slen("\n"));
}
void currDir() { //????
char* buf = __itoa(syscall(SYS_getcwd));
syscall(SYS_write, 0, buf, slen(buf));
syscall(SYS_write, 0, "\n", slen("\n"));
}
You're passing the wrong number of arguments to some of these system calls. In particular:
syscall(SYS_exit);
_exit() takes one argument: the exit code.
char* buf = __itoa(syscall(SYS_getcwd));
getcwd() takes two arguments: a pointer to a buffer to write the string to, and the length of that buffer. In practice, this probably looks something like:
char pathbuf[PATH_MAX];
syscall(SYS_getcwd, pathbuf, sizeof(pathbuf));
If you don't have the header which defines PATH_MAX, define it yourself. 4096 is an appropriate value.
Note that getcwd() writes a string into the buffer passed to it — it does not return a numeric identifier.
As an aside, you may want to save yourself some time by implementing a wrapper to write a string, e.g.
void putstring(const char *str) {
syscall(SYS_write, 0, str, slen(str));
}
since you seem to be doing that a lot.

Resources