C regex extraction - c

Please consider this C code:
#include <stdio.h>
#include <regex.h>
#include <string.h>
/* Demo: run a POSIX regex over a camera file path and print the bytes
 * described by the second regmatch_t slot (rm[1]). */
int main(){
char * our_string = "/var/www/html/cameras/cam7/2020-01/15/cam7-2020-01-15-17-45-20-1037-03.h264";
regex_t re;
//int regex_int = regcomp(&re, "cam[:digit:]", 0);
/* The active pattern is the plain literal "cam"; it contains no
 * parenthesised sub-expression. */
int regex_int = regcomp(&re, "cam", 0);
/* regcomp() returns non-zero when the pattern cannot be compiled. */
if (regex_int) {
fprintf(stderr, "regex failed to compile!");
return 1;
}
/* Two result slots are requested: rm[0] receives the offsets of the whole
 * match, rm[1] those of the first sub-expression (if the pattern has one). */
regmatch_t rm[2];
/* regexec() returns non-zero when there is no match (or on error). */
if ((regexec(&re, our_string, 2, rm,0)) ){
fprintf(stderr, "regex failed to exec!");
return 1;
}
/* temp is zero-filled, so whatever is copied in stays NUL-terminated as
 * long as it is shorter than the buffer. */
char temp[8192] = {0};
/* Copies the span described by rm[1], i.e. the sub-expression slot, not
 * the whole-match slot rm[0]. */
memcpy(temp, our_string + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
printf("We got: %s\n", temp);
puts("Bye!");
/* NOTE(review): re is never passed to regfree() before returning. */
return 0;
}
I am trying to extract camX out of our_string and need help. In its current form, the above code prints a blank result:
$ ./a.out
We got:
Bye!
C regex is not my forte, Please help!

You have a couple of problems:
//int regex_int = regcomp(&re, "cam[:digit:]", 0)
If you want to match cam followed by a digit, you need (Besides uncommenting this line, of course, and commenting out the one beneath it), to put [:digit:] inside a bracket expression:
int regex_int = regcomp(&re, "cam[[:digit:]]", 0)
The second issue:
memcpy(temp, our_string + rm[1].rm_so, rm[1].rm_eo - rm[1].rm_so);
Neither of your regular expressions has any groups, so the second element of the rm array is not going to have anything useful in it. You need to use the first element, which has the offsets of the complete match:
memcpy(temp, our_string + rm[0].rm_so, rm[0].rm_eo - rm[0].rm_so);
You also have a memory leak because you don't have a regfree(&re); to free up memory allocated for the regular expression. Not a big deal in a simple demo program like this, but in something bigger or longer running or that does the matching in a loop, it'll become an issue.

Related

Can't use regular expression with .*

I've been trying to use regular expressions (<regex.h>) in a C project I am developing.
According to regex101, the regex is well written and matches what I'm trying to match, but it doesn't work when I run it in C.
#include <stdio.h>
#include <regex.h>
/* Test program: compile one string as a POSIX regex, run it against the
 * other, and print the sum of the two return codes. */
int main() {
char pattern[] = "#include.*";
char line[] = "#include <stdio.h>";
regex_t string;
int regex_return = -1;
/* The initial -1 is overwritten here. Note that the expression being
 * compiled is `line`, not `pattern`. */
regex_return = regcomp(&string, line, 0);
/* The text being searched is `pattern`. nmatch is 0 and pmatch is NULL, so
 * no match offsets are requested. The result is added to regcomp()'s, so
 * the printed value does not show which call failed. */
regex_return += regexec(&string, pattern, 0, NULL, 0);
printf("%d", regex_return);
return 0;
}
This is a sample code I wrote to test the expression when I found out it didn't work.
It prints 1, when I expected 0.
It prints 0 if I change the line to "#include", which is just strange to me, because it's ignoring the .* at the end.
line and pattern are swapped.
regcomp takes the pattern and regexec takes the string to check, i.e. regcomp(&string, pattern, 0) followed by regexec(&string, line, 0, NULL, 0).

Unable to read memory error comes up every time I use the function below,

I tried to write a function similar to sscanf that also advances the input string pointer, so that it can be used like scanf to scan multiple strings one after the other. When I tried to use the function
there was an exception, so I ran it in the debugger, which reports that it is unable to read the input string's value.
/*
 * sscanf-like helper that also advances the caller's read pointer.
 *
 * readString   - address of the pointer to the text being scanned; moved
 *                forward when sscanf() reports at least one conversion.
 * formatString - format passed straight to sscanf().
 * writeString  - destination passed to sscanf() as its only output argument.
 *
 * Returns the value returned by sscanf().
 */
int scanStrAndMove(char **readString, char* formatString, char * writeString){
int forwardBy = 0;
/* NOTE(review): [] binds tighter than unary *, so this expression is
 * *(readString[forwardBy]) - it indexes the char** itself - rather than
 * (*readString)[forwardBy]. */
while(isspace(*readString[forwardBy])){
forwardBy++;
}
int retVal = sscanf(*readString,formatString,writeString);
/* Advance estimate: whitespace counted above + length of the scanned text
 * + length of the format minus 2.
 * NOTE(review): the -2 presumably discounts a two-character conversion
 * such as "%s" - confirm for other formats. */
forwardBy += strlen(writeString) + strlen(formatString) - 2;
/* The caller's pointer only moves when something was converted. */
if(retVal > 0) *readString += forwardBy;
return retVal;
}
a screenshot of the problem
the calling of the function(line is of type char*)
edit: I tried to replace readString with another char to and it didn't solve the problem, thanks for all the answers.
edit: I saw the comments and tried to make a minimal reproducible example, try to run this with the function above:
#include <stdio.h>
#include <stdlib.h>
/* Minimal reproducer: scan two whitespace-separated words out of inputStr
 * with scanStrAndMove() and print each one. */
int main(int argc, char *argv[])
{
char inputStr[10] = " cde fg";
char outputSrt[10];
/* NOTE(review): inputStr is an array, so &inputStr has type char (*)[10],
 * while scanStrAndMove() declares its first parameter as char **. */
scanStrAndMove(&inputStr,"%s",outputSrt);
printf("%s",outputSrt);
scanStrAndMove(&inputStr,"%s",outputSrt);
/* NOTE(review): the statement below has no terminating ';' as posted. */
printf("%s",outputSrt)
}

Segfault thrown on one line of code

I am using this library for libgps and having a few issues with it getting it to run properly.
The error from my debugger after it says segfault is:
Cannot find bounds of current function
The line of code throwing this is located in this file, on line 132.
uint8_t checksum= (uint8_t)strtol(strchr(message, '*')+1, NULL, 16);
I don't know the context of this at all, and I dont know why it would / wouldn't throw a segfault.
My code:
#include <stdio.h>
#include <stdlib.h>
#include <gps.h>
#include <math.h>
/*
*
*/
/* Calls gps_init() and gps_on(), requests one location fix, and prints its
 * latitude and longitude. */
int main(int argc, char** argv) {
// Open
gps_init();
gps_on();
loc_t data;
/* gps_location() fills `data`; its return value (if any) is not checked. */
gps_location(&data);
printf("%lf %lf\n", data.latitude, data.longitude);
return (EXIT_SUCCESS);
}
The function gps_location() takes you into gps.c and from there it runs into serial.c, once it runs:
/*
 * Read one line from uart0_filestream into buffer, one byte at a time.
 *
 * buffer - destination; receives the bytes up to (not including) the '\n',
 *          followed by a terminating '\0'.
 * len    - NOTE(review): never consulted in this function, so nothing
 *          limits how many bytes are written to buffer.
 *
 * Loops until a '\n' byte has been read.
 */
void serial_readln(char *buffer, int len)
{
char c;
char *b = buffer;
int rx_length = -1;
while(1) {
rx_length = read(uart0_filestream, (void*)(&c), 1);
if (rx_length <= 0) {
//wait for messages
/* Both "no data" (0) and a read() error (< 0) take this path and are
 * retried after one second. */
sleep(1);
} else {
if (c == '\n') {
/* End of line: terminate the string and stop reading. */
*b++ = '\0';
break;
}
*b++ = c;
}
}
}
On the break it returns to gps.c goes into:
switch (nmea_get_message_type(buffer)) {
which takes it into nmea.c for nmea_get_message_type above.
It then runs the line:
if ((checksum = nmea_valid_checksum(message)) != _EMPTY)
taking it down to: uint8_t checksum= (uint8_t)strtol(strchr(message, '*')+1, NULL, 16); which is where the error is.
What is causing this?
Edit:
uint8_t is defined as: typedef unsigned char uint8_t;
Segmentation fault is not a "thrown exception" per se, it is a hardware-issued problem ("you said go there, but I don't see anything named 'there'").
As for your problem: what happens when strchr() does not find the specified character? I suggest you try it and find out.
The code you are working with is horrible and has no error checking anywhere. So it may go haywire for any unexpected input. This could be a potential security vulnerability too.
To fix this particular instance, change the code to:
/* A NULL message cannot be searched; report it as a checksum error. */
if ( !message )
return NMEA_CHECKSUM_ERR; // possibly `exit` or something, this shouldn't happen
/* strchr() returns NULL when the message contains no '*'; dereferencing
 * NULL + 1 is what crashed the original one-liner. */
char *star = strchr(message, '*');
if ( !star )
return NMEA_CHECKSUM_ERR;
/* The hexadecimal checksum digits start right AFTER the '*'. Passing
 * `star` itself would make strtol() stop at the '*' and always yield 0. */
uint8_t checksum = (uint8_t)strtol(star + 1, NULL, 16);
The nmea_parse_gpgga and nmea_parse_gprmc also have multiple instances of a similar problem.
These functions might be acceptable if there was a parser or a regexp check that sanitizes the input before calling these functions. However, based on your question (I didn't check the codebase), it seems data is passed directly from read which is inexcusable.
The segfaulting function was not designed to handle an empty message or in fact any message not matching the expected form.
Another disastrous blunder is that the serial_readln function never checks that it does not write beyond len.

pattern matching / extracting in c using regex.h

I need help extracting a substring from a string using regex.h in C.
In this example, I am trying to extract all occurrences of character 'e' from a string 'telephone'. Unfortunately, I get stuck identifying the offsets of those characters. I am listing code below:
#include <stdio.h>
#include <regex.h>
/* Compile the extended regex "(e)", run a single regexec() call, and report
 * whether it matched. */
int main(void) {
const int size=10;
regex_t regex;
/* Room for up to `size` results from one regexec() call. */
regmatch_t matchStruct[size];
char pattern[] = "(e)";
char str[] = "telephone";
int failure = regcomp(&regex, pattern, REG_EXTENDED);
/* A compile failure is only reported; execution continues regardless. */
if (failure) {
printf("Cannot compile");
}
/* NOTE(review): the text searched here is `pattern`, not `str`; `str` is
 * never used in this program. */
int matchFailure = regexec(&regex, pattern, size, matchStruct, 0);
/* regexec() returns 0 on a match. */
if (!matchFailure) {
printf("\nMatch!!");
} else {
printf("NO Match!!");
}
return 0;
}
So per GNU's manual, I should get all of the occurrences of 'e' when a character is parenthesized. However, I always get only the first occurrence.
Essentially, I want to be able to see something like:
matchStruct[1].rm_so = 1;
matchStruct[1].rm_eo = 2;
matchStruct[2].rm_so = 4;
matchStruct[2].rm_eo = 5;
matchStruct[3].rm_so = 7;
matchStruct[3].rm_eo = 8;
or something along these lines. Any advice?
Please note that you are in fact not matching your compiled regex against str ("telephone") but rather against your plain-text pattern. Check the second argument you pass to regexec. With that fixed, proceed for instance to "regex in C language using functions regcomp and regexec toggles between first and second match", where the answer to your question is already given.

C libpcap resolve DLT entries, some nasty bug

while sort of writing my own sniffer, I found one example that only starts if it is talking ethernet. Other DLT_types have been ignored. They can be found in pcap-bpf.h I wrote some
lines, that try to implement a missing pcap_resolve_dlt(). It's really nasty code(1), seems to work, though I hit a nasty bug, where one needs to give a space to the corresponding number like:
user@debian:~/tmp$ ./resolve_dlt 114
DLT_LTALK 114
user@debian:~/tmp$ ./resolve_dlt 14
DLT_ATM_RFC1483 11
user@debian:~/tmp$ ./resolve_dlt " 14"
DLT_RAW 14
Maybe the approach itself is totally wrong and one should grep the pcap-bpf.h directly.
1) http://nopaste.info/4a2470cc83.html, uses strstr()
Kind Regards,
Charles
Tags: C libpcap DLT_
You are doing strstr(dlt[i],argv[1]) so the first "14" matches the text in "DLT_ATM_RFC1483", however the text " 14" matches the text in "DLT_RAW 14".
You could use the preprocessor's stringizing operator (#) to make this work a little better:
#include <stdlib.h>
#include <stdio.h>
#include <pcap-bpf.h>
/*
 * TAB_ENTRY(DLT_FOO) expands to { DLT_FOO, "DLT_FOO" }: the macro's numeric
 * value paired with its own name, produced by the stringizing operator (#).
 */
#define TAB_ENTRY(x) { x, #x }

/*
 * Lookup table mapping DLT_ link-layer type codes (from <pcap-bpf.h>) to
 * their symbolic names. Each name is listed once; the table is read-only
 * and private to this file.
 */
static const struct {
    long dlt_code;          /* numeric value of the DLT_ macro */
    const char *dlt_name;   /* the macro's name as a string */
} dlt_tab[] = {
    TAB_ENTRY(DLT_NULL),
    TAB_ENTRY(DLT_EN10MB),
    TAB_ENTRY(DLT_EN3MB),
    TAB_ENTRY(DLT_AX25),
    TAB_ENTRY(DLT_PRONET),
    TAB_ENTRY(DLT_CHAOS),
    TAB_ENTRY(DLT_IEEE802),
    TAB_ENTRY(DLT_ARCNET),
    TAB_ENTRY(DLT_SLIP),
    TAB_ENTRY(DLT_PPP),
    TAB_ENTRY(DLT_FDDI),
    TAB_ENTRY(DLT_ATM_RFC1483),
    TAB_ENTRY(DLT_RAW),
    TAB_ENTRY(DLT_SLIP_BSDOS),
    TAB_ENTRY(DLT_PPP_BSDOS),
    TAB_ENTRY(DLT_ATM_CLIP),
    TAB_ENTRY(DLT_REDBACK_SMARTEDGE),
    TAB_ENTRY(DLT_PPP_SERIAL),
    TAB_ENTRY(DLT_PPP_ETHER),
    TAB_ENTRY(DLT_SYMANTEC_FIREWALL),
    TAB_ENTRY(DLT_C_HDLC),
    TAB_ENTRY(DLT_IEEE802_11),
    TAB_ENTRY(DLT_FRELAY),
    TAB_ENTRY(DLT_LOOP),
    TAB_ENTRY(DLT_ENC),
    TAB_ENTRY(DLT_LINUX_SLL),
    TAB_ENTRY(DLT_LTALK),
    TAB_ENTRY(DLT_ECONET),
    TAB_ENTRY(DLT_IPFILTER),
    TAB_ENTRY(DLT_PFLOG),
    TAB_ENTRY(DLT_CISCO_IOS),
    TAB_ENTRY(DLT_PRISM_HEADER),
    TAB_ENTRY(DLT_AIRONET_HEADER),
    TAB_ENTRY(DLT_HHDLC),
    TAB_ENTRY(DLT_IP_OVER_FC),
    TAB_ENTRY(DLT_SUNATM),
    TAB_ENTRY(DLT_RIO),
    TAB_ENTRY(DLT_PCI_EXP),
    TAB_ENTRY(DLT_AURORA),
    TAB_ENTRY(DLT_IEEE802_11_RADIO),
    TAB_ENTRY(DLT_TZSP),
    TAB_ENTRY(DLT_ARCNET_LINUX),
    TAB_ENTRY(DLT_JUNIPER_MLPPP),
    TAB_ENTRY(DLT_JUNIPER_MLFR),
    TAB_ENTRY(DLT_JUNIPER_ES),
    TAB_ENTRY(DLT_JUNIPER_GGSN),
    TAB_ENTRY(DLT_JUNIPER_MFR),
    TAB_ENTRY(DLT_JUNIPER_ATM2),
    TAB_ENTRY(DLT_JUNIPER_SERVICES),
    TAB_ENTRY(DLT_JUNIPER_ATM1),
    TAB_ENTRY(DLT_APPLE_IP_OVER_IEEE1394),
    TAB_ENTRY(DLT_MTP2_WITH_PHDR),
    TAB_ENTRY(DLT_MTP2),
    TAB_ENTRY(DLT_MTP3),
    TAB_ENTRY(DLT_SCCP),
    TAB_ENTRY(DLT_DOCSIS),
    TAB_ENTRY(DLT_LINUX_IRDA),
    TAB_ENTRY(DLT_IBM_SP),
    TAB_ENTRY(DLT_IBM_SN),
    TAB_ENTRY(DLT_USER0),
    TAB_ENTRY(DLT_USER1),
    TAB_ENTRY(DLT_USER2),
    TAB_ENTRY(DLT_USER3),
    TAB_ENTRY(DLT_USER4),
    TAB_ENTRY(DLT_USER5),
    TAB_ENTRY(DLT_USER6),
    TAB_ENTRY(DLT_USER7),
    TAB_ENTRY(DLT_USER8),
    TAB_ENTRY(DLT_USER9),
    TAB_ENTRY(DLT_USER10),
    TAB_ENTRY(DLT_USER11),
    TAB_ENTRY(DLT_USER12),
    TAB_ENTRY(DLT_USER13),
    TAB_ENTRY(DLT_USER14),
    TAB_ENTRY(DLT_USER15),
    TAB_ENTRY(DLT_IEEE802_11_RADIO_AVS),
    TAB_ENTRY(DLT_JUNIPER_MONITOR),
    TAB_ENTRY(DLT_BACNET_MS_TP),
    TAB_ENTRY(DLT_PPP_PPPD),
    TAB_ENTRY(DLT_JUNIPER_PPPOE),
    TAB_ENTRY(DLT_JUNIPER_PPPOE_ATM),
    TAB_ENTRY(DLT_GPRS_LLC),
    TAB_ENTRY(DLT_GPF_T),
    TAB_ENTRY(DLT_GPF_F),
    TAB_ENTRY(DLT_GCOM_T1E1),
    TAB_ENTRY(DLT_GCOM_SERIAL),
    TAB_ENTRY(DLT_JUNIPER_PIC_PEER),
    TAB_ENTRY(DLT_ERF_ETH),
    TAB_ENTRY(DLT_ERF_POS),
    TAB_ENTRY(DLT_LINUX_LAPD),
    TAB_ENTRY(DLT_JUNIPER_ETHER),
    TAB_ENTRY(DLT_JUNIPER_PPP),
    TAB_ENTRY(DLT_JUNIPER_FRELAY),
    TAB_ENTRY(DLT_JUNIPER_CHDLC),
    TAB_ENTRY(DLT_MFR),
    TAB_ENTRY(DLT_JUNIPER_VP),
    TAB_ENTRY(DLT_A429),
    TAB_ENTRY(DLT_A653_ICM),
    TAB_ENTRY(DLT_USB),
    TAB_ENTRY(DLT_BLUETOOTH_HCI_H4),
    TAB_ENTRY(DLT_IEEE802_16_MAC_CPS),
    TAB_ENTRY(DLT_USB_LINUX),
    TAB_ENTRY(DLT_CAN20B),
    TAB_ENTRY(DLT_IEEE802_15_4_LINUX),
    TAB_ENTRY(DLT_PPI),
    TAB_ENTRY(DLT_IEEE802_16_MAC_CPS_RADIO),
    TAB_ENTRY(DLT_JUNIPER_ISM),
    TAB_ENTRY(DLT_IEEE802_15_4),
    TAB_ENTRY(DLT_SITA),
    TAB_ENTRY(DLT_ERF),
    TAB_ENTRY(DLT_RAIF1),
    TAB_ENTRY(DLT_IPMB),
    TAB_ENTRY(DLT_JUNIPER_ST),
    TAB_ENTRY(DLT_BLUETOOTH_HCI_H4_WITH_PHDR)
};
/*
 * Print the DLT_ name that corresponds to the numeric code in argv[1].
 *
 * The argument is parsed with strtol() using base 0, so decimal, octal
 * (leading 0) and hexadecimal (leading 0x) spellings are accepted.
 *
 * Exit status: 0 when the code was found, 1 when it is not in dlt_tab,
 * 2 when the argument is missing or is not a complete number.
 */
int main(int argc, char *argv[])
{
    const size_t count = sizeof dlt_tab / sizeof dlt_tab[0];
    char *endptr = NULL;
    long code = 0;
    size_t i;

    if (argc > 1) {
        code = strtol(argv[1], &endptr, 0);
    }

    /*
     * endptr is still NULL when no argument was given, equals argv[1] when
     * no digits were consumed, and points at a non-NUL character when the
     * number is followed by garbage (e.g. "12abc").
     */
    if (!endptr || endptr == argv[1] || *endptr != '\0') {
        fprintf(stderr, "Usage: %s <dlt_code>\n", argv[0]);
        exit(2);
    }

    /* Linear search; the first row whose code matches wins. */
    for (i = 0; i < count; i++) {
        if (dlt_tab[i].dlt_code == code) {
            printf("%ld is %s\n", code, dlt_tab[i].dlt_name);
            return 0;
        }
    }

    printf("%ld not found\n", code);
    return 1;
}
Example:
$ ./bpf 12
12 is DLT_RAW
$ ./bpf 120
120 is DLT_AIRONET_HEADER
(Note that 12 is DLT_RAW on Linux systems, not 14).
At least with newer versions of libpcap/WinPcap, you can use pcap_datalink_val_to_name to map a DLT_ value to the DLT_ name. resolve_dlt could just use strtol() on its first argument and pass the result to pcap_datalink_val_to_name (after, of course, checking for errors, and for values that don't fit in an int).

Resources