C regex performance - c

I'm writing a code in C to perform some regex in enwik8 and enwik9. I'm also creating the same algorithm in other languages for benchmark purposes. The issue is that I'm doing something wrong with my C code because it takes 40 seconds while python and others take just 10 seconds.
What am I forgetting?
#include <stdio.h>
#include <regex.h>
#define size 1024
int main(int argc, char **argv){
FILE *fp;
char line[size];
regex_t re;
int x;
const char *filename = "enwik8";
const char *strings[] = {"\bhome\b", "\bdear\b", "\bhouse\b", "\bdog\b", "\bcat\b", "\bblue\b", "\bred\b", "\bgreen\b", "\bbox\b", "\bwoman\b", "\bman\b", "\bwomen\b", "\bfull\b", "\bempty\b", "\bleft\b", "\bright\b", "\btop\b", "\bhelp\b", "\bneed\b", "\bwrite\b", "\bread\b", "\btalk\b", "\bgo\b", "\bstay\b", "\bupper\b", "\blower\b", "\bI\b", "\byou\b", "\bhe\b", "\bshe\b", "\bwe\b", "\bthey\b"};
for(x = 0; x < 33; x++){
if(regcomp(&re, strings[x], REG_EXTENDED) != 0){
printf("Failed to compile regex '%s'\n", strings[x]);
return -1;
}
fp = fopen(filename, "r");
if(fp == 0){
printf("Failed to open file %s\n", filename);
return -1;
}
while((fgets(line, size, fp)) != NULL){
regexec(&re, line, 0, NULL, 0);
}
}
return 0;
}

file access and compiling regexes is probably a culprit.
compile your regexs once and store them in an array
open the file
read a line
run each compiled regex over it
close the file.

Related

Converting a binary to char [C]

a lot of times ago, i found an extract of C code, that could read the content of an executable file and that can store it as an array of char, in another file (ex: output.txt). It should work, but when i tried it, it corrupts the output, and it can't copy exactly the content of the exe as a char without damaging it. I don't know where could be the problem.
This is my extract of code in C
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
int main(int argc, char *argv[])
{
if(argc != 3)
{
fprintf(stderr, "Usage >NameProgram firstParam Executable.exe\n");
return -1;
}
FILE *output = fopen("output.txt", "a");
[..]
char* input_file = argv[2]; //the name of the exe
FILE* f_input = fopen(input_file, "rb");
fprintf(output,"char byn[] = {\n");
unsigned long n = 0;
while(!feof(f_input))
{
unsigned char c;
if(fread(&c, 1, 1, f) == 0)
break;
fprintf(output,"0x%.2X,", (int)c);
++n;
if(n % 10 == 0)
fprintf(output,"\n");
}
fclose(f_input);
fclose(output);
//truncating file
FILE *output = fopen("output.txt", "r+");
fseek(output, -1, SEEK_END);
fprintf(output,"};\n\n");
fclose(output);
[..]
using your posted code as a guide, I produced the following:
#include <stdio.h>
#include <stdlib.h>
int main(int argc, char *argv[])
{
if(argc != 3)
{
fprintf( stderr, "USAGE: %s outFilename inFileName\n", argv[0] );
exit( EXIT_FAILURE );
}
FILE *fp_out = fopen( argv[1], "a");
FILE* fp_in = fopen( argv[2], "rb");
fprintf(fp_out, "char byn[] = {\n" );
unsigned long n = 0;
unsigned char c;
while(fread(&c, 1, 1, fp_in) == 1)
{
fprintf( fp_out,"0x%.2X,", (int)c);
++n;
if(n % 10 == 0)
fprintf( fp_out,"\n");
}
fprintf( fp_out, "};\n\n");
fclose(fp_in);
fclose(fp_out);
} // end function: main
which I ran on a few executable files
Note: I'm running Ubuntu Linux 14.04
It seems to work correctly.
As with your example, I skipped the error checking (which you should really include in your actual code.)

How to unpack a msgpack file?

I am writing msgpack-encoded data to a file. On writing, I am just using the fbuffer of the C API. As in (I striped all error handling for the example):
FILE *fp = fopen(filename, "ab");
msgpack_packer pk;
msgpack_packer_init(pk, fp, msgpack_fbuffer_write);
msgpack_pack_int(pk, 42);
// more data ...
How do I read this file back in? All the example I found assume that the data is in memory, however, my files are up to 5GB, it is not exactly a good idea to hold this in memory completely. Also I do not want to read in chunks myself. After all, I do not know how long the msgpack objects are, so chances are I end up with half an integer in my buffer.
Can msgpack's unpack somehow read from disk directly? Or is there some standard pattern to do this?
You might consider using "msgpack_unpacker" for that instead, which seems to be the official way that MessagePack implements a 'streaming' deserializer. Have a look at msgpack-c/example/c/lib_buffer_unpack.c
Regards, NiteHawk
Okay, I managed to do it.
Here is how to write:
#include <stdlib.h>
#include <msgpack.h>
#include <msgpack/fbuffer.h>
int main(int argc, char **argv) {
if(2 != argc) {
fprintf(stderr, "Call all writeFile <file>");
return;
}
FILE *fp = fopen(argv[1], "ab");
msgpack_packer pk;
msgpack_packer_init(&pk, fp, msgpack_fbuffer_write);
for(int i=0;i<2048;i++) {
msgpack_pack_int(&pk, i);
}
fclose(fp);
}
And this is what the read looks like:
#include <stdlib.h>
#include <msgpack.h>
static const int BUFFERSIZE = 2048;
int main(int argc, char **argv) {
if(2 != argc) {
fprintf(stderr, "Call with readFile <file>");
return 1;
}
char *inbuffer = (char *) malloc(BUFFERSIZE);
if(NULL == inbuffer) {
fprintf(stderr, "Out of memory!");
return 1;
}
FILE *fp = fopen(argv[1], "rb");
size_t off = 0;
size_t read = 0;
msgpack_unpacked unpacked;
msgpack_unpacked_init(&unpacked);
do {
read = fread(inbuffer, sizeof(char), BUFFERSIZE - off, fp);
off = 0;
while(msgpack_unpack_next(&unpacked, inbuffer, read, &off)) {
msgpack_object_print(stdout, unpacked.data);
puts("");
}
memcpy(inbuffer, &(inbuffer[off]), read-off);
off = read - off;
} while(read != 0);
free(inbuffer);
fclose(fp);
msgpack_unpacked_destroy(&unpacked);
return 0;
}
I did not try, but I think it will work with larger objects (arrays, maps etc.) as well.

How to redirect more than one text file in c programm

How to redirect more than one text file in c program? For example I have the following C code:
//redirection.c
#include<stdio.h>
main()
{
int x,y;
scanf("%d",&x);
x=x*x;
printf("%d",x);
scanf("%d",&y);
y=x+y;
printf("%d",y);
}
After compiling this code I created two text files text1.txt having the value 8 and text2.txt having the value 6.
When I give input to this program using command line redirection (as redirection<text1.txt), it gives output 64 and does not wait to take another input (and program exits) which I want to give another input from text2.txt.
Is there any solution how can I send another input via text2.txt for second scanf function in the above program?
While giving the input as redirection as like this.
cat a b | ./a.out.
Or else you can use the command line arguments.
#include<stdio.h>
main(int argc, char *argv[])
{
FILE *fp, *fp1;
if ( (fp=fopen(argv[1],"r")) == NULL ){
printf("file cannot be opened\n");
return 1;
}
if (( fp1=fopen(argv[2],"r")) == NULL ){
printf("file cannot be opened\n");
return 1;
}
int x,y;
fscanf(fp,"%d",&x);// If you having only the value in that file
x=x*x;
printf("%d\n",x);
fscanf(fp1,"%d",&y);// If you having only the value in that file
y=x+y;
printf("%d\n",y);
}
you can also use command line arguments:
#include <stdio.h>
#define BUFSIZE 1000
int main(int argc, char *argv[])
{
FILE *fp1 = NULL, *fp2 = NULL;
char buff1[BUFSIZE], buff2[BUFSIZE];
fp1 = fopen(argv[1], "r");
while (fgets(buff1, BUFSIZE - 1, fp1) != NULL)
{
printf("%s\n", buff1);
}
fclose(fp1);
fp2 = fopen(argv[2], "r");
while (fgets(buff2, BUFSIZE - 1, fp2) != NULL)
{
printf("%s\n", buff2);
}
fclose(fp2);
}
here is a more cleaned up version:
#include <stdio.h>
#define BUFSIZE 1000
void print_content(char *file);
int main(int argc, char *argv[])
{
print_content(argv[1]);
print_content(argv[2]);
}
void print_content(char *file){
char buff[BUFSIZE];
FILE *fp = fopen(file, "r");
while (fgets(buff, sizeof(buff), fp) != NULL)
{
printf("%s\n", buff);
}
fclose(fp);
}

Unable to read a file and pass into arguments

1) I'm trying to open a file, read the mix data (ints, chars and strings) and store them into args.
1.1) so in the sample.txt is a total of 13 (excluding args[0])
2) Need to read a file from terminal "./myprog.c < sample.txt"
Heres my code and have no idea where i went wrong:
sample.txt:
123 213 110 90 1
hello my friend
boo bleh
a b c
myprog.c:
#include <stdio.h>
int main()
{
int i = 1;
FILE *fstin=fopen(argv[0], "r"); //open the file
if (fstin == NULL) {
puts("Couldn't fopen...");
return -1;
}
//Getting all the inputs from file
while ((fscanf(fstin, "%d", argv[i])) != EOF){
i++;
}
fclose(fstin);
for (i=0; i<10; i++) {
printf("%d\n",argv[i]);
}
return 0;
}
Any help is greatly appreciated!
PS: Would like if anyone could post their complete solution? Will upload unto this post and let everyone have a review of this problem
PPS: Please excuse the poor level of coding as I am a beginner and completely new to C.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int ac, char *av[]){
int i, argc=0;
char **argv=NULL, data[16];
FILE *fstin = stdin;
if(ac == 2){
if(NULL==(fstin = fopen(av[1], "r"))){
puts("Couldn't fopen...");
return -1;
}
}
while (1==fscanf(fstin, "%15s", data)){
argv = realloc(argv, (argc+1)*sizeof(char*));
argv[argc] = malloc(strlen(data)+1);
strcpy(argv[argc++], data);
}
if(ac == 2)
fclose(fstin);
for (i=0; i<argc; ++i) {
printf("%s\n", argv[i]);
}
//deallocate
return 0;
}
You are making mistake at 2nd point where you divert your file to other file which is wrong. Actually you need to first compile and need to make executable.
gcc -o my_prog ./myprog.c -Wall
You need to execute this program as below to read file from c program:
./my_prog ./sample.txt
As you are new to C programming first go to man pages related to file operations.
Solution:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
int main(int argc, char *argv[]) {
//If command line argument is not inserted then stop operation
if (2 != argc) {
printf("Invalid number of arguments : %d\n", argc);
return -1;
}
int size = 0, ret = 0;
char *data = NULL;
FILE *fp = NULL;
//Open file in read mode given from command line argument
if (NULL != (fp = fopen(argv[1], "r")))
{
//Find size of file
fseek(fp, 0L, SEEK_END);
size = ftell(fp);
fseek(fp, 0L, SEEK_SET);
//if file is empty no need to read it.
if (size > 0)
{
//Data pointer which contains file information
data = (char *) calloc(sizeof(char), size);
if (NULL != data)
{
//Read whole file in one statement
fread(data, sizeof(char), size, fp);
printf("File %s is readed successfully\n", argv[1]);
printf("Data:\n");
printf("%s\n", data);
free(data); data = NULL;
}
else
{
perror("memory allocation failed\n");
ret = -1;
}
}
else
{
printf("File %s is empty\n", argv[1]);
}
fclose(fp); fp = NULL;
}
else
{
perror("File open failed\n");
ret = -1;
}
return ret;
}
Now Test it on your setup and if any query please post comments.

Why wont the program read from the 2 argument file?

So the assignment is to implement a substring search program using an input file to be searched from and an input to be searched. I created the following code:
#include <stdio.h>
#include <string.h>
int main(int argc,char *argv[])
{
FILE *fp;
fp = fopen(argv[1],"r");
if (fp == NULL)
{
printf("Error");
return 0;
}
char* tmpp[100];
int count = 0;
char* nexts = argv[2];
char* tmp = fgets(tmpp,100,fp);
while(tmp = strstr(tmp,nexts))
{
count++;
tmp++;
}
printf("%d\n\n",count);
fclose(fp);
return 0;
}
The program compiles but when i go to implement it in the ubuntu terminal as:
echo "aabb" >beta
./a.out beta a
1
Why isnt the program using the first argument (argv[1]) as beta and the second argument (argv[2]) as a correctly?
You should open a file and then read bytes from that file into temporary buffer:
FILE *file = fopen("file", "r");
while (1) {
char buffer[BUFSIZ+1];
size_t nread = fread(buffer, 1, sizeof(buffer)-1, file);
if (nread == 0) break; // read error or EOF
buffer[nread] = 0;
// chunk with BUFSIZ amount of bytes is available via buffer (and is zero-terminated)
}
If you want to search for string/pattern in a file, be aware that looked pattern in file may cross your chunk-size boundary, for example: you look for "hello", and BUFSIZ is 512. File contains "hello" at byte 510. Obviously, if you read by 512, you will get the first chunk ending with "he", and the second chunk starting with "llo". Probability of this situation is nonzero for all chunk sizes (except SIZE_MAX, but that buffer size is impossible by other reasons). Dealing with borders may be very complicated.
Close...but this is closer:
#include <stdio.h>
#include <string.h>
int main(int argc, char *argv[])
{
if (argc != 3)
{
fprintf(stderr, "Usage: %s file pattern\n", argv[0]);
return 1;
}
FILE *fp = fopen(argv[1], "r");
if (fp == NULL)
{
fprintf(stderr, "Error: failed to open file %s for reading\n", argv[1]);
return 1;
}
char tmpp[1000];
int count = 0;
char* nexts = argv[2];
while (fgets(tmpp, sizeof(tmpp), fp) != 0)
{
char *tmp = tmpp;
while ((tmp = strstr(tmp, nexts)) != 0)
{
count++;
tmp++;
}
}
printf("%d\n", count);
fclose(fp);
return 0;
}
The main difference is that this loops reading multiple lines from the input file. Yours would only work on files with a single line of input.

Resources