C: searching for a string in a file - c

If I have:
const char *mystr = "cheesecakes";
FILE *myfile = fopen("path/to/file.exe","r");
I need to write a function to determine whether myfile contains any occurrences of mystr. Could anyone help me? Thanks!
UPDATE: So it turns out the platform I need to deploy to doesn't have memstr. Does anyone know of a free implementation I can use in my code?

If you can't fit the whole file into memory, and you have access to the GNU memmem() extension, then:
Read as much as you can into a buffer;
Search the buffer with memmem(buffer, len, mystr, strlen(mystr) + 1);
Discard all but the last strlen(mystr) characters of the buffer, and move those to the start;
Repeat until end of file reached.
If you don't have memmem, then you can implement it in plain C using memchr and memcmp, like so:
/*
 * The memmem() function finds the start of the first occurrence of the
 * byte sequence 'needle' of length 'nlen' in the memory area 'haystack' of
 * length 'hlen'. Both areas are treated as raw bytes, so embedded NUL
 * bytes are matched like any other value.
 *
 * The return value is a pointer to the beginning of the match, or
 * NULL if the sequence is not found. An empty needle (nlen == 0) is
 * reported as "not found" by this implementation.
 */
void *memmem(const void *haystack, size_t hlen, const void *needle, size_t nlen)
{
    /* Byte pointers: arithmetic on void pointers is not valid ISO C. */
    const unsigned char *hay = haystack;
    const unsigned char *pat = needle;
    const unsigned char *p = hay;
    size_t plen = hlen;

    if (!nlen)
        return NULL;

    /*
     * Only positions that still leave room for the whole needle are worth
     * scanning, hence the "plen - nlen + 1" window handed to memchr().
     */
    while (plen >= nlen && (p = memchr(p, pat[0], plen - nlen + 1)) != NULL) {
        if (!memcmp(p, pat, nlen))
            return (void *)p;
        /* First byte matched but the rest did not: resume one byte later. */
        p++;
        plen = hlen - (size_t)(p - hay);
    }
    return NULL;
}

Because standard C has no memmem or memstr to find a string in a binary array (others suggested reading the file into memory and using strstr - no, this doesn't work, because strstr stops at the first null byte), you have to read it byte by byte with "fgetc" and write a small state machine to match it while reading.

See here:
http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
for a Boyer-Moore implementation in C99. This is a very common string searching algorithm and runs in O(n).

Here is a slapped together version of it. It has no error checking and probably has overflow bugs. But I think it finds the desired string and accounts for the backtracking necessary for partial substring matches. I doubt there are more than 15 bugs left.
Edit: There was at least one in the first answer. I woke up in the middle of the night and realized the backtracking check was wrong. It didn't find '12123' in '1212123'. It might still be wrong, but at least it finds that one now.
/*
 * Usage: prog <file> <string>
 *
 * Streams <file> one byte at a time and reports whether <string> occurs
 * anywhere in it. The last strlen(<string>) bytes read are kept in a
 * sliding window that is compared against the search string after every
 * byte, so overlapping partial matches (e.g. "12123" in "1212123") need no
 * separate backtracking logic.
 *
 * Prints "Found it" and returns 1 on a match, prints "Not found" and
 * returns 0 otherwise. Returns 2 on a usage, open or allocation error.
 */
int main( int argc, char* argv[] )
{
    FILE *fp;
    char *find, *hist;
    size_t len, filled = 0;
    int c;              /* int, not char: must be able to represent EOF */
    int found = 0;

    if ( argc < 3 ) {
        fprintf( stderr, "usage: %s file string\n", argc > 0 ? argv[0] : "search" );
        return 2;
    }
    find = argv[2];
    len = strlen( find );
    if ( len == 0 ) {
        /* the empty string occurs in every file */
        printf( "Found it\n" );
        return 1;
    }
    /* binary mode: the file may be an executable, not text */
    fp = fopen( argv[1], "rb" );
    if ( fp == NULL ) {
        perror( argv[1] );
        return 2;
    }
    hist = malloc( len );
    if ( hist == NULL ) {
        fclose( fp );
        return 2;
    }
    /* test the fgetc() result itself; feof() only turns true after a failed read */
    while ( ( c = fgetc( fp ) ) != EOF ) {
        if ( filled < len ) {
            hist[filled++] = (char)c;
        }
        else {
            /* slide the window by one byte - inefficient, a circular buffer would be better */
            memmove( hist, hist + 1, len - 1 );
            hist[len - 1] = (char)c;
        }
        if ( filled == len && 0 == memcmp( hist, find, len ) ) {
            found = 1;
            break;
        }
    }
    free( hist );
    fclose( fp );
    if ( found ) {
        printf( "Found it\n" );
        return 1;
    }
    printf( "Not found\n" );
    return 0;
}

Here's a function that will search for a string in a buffer.
Limitations: it doesn't handle wide characters (in the case of internationalization). You'll have to write your own code to read the file into memory. It won't find the pattern if the pattern is split between 2 read buffers.
/*****************************************************
findPattern - search a memory buffer for a NUL-terminated pattern.

const char *buffer   pointer to your read buffer (the larger, the better).
                     It is treated as raw bytes and need not be
                     NUL-terminated.
size_t bufSize       the number of valid bytes in buffer
const char *pattern  NUL-terminated pattern you are looking for.

Returns the index into the buffer of the first occurrence of pattern,
or -1 if pattern is not found (including when pattern is longer than
the buffer). An empty pattern matches at index 0.
The result is an int, so buffers larger than INT_MAX bytes are not
supported.

Sample:
pos = findPattern (buffer, BUF_SIZE, "cheesecakes");
*****************************************************/
int findPattern (const char *buffer, size_t bufSize, const char *pattern)
{
    size_t patternLen = strlen (pattern);
    size_t lastStart;
    size_t i;

    // A pattern longer than the buffer cannot match; checking first also
    // keeps the unsigned subtraction below from wrapping around.
    if (patternLen > bufSize)
    {
        return -1;
    }
    // Last index at which a full pattern still fits (inclusive), so a
    // match sitting at the very end of the buffer is found too.
    lastStart = bufSize - patternLen;
    for (i = 0; i <= lastStart; ++i)
    {
        if (memcmp (buffer + i, pattern, patternLen) == 0)
        {
            return (int) i;
        }
    }
    return -1;
}

chat match = "findthis";
int depth = 0;
while(not eof)
{
char ch = getonebyte();
if(ch == match[depth])
{
if (depth == strlen(match))
break;
else
depth++;
}
else
depth = 0;
}
roughly (I am sure there are off by ones in there)

Related

Subtlety in strstr?

I have a file of binary data with various character strings sprinkled throughout. I am trying to write a C code to find the first occurrence of user-specified strings in the file. (I know this can be done with bash but I need a C code for other reasons.) The code as it stands is:
#include <stdio.h>
#include <string.h>
#define CHUNK_SIZE 512
// Searches the file named by argv[1] for the first occurrence of the string
// argv[2], reading CHUNK_SIZE bytes at a time, and prints the running
// location counter when the loop ends.
// NOTE(review): argc is never checked, and the results of malloc(), fopen(),
// fread() and fseek() are all used unchecked.
int main(int argc, char **argv) {
char *fname = argv[1];
char *tag = argv[2];
FILE *infile;
char *chunk;
char *taglcn = NULL;
long lcn_in_file = 0;
int back_step;
fpos_t pos;
// allocate chunk (one extra byte so the buffer always ends in a NUL)
chunk = (char*)malloc((CHUNK_SIZE + 1) * sizeof(char));
// find back_step: how far to rewind so a tag split across two chunks is seen whole
back_step = strlen(tag) - 1;
// open file
// NOTE(review): opened in text mode ("r") although the data is described as binary.
infile = fopen(fname, "r");
// loop
// NOTE(review): the only exit condition is "tag found"; nothing here tests for end of file.
while (taglcn == NULL) {
// read chunk (buffer is zeroed first, so whatever is read is NUL-terminated)
memset(chunk, 0, (CHUNK_SIZE + 1) * sizeof(char));
fread(chunk, sizeof(char), CHUNK_SIZE, infile);
printf("Read %c\n", chunk[0]);
// look for tag
// NOTE(review): strstr() treats chunk as a C string, so it stops at the first
// NUL byte in the data and never examines the bytes after it.
taglcn = strstr(chunk, tag);
if (taglcn != NULL) {
// if you find tag, add to location the offset in bytes from beginning of chunk
lcn_in_file += (long)(taglcn - chunk);
printf("HEY I FOUND IT!\n");
} else {
// if you don't find tag, add chunk size minus back_step to location and ...
lcn_in_file += ((CHUNK_SIZE - back_step) * sizeof(char));
// back file pointer up by back_step for next read
fseek(infile, -back_step, SEEK_CUR);
fgetpos(infile, &pos);
// NOTE(review): pos is an fpos_t but is printed with %ld - confirm this is valid on the target platform.
printf("%ld\n", pos);
printf("%s\n\n\n", chunk);
}
}
printf("%ld\n", lcn_in_file);
fclose(infile);
free(chunk);
}
If you're wondering, back_step is put in to take care of the unlikely eventuality that the string in question is split by a chunk boundary.
The file I am trying to examine is about 1Gb in size. The problem is that for some reason I can find any string within the first 9000 or so bytes, but beyond that, strstr is somehow not detecting any string. That is, if I look for a string located beyond 9000 or so bytes into the file, strstr does not detect it. The code reads through the entire file and never finds the search string.
I have tried varying CHUNK_SIZE from 128 to 50000, with no change in results. I have tried varying back_step as well. I have even put in diagnostic code to print out chunk character by character when strstr fails to find the string, and sure enough, the string is exactly where it is supposed to be. The diagnostic output of pos is always correct.
Can anyone tell me where I am going wrong? Is strstr the wrong tool to use here?
Since you say your file is binary, strstr() will stop scanning at the first null byte in the file.
If you wish to look for patterns in binary data, then the memmem() function is appropriate, if it is available. It is available on Linux and some other platforms (BSD, macOS, …) but it is not defined as part of standard C or POSIX. It bears roughly the same relation to strstr() that memcpy() bears to strcpy().
Note that your code should detect the number of bytes read by fread() and only search on that.
// Sketch: read the file chunk by chunk and search only the bytes that
// fread() actually delivered.
char *tag = …; // Identify the data to be searched for
size_t taglen = …; // Identify the data's length (maybe strlen(tag))
size_t nbytes; // fread() returns a size_t item count, never a negative value
while ((nbytes = fread(chunk, 1, (CHUNK_SIZE + 1), infile)) > 0)
{
…
// memmem() takes explicit lengths, so NUL bytes inside the chunk do not end the search
taglcn = memmem(chunk, nbytes, tag, taglen);
if (taglcn != NULL)
…found it…
…
}
It isn't really clear why you have the +1 on the chunk size. The fread() function doesn't add null bytes at the end of the data or anything like that. I've left that aspect unchanged, but would probably not use it in my own code.
It is good that you take care of identifying a tag that spans the boundaries between two chunks.
The most likely reason for strstr to fail in your code is the presence of null bytes in the file. Furthermore, you should open the file in binary mode for the file offsets to be meaningful.
To scan for a sequence of bytes in a block, use the memmem() function. If it is not available on your system, here is a simple implementation:
#include <string.h>
/*
 * Locate the first occurrence of the n2-byte sequence `needle` inside the
 * n1-byte area `haystack`. Returns a pointer to the match, `haystack`
 * itself when the needle is empty, or NULL when there is no match.
 */
void *memmem(const void *haystack, size_t n1, const void *needle, size_t n2) {
    const unsigned char *hay = haystack;
    const unsigned char *pat = needle;

    if (n2 == 0)
        return (void *)hay;
    if (n1 < n2)
        return NULL;

    /* One past the last position where a full needle still fits. */
    const unsigned char *limit = hay + n1 - n2 + 1;
    const unsigned char *cur = hay;

    while (cur < limit) {
        /* Jump to the next candidate: a byte equal to the needle's first byte. */
        cur = memchr(cur, pat[0], (size_t)(limit - cur));
        if (cur == NULL)
            break;
        if (memcmp(cur, pat, n2) == 0)
            return (void *)cur;
        cur++;
    }
    return NULL;
}
You would modify your program this way:
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void *memmem(const void *haystack, size_t n1, const void *needle, size_t n2);
#define CHUNK_SIZE 65536
/*
 * Usage: prog <file> <tag>
 *
 * Scans <file> in CHUNK_SIZE pieces for the first occurrence of the byte
 * sequence <tag> and prints its offset. The last tag_len - 1 bytes of each
 * chunk are carried over to the front of the next one so that a tag
 * straddling a chunk boundary is still found.
 *
 * Returns 0 whether or not the tag was found; exits with status 1 on a
 * usage, open or allocation error.
 */
int main(int argc, char **argv) {
    if (argc < 3) {
        fprintf(stderr, "missing parameters\n");
        exit(1);
    }
    char *tag = argv[2];
    size_t tag_len = strlen(tag);
    if (tag_len == 0) {
        // tag_len - 1 below would wrap around for an empty tag
        fprintf(stderr, "empty search string\n");
        exit(1);
    }
    // open file (binary mode so that offsets are byte offsets)
    char *fname = argv[1];
    FILE *infile = fopen(fname, "rb");
    if (infile == NULL) {
        fprintf(stderr, "cannot open file %s: %s\n", fname, strerror(errno));
        exit(1);
    }
    size_t overlap_len = 0;   // bytes carried over from the previous chunk
    long long pos = 0;        // file offset of chunk[0]
    char *chunk = malloc(CHUNK_SIZE + tag_len - 1);
    if (chunk == NULL) {
        fprintf(stderr, "cannot allocate memory\n");
        fclose(infile);
        exit(1);
    }
    // loop
    for (;;) {
        // read chunk behind the carried-over bytes
        size_t chunk_len = overlap_len + fread(chunk + overlap_len, 1,
                                               CHUNK_SIZE, infile);
        if (chunk_len < tag_len) {
            // end of file or very short file
            break;
        }
        // look for tag
        char *tag_location = memmem(chunk, chunk_len, tag, tag_len);
        if (tag_location != NULL) {
            // found: file offset is the chunk's offset plus the offset inside the chunk
            printf("string found at %lld\n", pos + (tag_location - chunk));
            break;
        } else {
            // not found: keep the last tag_len - 1 bytes for the next round
            overlap_len = tag_len - 1;
            memmove(chunk, chunk + chunk_len - overlap_len, overlap_len);
            pos += chunk_len - overlap_len;
        }
    }
    fclose(infile);
    free(chunk);
    return 0;
}
Note that the file is read in chunks of CHUNK_SIZE bytes, which is optimal if CHUNK_SIZE is a multiple of the file system block size.
For some really simple code, you can use mmap() and memcmp().
Error checking and proper header files are left as an exercise for the reader (there is at least one bug - another exercise for the reader to find):
/*
 * Usage: prog <file> <tag>
 *
 * Maps the whole file read-only and compares <tag> against every position
 * at which it could still fit. Returns 0 after the scan (also when the tag
 * is longer than the file), 1 if an argument is missing or the file cannot
 * be opened, examined or mapped.
 */
int main( int argc, char **argv )
{
    if ( argc < 3 )
    {
        return( 1 );
    }
    // put usable names on command-line args
    char *fname = argv[ 1 ];
    char *tag = argv[ 2 ];
    // mmap the entire file
    int fd = open( fname, O_RDONLY );
    if ( fd < 0 )
    {
        return( 1 );
    }
    struct stat sb;
    if ( fstat( fd, &sb ) != 0 || sb.st_size < 0 )
    {
        close( fd );
        return( 1 );
    }
    size_t file_len = ( size_t ) sb.st_size;
    size_t tag_len = strlen( tag );
    if ( tag_len > file_len )
    {
        // the tag cannot fit; also keeps the subtraction below from wrapping
        close( fd );
        return( 0 );
    }
    char *contents = mmap( NULL, file_len, PROT_READ, MAP_PRIVATE, fd, 0 );
    // the mapping stays valid after the descriptor is closed
    close( fd );
    if ( contents == MAP_FAILED )
    {
        return( 1 );
    }
    // number of start positions at which a full tag still fits
    size_t bytes_to_check = 1UL + file_len - tag_len;
    for ( size_t ii = 0; ii < bytes_to_check; ii++ )
    {
        if ( !memcmp( contents + ii, tag, tag_len ) )
        {
            // match found
            // (probably want to check if contents[ ii + tag_len ]
            // is a `\0' char to get actual string matches)
        }
    }
    munmap( contents, file_len );
    return( 0 );
}
That likely won't be anywhere near the fastest way (in general, mmap() is not going to be anywhere near a performance winner, especially in this use case of simply streaming through a file from beginning to end), but it's simple.
(Note that mmap() also has problems if the file size changes while it's being read. If the file grows, you won't see the additional data. If the file is shortened, you'll get SIGBUS when you try to read the removed data.)
A binary data file is going to contain '\0' bytes acting as string ends. The more that are in there, the shorter the area strstr is going to search will be. Note strstr will consider its work done once it hits a 0 byte.
You can scan the memory in intervals like
// Hop from one NUL-terminated run to the next: each pass skips the current
// run together with its terminating NUL byte.
// NOTE(review): the condition compares the length of the current run with
// CHUNKSIZE, not with the bytes still left in the chunk - confirm the caller
// bounds the scan (e.g. by remembering where the chunk started).
while (strlen (chunk) < CHUNKSIZE)
chunk += strlen (chunk) + 1;
i.e. restart after a null byte in the chunk as long as you are still within the chunk.

Remove first few characters from a string

I need to remove the first 3 characters from an array without any libraries. How would I go about doing this? I know that I can use memmove but I'm working on a system without the standard library, also memmove is for pointers. With memmove I can do this:
/*
 * Remove the first n characters of the NUL-terminated string str in place.
 * Asserts that str is non-null and n is non-zero. When n exceeds the
 * string length the string is left untouched.
 */
void chopN(char *str, size_t n)
{
    assert(str != 0 && n != 0);

    const size_t length = strlen(str);
    if (n <= length) {
        const char *tail = str + n;
        /* +1 so the terminating NUL moves down together with the tail */
        const size_t tail_bytes = (length - n) + 1;
        memmove(str, tail, tail_bytes);
    }
}
But could I remove characters from an array without memmove or any other standard library functions?
As long as you know the string is at least 3 characters long, you can simply use str + 3.
Hmmm: 2 simple while loops should do it.
Some untested code to give you an idea.
/*
 * Remove the first n characters of the NUL-terminated string str in place,
 * without using any library function. If the string has fewer than n
 * characters it becomes empty.
 */
void chopN(char *str, size_t n) {
    char *dest = str;
    const char *src = str;

    // find beginning watching out for rump `str`
    while (*src && n > 0) {
        src++;
        n--;
    }
    // Copy byte by byte; dest never overtakes src, so the overlap is safe
    while (*src) {
        *dest++ = *src++;
    }
    *dest = '\0';
}
Could add a if (n==0) short-cut if desired.
Here is a function that does not use standard C string functions. n can be less than or equal to strlen( s ). Otherwise the function does nothing.
#include <stdio.h>
/*
 * Drop the first n characters of s in place and return s.
 * Nothing is changed when n is zero or when s has fewer than n characters.
 */
char * chopN( char *s, size_t n )
{
    size_t skipped = 0;

    /* advance over at most n characters, stopping early at the terminator */
    while ( skipped < n && s[skipped] != '\0' ) ++skipped;

    /* shift only if all n characters were really there (and n was not 0) */
    if ( skipped == n && skipped != 0 )
    {
        char *out = s;
        const char *in = s + skipped;
        do
        {
            *out = *in++;
        } while ( *out++ != '\0' );   /* the terminator is copied as well */
    }
    return s;
}
/* Demo: print a string, then print it again with its first 7 characters removed. */
int main(void)
{
    char text[] = "Hello, World";

    puts( text );

    char *chopped = chopN( text, 7 );
    puts( chopped );

    return 0;
}
The program output is
Hello, World
World
If you want all characters to be removed when n is greater than strlen( s ), it is enough to replace the if statement
if ( n == 0 && src != s )
for this one
if ( src != s )
Simply start at the new start (str+n) and copy to the old start char by char until you reach the end of the string:
char *str1;
for(str1 = str+n; *str1; *str++=*str1++)
;
*str = 0;
If you want something more powerful, you can e.g., steal a memmove implementation from http://git.musl-libc.org/cgit/musl/tree/src/string/memmove.c (it basically does the same thing, except with some performance tweaks and a decision on which way (left/right) the move goes).
You don't need to pass the "amount" of characters as a parameter, you can search for '\0':
/*
 * Remove the first n characters of the NUL-terminated string str in place.
 * The end of the string is found by looking for '\0', so no length has to
 * be passed. If the string has fewer than n characters it becomes empty.
 * A null str is ignored.
 */
void chopN(char *str, size_t n){
    size_t i = 0, j = 0;   /* i: read index, j: write index (j <= i always) */

    if(!str)
        return;
    while(str[i]!='\0'){
        if(i>=n){
            /* past the chopped prefix: move the character down */
            str[j++]=str[i];
        }
        i++;
    }
    str[j]='\0';
}

C read big file into char* array too slow

I'd like to read a big file while the first character of a line isn't " ".
But the code I have written is very slow. How can I speed up the routine?
Is there a better solution instead of getline?
// Reads the file fn line by line and appends each line to one buffer sized
// from the file's st_size, stopping at the first line that begins with a space.
// NOTE(review): the assembled buffer (vString) is neither returned nor freed,
// and fp is never closed in this function.
void readString(const char *fn)
{
FILE *fp;
char *vString;
struct stat fdstat;
int stat_res;
stat_res = stat(fn, &fdstat);
// NOTE(review): "r+b" opens for update (read and write), although the file is only read here.
fp = fopen(fn, "r+b");
if (fp && !stat_res)
{
// zero-filled, so vString starts out as an empty string for strcat()
vString = (char *)calloc(fdstat.st_size + 1, sizeof(char));
int dataEnd = 1;
size_t len = 0;
int emptyLine = 1;
// heap-allocated holder for the line pointer handed to getline()
char **linePtr = malloc(sizeof(char*));
*linePtr = NULL;
while(dataEnd)
{
// Check every line
// NOTE(review): the return value of getline() is ignored, so end of file
// or a read error is never noticed by this loop.
getline(linePtr, &len, fp);
// When data ends, the line begins with space (" ")
// (*linePtr[0] parses as *(linePtr[0]), i.e. the first character of the line)
if(*linePtr[0] == 0x20)
emptyLine = 0;
// If line begins with space, stop writing
if(emptyLine)
// strcat() has to walk vString from its start on every call to find the end
strcat(vString, *linePtr);
else
dataEnd = 0;
}
// appending "\0" is the same as appending an empty string
strcat(vString, "\0");
// NOTE(review): this frees the pointer holder only, not *linePtr (the line buffer).
free(linePtr);
linePtr = NULL;
}
}
/* Entry point: reads the file named by the first command-line argument. */
int main(int argc, char **argv){
    if (argc < 2) {
        /* no file name given; argv[1] would be a null pointer */
        return EXIT_FAILURE;
    }
    readString(argv[1]);
    return EXIT_SUCCESS;
}
How can I speed up the routine?
The most suspicious aspect of your program performance-wise is the strcat(). On each call, it needs to scan the whole destination string from the beginning to find the place to append the source string. As a result, if your file's lines have length bounded by a constant (even a large one), then your approach's performance scales with the square of the file length.
The asymptotic complexity analysis doesn't necessarily tell the whole story, though. The I/O part of your code scales linearly with file length, and since I/O is much more expensive than in-memory data manipulation, that will dominate your performance for small enough files. If you're in that regime then you're probably not going to do much better than you already do. In that event, though, you might still do a bit better by reading the whole file at once via fread(), and then scanning it for end-of-data via strstr():
size_t nread = fread(vString, 1, fdstat.st_size, fp);
// Handle nread != fdstat.st_size ...
// terminate the buffer as a string
vString[nread] = '\0';
// truncate the string after the end-of-data:
char *eod = strstr(vString, "\n ");
if (eod) {
// terminator found - truncate the string after the newline
eod[1] = '\0';
} // else no terminator found
That scales linearly, so it addresses your asymptotic complexity problem, too, but if the data of interest will often be much shorter than the file, then it will leave you in those cases doing a lot more costly I/O than you need to do. In that event, one alternative would be to read in chunks, as #laissez_faire suggested. Another would be to tweak your original algorithm to track the end of vString so as to use strcpy() instead of strcat() to append each new line. The key part of that version would look something like this:
char *linePtr = NULL;
ssize_t nread = 0; // getline() returns ssize_t: -1 on end of file or error
size_t len = 0;
*vString = '\0'; // In case the first line is end-of-data
// `end` always points at the terminator of what has been collected so far,
// so each line is appended without rescanning vString.
for (char *end = vString; ; end += nread) {
    // Check every line
    nread = getline(&linePtr, &len, fp);
    if (nread < 0) {
        // handle eof or error ...
        break;
    }
    // When data ends, the line begins with space (" ")
    if (*linePtr == ' ') {
        break;
    }
    strcpy(end, linePtr);
}
free(linePtr);
Additionally, note that
you do not need to initially zero-fill the memory allocated for *vString, as you're just going to overwrite those zeroes with the data of real interest (and then ignore the rest of the buffer).
You should not cast the return value of malloc()-family functions, including calloc().
Have you tried to read the file using fread and read a bigger chunk of data in each step and then parse the data after reading it? Something like:
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
/*
 * Read the leading part of the file fn into a newly allocated,
 * NUL-terminated string: everything up to, but not including, the first
 * line that begins with a space character. If no such line exists the whole
 * file is returned.
 *
 * The file is read in blocks of at most 4096 bytes, and reading stops as
 * soon as the end-of-data line is seen.
 *
 * Returns NULL if fn is NULL, the file cannot be examined or opened, or
 * memory cannot be allocated. The caller owns the result and must free() it.
 */
char *readString(const char *fn)
{
    enum { READ_CHUNK = 4096 };
    struct stat fdstat;

    if (fn == NULL || stat(fn, &fdstat) != 0 || fdstat.st_size < 0)
        return NULL;

    /* read-only: the file is never written to */
    FILE *fp = fopen(fn, "rb");
    if (fp == NULL)
        return NULL;

    size_t capacity = (size_t) fdstat.st_size;
    char *vString = calloc(capacity + 1, 1);
    if (vString == NULL) {
        fclose(fp);
        return NULL;
    }

    size_t index = 0;          /* bytes of vString accepted so far */
    int at_line_start = 1;     /* is the next byte the first byte of a line? */
    int done = 0;

    while (!done && index < capacity) {
        size_t want = capacity - index;
        if (want > READ_CHUNK)
            want = READ_CHUNK;

        /* read straight into the result; no intermediate buffer needed */
        size_t got = fread(vString + index, 1, want, fp);
        if (got == 0)
            break;             /* error, or the file shrank after stat() */

        for (size_t i = 0; i < got; ++i) {
            char ch = vString[index + i];
            if (at_line_start && ch == ' ') {
                /* end-of-data line: keep only what precedes it */
                got = i;
                done = 1;
                break;
            }
            /* tracked per byte, so the state carries across blocks */
            at_line_start = (ch == '\n');
        }
        index += got;
    }

    vString[index] = '\0';
    fclose(fp);
    return vString;
}
/*
 * Prints the data part of the file named by the first command-line
 * argument. Returns EXIT_FAILURE if no file name was given or the file
 * could not be read.
 */
int main(int argc, char **argv)
{
    if (argc < 2) {
        fprintf(stderr, "usage: %s file\n", argc > 0 ? argv[0] : "readstring");
        return EXIT_FAILURE;
    }
    char *str = readString(argv[1]);
    if (str == NULL) {
        /* passing a null pointer to %s is undefined behaviour */
        fprintf(stderr, "cannot read %s\n", argv[1]);
        return EXIT_FAILURE;
    }
    printf("%s", str);
    free(str);
    return EXIT_SUCCESS;
}

C: What's wrong with my reverse string function?

Been solving leetcode puzzles and thought I solved this one pretty quickly, yet I am running into a strange error. My output is matching the expected output, so I have no idea why it's rejecting my solution based off the following test case.
/* Builds and returns a newly malloc'ed copy of s with its characters in
reverse order; returns NULL when s is NULL. */
char* reverseString(char* s)
{
/* Sample input: "Hello"
Sample output: "olleh"
*/
char * reversed_string;
/* NOTE(review): room for exactly one character, so there is no space for a
terminating NUL after the strncpy() below. */
char temp[1];
int length = 0;
int i;
if(s == NULL)
return NULL;
length = strlen(s);
/* Walk s forward until it points at the terminating NUL */
/* NOTE(review): *s is a char but is compared against NULL, a pointer constant. */
while(*s != NULL)
{
s = s + 1;
}
/* Allocate reversed string based off length of original string */
/* NOTE(review): the memory from malloc() is uninitialized, yet strcat() below
needs reversed_string to already hold a NUL-terminated string. */
reversed_string = malloc(length + 1);
/* Traverse backwards for length of string */
/* Copy each letter to temp */
/* Concatenate each letter to reversed_string */
for(i = 0; i < length; i++)
{
s = s - 1;
/* copies one character and no terminator into temp */
strncpy(temp, s, 1);
/* strcat() reads temp as a string, i.e. until it meets a NUL byte */
strcat(reversed_string, temp);
}
reversed_string[length] = '\0';
/* Return reversed string */
return reversed_string;
}
MOutput = My Output
EOutput = Expected Output
Input: "?CZU.9Iw8G3K?fse,b7 m;0?f :`c9d!D'`Pem0'Du0;9i` 03F,: 7,oPw'T'5`1g!iwR5J71iJ\"f;r6L;qZaDGx?cvkS 8\"UY2u`YC P3CM y`4v 1q7P;Zd1.;:RA!oYh;!2W8xMfMx8W2!;hYo!AR:;.1dZ;P7q1 v4`y MC3P CY`u2YU\"8 Skvc?xGDaZq;L6r;f\"Ji17J5Rwi!g1`5'T'wPo,7 :,F30 `i9;0uD'0meP`'D!d9c`: f?0;Z 7b,esf?K3G8wI9.UmC?"
MOutput: "?CmU.9Iw8G3K?fse,b7 Z;0?f :`c9d!D'`Pem0'Du0;9i` 03F,: 7,oPw'T'5`1g!iwR5J71iJ"f;r6L;qZaDGx?cvkS 8"UY2u`YC P3CM y`4v 1q7P;Zd1.;:RA!oYh;!2W8xMfMx8W2!;hYo!AR:;.1dZ;P7q1 v4`y MC3P CY`u2YU"8 Skvc?xGDaZq;L6r;f"Ji17J5Rwi!g1`5'T'wPo,7 :,F30 `i9;0uD'0meP`'D!d9c`: f?0;m 7b,esf?K3G8wI9.UZC?"
EOutput: "?CmU.9Iw8G3K?fse,b7 Z;0?f :`c9d!D'`Pem0'Du0;9i` 03F,: 7,oPw'T'5`1g!iwR5J71iJ"f;r6L;qZaDGx?cvkS 8"UY2u`YC P3CM y`4v 1q7P;Zd1.;:RA!oYh;!2W8xMfMx8W2!;hYo!AR:;.1dZ;P7q1 v4`y MC3P CY`u2YU"8 Skvc?xGDaZq;L6r;f"Ji17J5Rwi!g1`5'T'wPo,7 :,F30 `i9;0uD'0meP`'D!d9c`: f?0;m 7b,esf?K3G8wI9.UZC?"
Anyone spot what might be wrong with my function? Is there undefined behavior anywhere?
You need to allocate 2 chars for temp and initialize with 0's, so change it to
char temp[2] = { 0 };
Also, initialize reversed_string after allocating memory to it, so that first strcat works properly.
reversed_string = malloc(length + 1);
reversed_string[0] = '\0';
this part of the posted code:
for(i = 0; i < length; i++)
{
s = s - 1;
strncpy(temp, s, 1);
strcat(reversed_string, temp);
}
is not doing the right thing.
It should be copying one byte at a time from the end of the original string to the beginning of the reversed string.
Suggest the following code which:
cleanly compiles
performs the desired functionality
is a complete program
because it is going to be used in one of the online coding contests does not perform any error checking.
and now, the code
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
// use meaningful variable and parameter names
/*
 * Return a newly allocated string holding the characters of `original` in
 * reverse order. `original` is not modified and must be a valid
 * NUL-terminated string. An empty input yields an empty string.
 *
 * Returns NULL if memory cannot be allocated. The caller owns the result
 * and must free() it.
 */
char* reverseString(char* original)
{
    size_t length = strlen( original );

    /* Allocate reversed string based off length of original string */
    char *reversed = malloc(length + 1);
    if( NULL == reversed )
    {
        return NULL;
    }

    /* index-based copy: no pointer ever steps before the start of original */
    for( size_t i = 0; i < length; i++ )
    {
        reversed[i] = original[length - 1 - i];
    }
    reversed[length] = '\0';

    /* Return reversed string */
    return reversed;
} // end function: reverseString
/*
 * Reads one line from standard input and prints it reversed, followed by
 * an empty line. Returns 1 if no input could be read or memory ran out.
 */
int main( void )
{
    char inBuffer[ 4096 ];

    if( NULL == fgets( inBuffer, sizeof(inBuffer), stdin ) )
    {
        // end of file or read error: inBuffer holds nothing usable
        return 1;
    }

    // eliminate trailing newline if it exists
    inBuffer[ strcspn( inBuffer, "\n" ) ] = '\0';

    char * newString = reverseString( inBuffer );
    if( NULL == newString )
    {
        return 1;
    }
    printf( "%s\n\n", newString );

    // reverseString() hands over ownership of the buffer
    free( newString );
    return 0;
} // end function: main

read user input without maxsize in C

In C I can use the char *fgets(char *s, int size, FILE *stream) function to read user input from stdin. But the size of the user input is limited to size.
How can I read user input of variable size?
In C you are responsible for your buffers, and responsible for their size. So you can not have some dynamic buffer ready for you.
So the only solution is to use a loop (either of fgets or fgetc - depends on your processing and on your stop condition)
If you go beyond C to C++, you will find that you can accept std::string objects of variable sizes (there you need to deal with word and/or line termination instead - and loop again)
This function reads from standard input until end-of-file is encountered, and returns the number of characters read. It should be fairly easy to modify it to read exactly one line, or the like.
/*
 * Read standard input until end-of-file into one growing heap buffer.
 *
 * On success *s receives the NUL-terminated buffer (to be released with
 * free()) and the number of characters stored is returned.
 * Returns -1 if s is NULL; returns -1 and sets *s to NULL if memory cannot
 * be allocated.
 */
ssize_t read_from_stdin(char **s)
{
    char chunk[1024];
    size_t capacity = 1024;
    ssize_t used = 0;
    char *data;

    if (s == NULL)
        return -1;

    data = malloc(capacity);
    if (data == NULL)
        goto fail;

    for (;;) {
        if (fgets(chunk, sizeof(chunk), stdin) == NULL)
            break;

        size_t n = strlen(chunk);
        if (used + n >= capacity) {
            /* a chunk is shorter than the smallest capacity, so doubling once is enough */
            capacity *= 2;
            char *grown = realloc(data, capacity);
            if (grown == NULL) {
                free(data);
                goto fail;
            }
            data = grown;
        }
        memcpy(data + used, chunk, n);
        used += n;
    }

    data[used] = 0;
    *s = data;
    return used;

fail:
    *s = NULL;
    return -1;
}

Resources