Segmentation fault while converting a large number of XML files into text - C

#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <string.h>

int main()
{
    FILE *fin, *fout;
    char dest[80] = "/home/vivs/InexCorpusText/";
    char file[30];
    DIR *dir;
    char c, state = '1';
    int len;
    struct dirent *ent;

    if ((dir = opendir("/home/vivs/InexCorpus")) != NULL)
    {
        while ((ent = readdir(dir)) != NULL)
        {
            if (strcmp(ent->d_name, ".") &&
                strcmp(ent->d_name, "..") &&
                strcmp(ent->d_name, ".directory"))
            {
                len = strlen(ent->d_name);
                strcpy(file, ent->d_name);
                file[len-3] = file[len-1] = 't';
                file[len-2] = 'x';
                //strcat(source,ent->d_name);
                strcat(dest, file);
                printf("%s\t%s\n", ent->d_name, dest);
                fin = fopen(ent->d_name, "r");
                fout = fopen(dest, "w");
                while ((c = fgetc(fin)) != EOF)
                {
                    if (c == '<')
                    {
                        fputc(' ', fout);
                        state = '0';
                    }
                    else if (c == '>')
                        state = '1';
                    else if (state == '1')
                    {
                        if (c != '\n')
                            fputc(c, fout);
                        if (c == '.')
                        {
                            c = fgetc(fin);
                            if (c == ' ' || c == '\n' || c == '<')
                            {
                                fputc('\n', fout);
                                ungetc(c, fin);
                            }
                            else fputc(c, fout);
                        }
                    }
                }
            }
            close(fin);
            close(fout);
            strcpy(dest, "/home/vivs/InexCorpusText/");
        }
        closedir(dir);
    }
    else
    {
        printf("Error in opening directory\n");
    }
    return 0;
}
I was trying to convert XML files to text. This code simply removes tags and nothing else.
When I execute this code for around 300 files it doesn't show any error, but when the number goes to 500 or more I receive a segmentation fault after processing around 300 files.

At least one reason 'right from the start':
Here is the struct dirent declaration from the man page:
On Linux, the dirent structure is defined as follows:
struct dirent {
    ino_t          d_ino;       /* inode number */
    off_t          d_off;       /* offset to the next dirent */
    unsigned short d_reclen;    /* length of this record */
    unsigned char  d_type;      /* type of file; not supported
                                   by all file system types */
    char           d_name[256]; /* filename */
};
You are in trouble with any name longer than 30 (actually 29) characters: a memory overwrite occurs because file is only 30 bytes (one of them reserved for the '\0' terminator):
char file[30];
...
strcpy(file,ent->d_name);
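A minimal guard (just a sketch, keeping the fixed-size buffer) would check the length before copying, for example:

len = strlen(ent->d_name);
if (len >= sizeof(file))            /* name would overflow file[30], skip it */
    continue;
memcpy(file, ent->d_name, len + 1); /* copy including the '\0' */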

There are two XML constructs that your code does not appear to account for.
Attribute values can contain unescaped > characters, which could throw off your tag tracking. See http://www.w3.org/TR/REC-xml/#NT-AttValue.
CDATA sections can contain both < and > characters as literal text, as long as they do not appear as part of the closing ]]> sequence. See http://www.w3.org/TR/REC-xml/#NT-CharData. This could seriously throw off your logic.
Why not look in your files to see whether any of them contain the text CDATA?
You might want to consider using xsltproc or libxslt; a very simple XSLT transform would give you exactly what you want. See Extract part of an XML file as plain text using XSLT for such a transform.

OK, another problematic place:
len=strlen(ent->d_name);
....
file[len-3]=file[len-1]='t';
file[len-2]='x';
Because d_name could have fewer than 3 characters, this could again lead to a memory overwrite.
You should be careful with functions like strlen() and always validate their results.
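A defensive version of the extension rewrite (a sketch, assuming the input names really end in ".xml") might look like:

len = strlen(ent->d_name);
if (len < 4 || strcmp(ent->d_name + len - 4, ".xml") != 0)
    continue;                  /* too short or not an .xml file, skip it */
strcpy(file, ent->d_name);     /* still needs the size check shown earlier */
file[len-3] = 't';             /* rewrite "xml" -> "txt" */
file[len-2] = 'x';
file[len-1] = 't';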

Related

C - sscanf ignoring comma from CSV file and reading the same piece of data twice

I am trying to read from a CSV file into a struct. For some reason, the value for the social security number also picks up the address, and then the address is read a second time into newBin.address. It looks like sscanf is ignoring the comma that separates the social from the address when it reads the file, but then does register it when it moves on to reading the address. Any help is appreciated.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>

#define STRSIZE 70

typedef struct BIN_DATA {
    unsigned int user_number;
    char name[32];
    char social_security[10];
    char address[32];
} BIN_DATA;

int main()
{
    // Define variables.
    FILE *in, *out;
    char str[STRSIZE];
    // New BIN.
    BIN_DATA newBin;
    // Open files.
    in = fopen("updated.txt", "r");
    // Check files.
    if (in == NULL)
    {
        puts("Could not open file");
        exit(0);
    }
    while (fgets(str, STRSIZE, in) != NULL)
    {
        memset(&newBin, '\0', sizeof(BIN_DATA));
        sscanf(str, "%6u, %[^,], %[^,], %[^\n\r]", &newBin.user_number, newBin.name,
               newBin.social_security, newBin.address);
        printf("%u. %s. %s. %s.\n", newBin.user_number, newBin.name,
               newBin.social_security, newBin.address);
    }
    return 0;
}
File being read:
289383,Estefana Lewey,591-82-1520,"9940 Ohio Drv, 85021"
930886,Burl Livermore,661-18-3839,"226 Amherst, 08330"
692777,Lannie Crisler,590-36-6612,"8143 Woods Drv, 20901"
636915,Zena Hoke,510-92-2741,"82 Roehampton St, 47905"
747793,Vicente Clevenger,233-46-1002,"9954 San Carlos St., 55016"
238335,Lidia Janes,512-92-7402,"348 Depot Ave, 29576"
885386,Claire Paladino,376-74-3432,"587 Front Ave, 32703"
760492,Leland Stillson,576-55-8588,"9793 Boston Lane, 08610"
516649,Wes Althouse,002-58-0518,"8597 Annadale Drive, 06514"
641421,Nadia Gard,048-14-6428,"218 George Street, 29150"
As mentioned in the comments, the social_security member does not allocate enough space to hold the data you're reading. It needs to be at least 12 to hold the SSN as written with a terminator at the end.
As for the format string you use with sscanf(), it's nearly correct. However, you'll want to bound the maximum string length to match what you have storage for, so for example with name of 32 you should limit it to 31 characters saving one at the end for the terminator.
I changed the social_security field to char social_security[12]; and then changed the format string to sscanf to be the following:
"%6u, %31[^,], %11[^,], %31[^\n\r]"
I was able to run the modified code with the sample input file to get the output you described. You can try it too at the link:
Runnable code

In C, how can I find all the file names in a directory and store them in an array of strings?

I wrote some code in C to find all the file names in a directory. The code is recursive, so if it finds other directories inside, it searches those as well. Walking the directory and finding the files works well: I printed on screen what the program read and confirmed that it finds every file name and does not repeat any. The problem is that I save each file name it finds into an array of strings, so that when the search is done main should hold an array with all the file names; but when I print that array it does not contain all the files, only a few repeated file names. The ultimate goal of this program is not to print the file names on screen, but to save them all in the array. I cannot see the error; if someone can tell me what I'm doing wrong, I'll be grateful.
void findfiles(char *root, char *p[], int *tam){
    DIR *dir;
    struct dirent *entrada;
    struct stat stt;
    dir = opendir(root);
    char *aux;
    char nombre[BUFFER_TAM];
    char buf[30];
    if (dir == NULL) {
        printf("hola4\n");
        return;
    }
    while ((entrada = readdir(dir)) != NULL) {
        if (strcmp(entrada->d_name, ".") == 0 || strcmp(entrada->d_name, "..") == 0);
        else {
            if (entrada->d_type == DT_DIR){
                strcpy(nombre, root);
                strcat(nombre, "/");
                strcat(nombre, entrada->d_name);
                findfiles(nombre, p, tam);
            }
            else {
                strcpy(nombre, root);
                strcat(nombre, "/");
                strcat(nombre, entrada->d_name);
                p[*tam] = malloc(strlen(nombre)+1);
                p[*tam] = nombre;
                *tam = *tam + 1;
            }
        }
    }
}

void main(){
    char *archivos[BUFFER_TAM];
    char root[BUFFER_TAM] = "/home/jesusmolina/Documentos";
    int i = 0, tam = 0;
    findfiles(root, archivos, &tam);
    for (i; i < tam; i++)
        printf("%s\n", archivos[i]);
}
p[*tam]=malloc(strlen(nombre)+1);
p[*tam]=nombre;
You allocate a chunk of memory, then immediately lose the pointer to that memory and leak it. You probably wanted:
p[*tam]=malloc(strlen(nombre)+1);
strcpy(p[*tam], nombre);
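On POSIX systems, strdup() does the allocation and the copy in one call (a small sketch; check the result and remember to free() each entry later):

p[*tam] = strdup(nombre);   /* allocates strlen(nombre)+1 bytes and copies */
if (p[*tam] == NULL)
    return;                 /* out of memory */
*tam = *tam + 1;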

Unexpected output - storing into 2D array in C

I am reading data from a number of files, each containing a list of words. I am trying to display the number of words in each file, but I am running into issues. For example, when I run my code, I receive the output shown below.
Almost every count is displayed correctly, with the exception of two files, each of which should have a word count in the thousands. Every other file has only three digits' worth of words, and those seem just fine.
I can only guess what this problem could be (not enough space allocated somewhere?) and I do not know how to solve it. I apologize if this is all poorly worded. My brain is fried and I am struggling. Any help would be appreciated.
I've tried to keep my example code as brief as possible. I've cut out a lot of error checking and other tasks related to the full program. I've also added comments where I can. Thanks.
StopWords.c
#include <stdio.h>
#include <stdlib.h>
#include <dirent.h>
#include <stddef.h>
#include <string.h>

typedef struct
{
    char stopwords[2000][60];
    int wordcount;
} LangData;

typedef struct
{
    int languageCount;
    LangData languages[];
} AllData;

main(int argc, char **argv)
{
    //Initialize data structures and open path directory
    int langCount = 0;
    DIR *d;
    struct dirent *ep;
    d = opendir(argv[1]);

    //Count the number of language files in the directory
    while(readdir(d))
        langCount++;

    //Account for "." and ".." in directory
    //langCount = langCount - 2 THIS MAKES SENSE RIGHT?
    langCount = langCount + 1; //The program crashes if I don't do this, which doesn't make sense to me.

    //Allocate space in AllData for languageCount
    AllData *data = malloc(sizeof(AllData) + sizeof(LangData)*langCount); //Unsure? Seems to work.

    //Reset the directory in preparation for reading data
    rewinddir(d);

    //Copy all words into respective arrays.
    char word[60];
    int i = 0;
    int k = 0;
    int j = 0;
    while((ep = readdir(d)) != NULL) //Probably could've used for loops to make this cleaner. Oh well.
    {
        if (!strcmp(ep->d_name, ".") || !strcmp(ep->d_name, ".."))
        {
            //Filtering "." and ".."
        }
        else
        {
            FILE *entry;
            //Get string for path (i should make this a function)
            char fullpath[100];
            strcpy(fullpath, path);
            strcat(fullpath, "\\");
            strcat(fullpath, ep->d_name);
            entry = fopen(fullpath, "r");
            //Read all words from file
            while(fgets(word, 60, entry) != NULL)
            {
                j = 0;
                //Store each word one character at a time (better way?)
                while(word[j] != '\0') //Check for end of word
                {
                    data->languages[i].stopwords[k][j] = word[j];
                    j++; //Move onto next character
                }
                k++; //Move onto next word
                data->languages[i].wordcount++;
            }
            //Display number of words in file
            printf("%d\n", data->languages[i].wordcount);
            i++; //Increment index in preparation for next language file.
            fclose(entry);
        }
    }
}
Output
256 //czech.txt: Correct
101 //danish.txt: Correct
101 //dutch.txt: Correct
547 //english.txt: Correct
1835363006 //finnish.txt: Should be 1337. Of course it's 1337.
436 //french.txt: Correct
576 //german.txt: Correct
737 //hungarian.txt: Correct
683853 //icelandic.txt: Should be 1000.
399 //italian.txt: Correct
172 //norwegian.txt: Correct
269 //polish.txt: Correct
437 //portugese.txt: Correct
282 //romanian.txt: Correct
472 //spanish.txt: Correct
386 //swedish.txt: Correct
209 //turkish.txt: Correct
Do the files have more than 2000 words? You have only allocated space for 2000 words, so once your program tries to copy word 2001 it will be writing outside the memory allocated for that array, possibly into the space allocated for wordcount.
Also note that fgets returns a string up to the end of the line or at most n characters (60 in your case), whichever comes first. This will work fine if there is only one word per line in the files you are reading from; otherwise you will have to locate the spaces within the string and count the words from there.
If you are simply trying to get a word count, then there is no need to store all the words in an array in the first place. Assuming one word per line, the following should work just as well:
char word[60];
while(fgets(word, 60, entry) != NULL)
{
    data->languages[i].wordcount++;
}
fgets reference- http://www.cplusplus.com/reference/cstdio/
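If a line can hold more than one word, a sketch along these lines (using strtok purely for illustration) would count every word on the line instead:

char line[256];
while (fgets(line, sizeof line, entry) != NULL)
{
    char *tok = strtok(line, " \t\r\n");    /* first word on the line */
    while (tok != NULL)
    {
        data->languages[i].wordcount++;     /* one count per word */
        tok = strtok(NULL, " \t\r\n");      /* next word, if any */
    }
}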
Update
I took another look and you might want to try allocating data as follows:
typedef struct
{
    char stopwords[2000][60];
    int wordcount;
} LangData;

typedef struct
{
    int languageCount;
    LangData *languages;
} AllData;

AllData *data = malloc(sizeof(AllData));
data->languages = malloc(sizeof(LangData)*langCount);
This way memory is being specifically allocated for the languages array.
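With that layout, calloc() would also start every wordcount at zero, and both allocations eventually need to be released (a small usage sketch):

AllData *data = malloc(sizeof(AllData));
data->languages = calloc(langCount, sizeof(LangData));   /* zero-initialized LangData array */
/* ... use data ... */
free(data->languages);
free(data);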
I agree that langCount = langCount - 2 makes sense. What error are you getting?

Searching a particular string in a large file

I am writing a program in C that searches for a specific string in a large .txt file, counts the occurrences, and prints the count. But something has gone wrong, because the output of my program differs from what two text editors report. According to the text editors there are 3000 occurrences in total of the word I am searching for, "make", in that .txt file, but my program outputs only 2970.
I cannot find the problem in my program, so I am curious how a text editor can search for a specific string so accurately. How do people implement that? Can anyone show me some code in C?
To make things clear: it is a large .txt file, 20 MB or so, containing lots of characters, so I think it is not a good idea to read it into memory all at once. I implemented my program by splitting the file into pieces and scanning each of them, but it fails somewhere.
The code is kind of long, 70 lines or so. I have put it on GitHub; if you have any interest, please help: https://github.com/walkerlala/searchText
Note that the only relevant files are wordCount.c and testfile.txt. The code goes like this:
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

char arr[51];
int flag = 0;
int flag2 = 0;
int flag3 = 0;
int flag4 = 0;
int pieceCount(FILE*);

int main()
{
    //the file in which I want to search the word is testfile.txt
    //I have formatted the file so that it contains no newlines any more
    FILE* fs = fopen("testfile.txt", "r");
    int n = pieceCount(fs);
    printf("%d\n", n);
    rewind(fs);   //refresh the file...
    static bool endOfPiece1 = false, endOfPiece2 = false, endOfPiece3 = false;
    bool begOfPiece1, begOfPiece2, begOfPiece3;
    for(int start = 0; start < n; ++start){
        fgets(arr, sizeof(arr), fs);
        for(int i = 0; i <= 46; ++i){
            if((arr[i]=='M'||arr[i]=='m') && (arr[i+1]=='A'||arr[i+1]=='a') &&
               (arr[i+2]=='K'||arr[i+2]=='k') && (arr[i+3]=='E'||arr[i+3]=='e')){
                flag += 1;
                //continue;
            }
        }
        //check the border
        begOfPiece1 = ((arr[1]=='e'||arr[1]=='E'));
        if(begOfPiece1==true && endOfPiece1==true)
            flag2 += 1;
        endOfPiece1 = ((arr[47]=='m'||arr[47]=='M') && (arr[48]=='a'||arr[48]=='A') && (arr[49]=='k'||arr[49]=='K'));

        begOfPiece2 = ((arr[1]=='k'||arr[1]=='K') && (arr[2]=='e'||arr[2]=='E'));
        if(begOfPiece2==true && endOfPiece2==true)
            flag3 += 1;
        endOfPiece2 = ((arr[48]=='m'||arr[48]=='M') && (arr[49]=='a'||arr[49]=='A'));

        begOfPiece3 = ((arr[1]=='a'||arr[1]=='A') && (arr[2]=='k'||arr[2]=='K') && (arr[3]=='e'||arr[3]=='E'));
        if(begOfPiece3==true && endOfPiece3==true)
            flag4 += 1;
        endOfPiece3 = (arr[49]=='m'||arr[49]=='M');
    }
    printf("%d\n%d\n%d\n%d\n", flag, flag2, flag3, flag4);
    getchar();
    return 0;
}

//the function counts how many pieces I have split the file into
int pieceCount(FILE* file){
    static int count = 0;
    char arr2[51] = {'\0'};
    while(fgets(arr2, sizeof(arr), file)){
        count += 1;
        continue;
    }
    return count;
}
You can do this quite simply just by having a rolling buffer. You don't need to break the file into sections.
#include <stdio.h>
#include <string.h>
#include <strings.h>   // for strncasecmp (POSIX)

int main(void) {
    char buff[4];                                       // word buffer
    int count = 0;                                      // occurrences
    FILE* fs = fopen("test.txt", "r");                  // open the file
    if (fs != NULL) {                                   // if the file opened
        if (4 == fread(buff, 1, 4, fs)) {               // fill the buffer
            do {                                        // if it worked
                if (strncasecmp(buff, "make", 4) == 0)  // case-insensitive check for target word
                    count++;                            // tally
                memmove(buff, buff+1, 3);               // shift the buffer down
            } while (1 == fread(buff+3, 1, 1, fs));     // fill the last position
        }                                               // end of file
        fclose(fs);                                     // close the file
    }
    printf("%d\n", count);                              // report the result
    return 0;
}
For simplicity I stopped short of making the search word 'softer' (configurable) and deriving the buffer and the various other sizes from it, since that wasn't in the question. And I have to leave something for the OP to do.

Regarding FOPEN in C

I am having a problem with fopen in C.
I have this code, which reads a particular file from a directory:
FILE *ifp;
char *directoryname[50];
char *result[100];
char *rpath = "/home/kamal/samples/pipe26/divpipe0.f00001";
char *mode = "r";
ifp = fopen("director.in", mode);    /* director.in contains the path of the directory */
while (fscanf(ifp, "%s", directoryname) != EOF)
{
    strcpy(result, directoryname);   /* path of the directory: /home/kamal/samples/pipe26 */
    strcat(result, "/");             /* forward slash for the path */
    strcat(result, name);            /* name of the file: divpipe0.f00001 */
}
Up to this point my code works perfectly, creating a string that looks like "/home/kamal/samples/pipe26/divpipe0.f00001".
The problem arises when I try to use result to open a file: it gives me an error. If I use rpath instead, it works fine, even though both strings contain the same information.
if (!(fp = fopen(rpath, "rb")))     /* This one works fine */
{
    printf("fopen failure2!\n");
    return;
}
if (!(fp = fopen(result, "rb")))    /* This does not work */
{
    printf("fopen failure2!\n");
    return;
}
Could someone please tell me why I am getting this error?
I think you mean char result[100]; i.e. without the asterisk. (Ditto for directoryname.)
You're currently stack-allocating an array of 100 pointers. This will not end well.
Note that rpath and mode point to read-only memory. Really you should use const char* for those two literals.
The error is the array char *result[100]: here you are allocating an array of 100 pointers to strings, not 100 bytes/characters, which was your intent.
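A sketch of the corrected declarations, together with a bounded way of building the path (snprintf here is just one option), might look like:

char directoryname[50];    /* plain char arrays, no asterisk */
char result[100];
...
snprintf(result, sizeof result, "%s/%s", directoryname, name);   /* 'name' is the file name from the question; never overflows result */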
