How to find the character index of an XPath match with libxml2? - c

Given an XML file (stored in, say, sample.xml), and an XPath expression (say, //storyinfo), I want to get the character index in the XML file of the start of each node that results from evaluating the XPath expression.
A line number and a column number would also be fine.
What I've tried:
Given a sample XML like this one, stored in sample.xml:
<?xml version="1.0" encoding="utf-8"?>
<story>
<storyinfo>
<author>John Fleck</author>
<datewritten>June 2, 2002</datewritten>
<keyword>example keyword</keyword>
</storyinfo>
<body>
<headline>This is the headline</headline>
<para>This is the body text.</para>
</body>
</story>
I can get the "storyinfo" node using an XPath expression with libxml2, like so:
#include <stdio.h>
#include <stdlib.h>
#include <libxml/parser.h>
#include <libxml/tree.h>
#include <libxml/xpath.h>
int
main(void)
{
int i, size;
char *filename;
xmlChar *xpath_expr;
xmlDocPtr doc;
xmlNodePtr node;
xmlNodeSetPtr nodes;
xmlXPathContextPtr xpath_ctx;
xmlXPathObjectPtr xpath_obj;
/* Read file. */
filename = "sample.xml";
doc = xmlParseFile(filename);
if (doc == NULL) {
fprintf(stderr, "Error: unable to parse file \"%s\"\n",
filename);
exit(1);
}
/* Evaluate XPath expression. */
xpath_ctx = xmlXPathNewContext(doc);
if (xpath_ctx == NULL) {
fprintf(stderr, "Error: unable to create new XPath context\n");
xmlFreeDoc(doc);
exit(1);
}
xpath_expr = (xmlChar *)"//storyinfo";
xpath_obj = xmlXPathEvalExpression(xpath_expr, xpath_ctx);
if (xpath_obj == NULL) {
fprintf(stderr,
"Error: unable to evaluate XPath expression \"%s\"\n",
xpath_expr);
xmlXPathFreeContext(xpath_ctx);
xmlFreeDoc(doc);
exit(1);
}
/* Print XPath matches. */
nodes = xpath_obj->nodesetval;
size = (nodes) ? nodes->nodeNr : 0;
for (i = 0; i < size; ++i) {
node = nodes->nodeTab[i];
printf("Match %d - Name: %s\n", i, node->name);
printf("Match %d - Content: %s\n", i, node->content);
printf("Match %d - Line: %d\n", i, node->line);
printf("Match %d - Extra: %d\n", i, node->extra);
}
/* Cleanup. */
xmlXPathFreeObject(xpath_obj);
xmlXPathFreeContext(xpath_ctx);
xmlFreeDoc(doc);
return 0;
}
Here I saw that in libxml2, the node has a field line, with a value of 3, which is the line number in which the node is located in the XML file. However, that is not helpful if the XML is all in a single line (which isn't uncommon).
What I want instead is the character index: in the sample.xml above, that would be 50. Either that, or a line and column number: in the example above, that would be line 3, column 3.

Related

Matching text from 2 files

I have written a program that is designed to recover linux system passwords by searching for matching hashes which are present in two text files
#include<stdio.h>
#include<string.h>
#include<stdlib.h>
#define MAXCHAR 1000
//Declaring Functions to match word in file
int matchfile(char *shadowfilename, char*hashtablefilename);
//shadowfilename for shadow.txt hashtablefilename for hash table
void UsageInfo(char *shadowfile, char * hashtablefile );
//Display usage info on arguments for program
void UsageInfo(char *shadowfile, char * hashtablefile) {
printf("Usage: %s %s <shadowfile> <hashtable>\n", shadowfile,hashtablefile);
}
//main function.
int main(int argc, char *argv[]) {
int result, errcode;
//Display format for user to enter arguments and
//End program if user does not enter exactly 3 arguments
if(argc < 3 || argc > 3) {
UsageInfo(argv[1],argv[2]);
exit(1);
}
system("cls");
//Pass command line arguments into searchstringinfile
result = matchfile(argv[1], argv[2]);
//Display error message
if(result == -1) {
perror("Error");
printf("Error number = %d\n", errcode);
exit(1);
}
return(0);
}
//Declaring Functions to match word in file
//int matchfile(char *shadowfilename, char *hashtablefilename);
//shadowfilename for shadow.txt hashtablefilename for hash table
int matchfile(char *shadowfilename, char *hashtablefilename){
FILE *shadowfile;
FILE *hashtable;
char strshadow[MAXCHAR];
char strhash[MAXCHAR];
shadowfile = fopen(shadowfilename, "r");
if (shadowfile == NULL){
printf("Could not open file %s",shadowfilename);
return 1;
}
hashtable = fopen(hashtablefilename, "r");
if (hashtable == NULL){
printf("Could not open file %s",hashtablefilename);
return 1;
}
//Getting text from the 2 files
while (fgets(strshadow, MAXCHAR, shadowfile) != NULL &&fgets(strhash,MAXCHAR,
hashtable) != NULL){
printf("%s", strshadow);
printf("%s", strhash);
int linenumber = 1;
int search_result = 0;
//Matching words line-by-line
if((strstr(strshadow,strhash)) != NULL) {
//Display line in which matched word is found
printf("A match found on line: %d\n", linenumber);
printf("\n%s\n", strhash);
search_result++;
}
linenumber++;
}
fclose(shadowfile);
return 0;
}
However, I am unable to match the two hash values present in the two files due to the characters in front of them.
hashtable.txt.
This file contains the missing password in plain-text and is corresponding hash values.
The format is as follows: (password):(hash)
banana:$1$$Tnq7a6/C1wwyKyt0V/.BP/:17482:0:99999:7:::
shadow.txt. This file contains the account username in plain-text and is corresponding hash values.
The format is as follows: (user):(hash)
pyc1:$1$$Tnq7a6/C1wwyKyt0V/.BP/:17482:0:99999:7:::
As seen above, the words 'banana' and 'pyc1' prevent the program from detecting the two hashes from being detected.
Could someone tell me the changes I need to make to overcome this ?
Thank you.
Edit:Clarified format of shadow.txt and hashtable.txt
The simplest way to skip characters in string until some condition is met is:
char someString[MAXCHAR];
for (char* ptr = someString; *ptr != '\0'; ptr++) {
if (conditionIsMet(ptr)) {
doSomething();
break;
}
}
In your case, conditionIsMet(ptr) should be comparing *ptr to ':' and in that case, the password hash is under (ptr + 1) (string starting from the next character). I think you can write the rest of the code yourself.

My program not work as expected

I need to find the position of some strings. These strings are stored in a file named queryfile , from an other file named datafile.
However, my program does not work as expected.
Can some one help me?
My program
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int main()
{
FILE *queryfile;
queryfile = fopen("op2query.txt","r");
FILE *datafile;
datafile = fopen("op2data.txt","r" );
int i = 1;
char word[99];
char search[99];
if(queryfile==NULL) {
printf("Error in reading Query File");
exit(1);
}
if(datafile==NULL) {
printf("Error in reading Data File");
exit(1);
}
while(fscanf(queryfile,"%98s",search)==1){
while(fscanf(datafile,"%98s",word)==1){
if (strcmp(word,search)==0){
printf("\n %i %s ", i, search);
rewind(datafile);
i=1;
break;
}
else
i++;
}
}
fclose(datafile);
fclose(queryfile);
return 0;
}
I build an array of each set of words to be tested, by splitting the query string into words. These words can span a line break in the data file. I mark the data file position on the second word of the set, if the search fails I seek to that point (if necessary). The program succeeds even if I duplicate every word "age" in the data file.
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#define MAXWORDS 5
#define MAXLEN 99
int main()
{
int j, i, done, words, count;
long mark;
char word[MAXLEN];
char search[MAXLEN];
char *tok, *sptr[MAXWORDS];
FILE *queryfile;
FILE *datafile;
if ((queryfile = fopen("op2query.txt","r")) == NULL) {
printf("Error in reading Query File");
exit(1);
}
if ((datafile = fopen("op2data.txt","r" )) == NULL) {
printf("Error in reading Data File");
exit(1);
}
while(fgets(search, MAXLEN, queryfile) != NULL){
words = 0;
done = 0;
count = 0;
mark = -1;
tok = strtok(search, " \r\n");
while (tok && words < MAXWORDS) { // build array of query
sptr[words++] = tok;
tok = strtok(NULL, " \r\n"); // strips newline too
}
if (words < 1) // no more queries
break;
rewind(datafile); // beginning of file
while (!done) { // until none to read
count++;
if (mark >= 0) // when more than one word to search
fseek (datafile, mark, SEEK_SET);
mark = -1;
for (j=0; j<words; j++) {
if (j == 1) // mark for next search
mark = ftell(datafile);
if (fscanf(datafile, "%98s", word) != 1){
done = 1; // end of file
break;
}
if (strcmp(sptr[j], word)!=0)
break; // failed multi word search
}
if (done)
printf("NOT FOUND!");
else if (j == words) { // if all words found
printf("%d", count);
done = 1; // success
}
}
for (i=0; i<words; i++)
printf(" %s", sptr[i]); // show array of words asked
printf("\n");
}
fclose(datafile);
fclose(queryfile);
return 0;
}
Program output:
18 wisdom
40 season
NOT FOUND! summer
22 age of foolishness
UPDATE - I print NOT FOUND! when query not found. Added "summer" to query file.
you should put some debug-output behind the fscanf-calls (like printf("search:<%s> word:<%s>", search, word);
Then you will see, that fscanf stops at finding a white-space. you compare wisdom to each consecutive word in op2data.txt.
You should read line by line with fgets() removing the CR/LF from search.
But be aware, that the multi-word-search-word in data-file may be split between lines. like:
find me
i am the text to find
me in this file
so a better solution would be:
read search word by line (remove CR/LF) (normalize it by removing double spaces and not-letters)
read a chunk from datafile and normalize it too.
compare or continue by moving the read-position in data left by length of length of search word

Program works with string literals but not with string arrays

I have a hashtable ADT which has two functions, insert and lookup. I put in to the insert function a hash table, hash table size, ID #, and book title and that inserts it into the hash table. This works fine when I pass it a string literal, i.e. insert(...,"Hello, world!"...); It doesn't work when I read in strings from a file, store them in an array, and try and use my insert and lookup functions.
I have all of my code here but the most important files are main.c and hash.c. Hash.c has the newHash(), hash(), insert(), and lookup() functions and main.c reads from two files, in this case test1.lib.in and test1.req.in, and from the first file will get the library id and title of a book from each line and then put it in the hash table. From the second file, it gets requests for a book title and should print the ids in its linked list.
List of links to files https://docs.google.com/document/d/1tFNs-eVkfnCfjwAHcAUdHtUl1KVv_WcnW2IS0SRFvcM/edit?usp=sharing
Example of code that works.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include "list.h"
#include "hash.h"
int main(){
ListHndl* temp = newHash(10);
insert(442440, "cvyaqbznxel", 10,temp);
lookup(temp,"cvyaqbznxel", 10);
return 0;
}
Code that doesn't work
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
#include "list.h"
#include "hash.h"
int main(int argc, char * argv[]) {
if (argc != 3) {
printf("Incorrect arguments, please specify 2 files to be read\n");
return EXIT_FAILURE;
}
FILE *file = fopen( argv[1], "r");
FILE *secondFile = fopen(argv[2], "r");
if (file == 0 || secondFile == 0) {
printf("Could not open a file\n");
return EXIT_FAILURE;
}
int numDataLines2;
int numDataLines;
int hashTableSize;
//First line of first file gives number of lines in file and
//size of hash table to be made
if(fscanf(file, "%d%d", &numDataLines, &hashTableSize) < 2) {
printf("Unable to parse first line of first file\n");
return EXIT_FAILURE;
}
ListHndl* theHash = newHash(hashTableSize);
int libraryID;
char *tempString = calloc(numDataLines,41*sizeof(char));
char lineHolder[129];
//discard the new line which always shows up
fgets(lineHolder, 128, file);
for(int i = 0; i < numDataLines; i++) {
//Gets the whole line to be scanned with sscanf
fgets(lineHolder, 128, file);
//If the line consists of just a newline char, continue
if(strcmp(lineHolder, "\n") == 0 ) {
continue;
}
//Scans the line retrieved from fgets and placed in lineHolder
if(sscanf(lineHolder, "%d, %40[^\n]", &libraryID,&tempString[i]) == 0){
printf("Unable to parse line %d of first file\n",i+2);
return EXIT_FAILURE;
}
insert(libraryID, &tempString[i], hashTableSize, theHash);
}
char String[41];
fgets(String, 40, secondFile);
numDataLines2 = atoi(String);
char *storeSecondFileStuff = calloc(numDataLines2,41*sizeof(char));
for(int i = 0; i< numDataLines2; i++) {
fgets(lineHolder, 128, secondFile);
if(strcmp(lineHolder, "\n") == 0) {
continue;
}
if(sscanf(lineHolder, "%40[^\n]",&storeSecondFileStuff[i]) == 0) {
printf("Unable to parse line %d of second file\n",i+2);
return EXIT_FAILURE;
}
lookup(theHash, &storeSecondFileStuff[i], hashTableSize);
}
printf("\n");
fclose(file);
fclose(secondFile);
return 0;
}
Thanks!
I think you have multiple problems. To start with, you might not be scanning your input line correctly. Change your line
if(sscanf(lineHolder, "%d, %40[^\n]", &libraryID,&tempString[i]) == 0)
to
if(sscanf(lineHolder, "%d, %40[^\n]", &libraryID, tempString) < 0)
that way, you will trap the situation where the sscanf function did not successfully convert both arguments - for example, if there is no comma in the input line. Note that sscanf returns the number of successful conversions; success would return a value of 2, so testing for <2 is the right way to go.
Note also that I changed &tempString[i] to tempString. The former points to some place along tempString - which only has 41 characters allocated to it. Yet you always allow up to 40 characters (plus '\0' to be written to it - so you will write past the end of the string. Since this is only a temporary variable, there is no sense in doing this. Just scan the input into the temp variable, then do whatever you need to do with it.
This means that your insert also changes, from
insert(libraryID, &tempString[i], hashTableSize, theHash);
to
insert(libraryID, tempString, hashTableSize, theHash);
Again, you need to do the same thing lower down in your code.
Here is an attempt at making the code work for you - see if this hits the spot. Note that all I really did was change the type of tempString and storeSecondFileStuff, and modified the way they were used in various function calls accordingly. I did not attempt to compile / run because of the complexity of the other files involved - but this should help a bit:
int main(int argc, char * argv[]) {
if (argc != 3) {
printf("Incorrect arguments, please specify 2 files to be read\n");
return EXIT_FAILURE;
}
FILE *file = fopen( argv[1], "r");
FILE *secondFile = fopen(argv[2], "r");
if (file == 0 || secondFile == 0) {
printf("Could not open a file\n");
return EXIT_FAILURE;
}
int numDataLines2;
int numDataLines;
int hashTableSize;
//First line of first file gives number of lines in file and
//size of hash table to be made
if(fscanf(file, "%d%d", &numDataLines, &hashTableSize) < 2) {
printf("Unable to parse first line of first file\n");
return EXIT_FAILURE;
}
ListHndl* theHash = newHash(hashTableSize);
int libraryID;
char **tempString = calloc(numDataLines,sizeof(char*)); // <<< ARRAY of pointers
char lineHolder[129];
//discard the new line which always shows up
fgets(lineHolder, 128, file);
for(int i = 0; i < numDataLines; i++) {
//Gets the whole line to be scanned with sscanf
fgets(lineHolder, 128, file);
tempString[i] = calloc(1, 41 * sizeof(char)); // <<< space for this string
//If the line consists of just a newline char, continue
if(strcmp(lineHolder, "\n") == 0 ) {
continue;
}
//Scans the line retrieved from fgets and placed in lineHolder
if(sscanf(lineHolder, "%d, %40[^\n]", &libraryID, tempString[i]) < 0){ // <<< changed
printf("Unable to parse line %d of first file\n",i+2);
return EXIT_FAILURE;
}
insert(libraryID, tempString[i], hashTableSize, theHash); // <<< changed
}
char String[41];
fgets(String, 40, secondFile);
numDataLines2 = atoi(String);
char **storeSecondFileStuff = calloc(numDataLines2, sizeof(char*)); // changed: again char **
for(int i = 0; i< numDataLines2; i++) {
fgets(lineHolder, 128, secondFile);
storeSecondFileStuff[i] = calloc(1, 41 * sizeof(char));
if(strcmp(lineHolder, "\n") == 0) {
continue;
}
if(sscanf(lineHolder, "%40[^\n]",storeSecondFileStuff[i]) == 0) {
printf("Unable to parse line %d of second file\n",i+2);
return EXIT_FAILURE;
}
lookup(theHash, storeSecondFileStuff[i], hashTableSize); // <<<< changed
}
printf("\n");
fclose(file);
fclose(secondFile);
return 0;
}

Read contents from xml file and store in an array

I'm working with xml for the first time and I have some problems in storing the contents of the xml file in an array. I'm using libxml2 for parsing the xml file and I'm able to get the data and able to print it. The code is given below:
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <libxml/xmlmemory.h>
#include <libxml/parser.h>
#include <wchar.h>
wchar_t buffer[7][50]={"\0"};
static void parseDoc(const char *docname)
{
xmlDocPtr doc;
xmlNodePtr cur;
xmlChar *key;
int i=0;
doc = xmlParseFile(docname);
if (doc == NULL ) {
fprintf(stderr,"Document not parsed successfully. \n");
return;
}
cur = xmlDocGetRootElement(doc);
if (cur == NULL)
{
fprintf(stderr,"empty document\n");
xmlFreeDoc(doc);
return;
}
cur = cur->xmlChildrenNode;
while (cur != NULL)
{
key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
wmemcpy(buffer[i],(wchar_t*)(key),size(key)); /*segmentation fault at this stage*/
printf("Content : %s\n", key);
xmlFree(key);
i++;
cur = cur->next;
}
xmlFreeDoc(doc);
return;
}
int main(void)
{
const char *docname="/home/workspace/TestProject/Text.xml;
parseDoc (docname);
return (1);
}
The sample xml file is provided below
<?xml version="1.0"?>
<story>
<author>John Fleck</author>
<datewritten>June 2, 2002</datewritten>
<keyword>example keyword</keyword>
<headline>This is the headline</headline>
<para>This is the body text.</para>
</story>
The output of the file contents when printed on the screen were as below
Content : null
Content : John Fleck
Content : null
Content : June 2, 2002
Content : null
Content : example keyword
Content : null
Content : This is the headline
Content : null
Content : This is the body text.
I feel that the content of the file being null in few places is causing the problem in copy and hence generating the segmentation fault. Please let me know how to fix the problem and is there an better way to get the thing done. I had done a similar xml file read using MSXML parser and this is my first time with Linux API's.
EDIT The copying part is performed as below but the contents of the wchart array are garbled. Further help would be appreciated.
while (cur != NULL) {
key = xmlNodeListGetString(doc, cur->xmlChildrenNode, 1);
if(key!=NULL)
{
wmemcpy(DiscRead[i],(const wchar_t *)key,sizeof(key));
i++;
}
printf("keyword: %s\n", key);
xmlFree(key);
cur = cur->next;
}
Your code has multiple problems:
You use wchar_t for your string array. This isn't appropriate for the UTF-8 encoded strings you'll get from libxml2. You should stick with xmlChar or use char.
You use xmlNodeListGetString to get the text content of nodes passing cur->xmlChildrenNode as node list. The latter will be NULL for text nodes, so xmlNodeListGetString will return NULL as an error condition. You should simply call xmlNodeGetContent on the current node but only if it is an element node.
Using xmlChildrenNode as field name is deprecated. You should use children.
The call to wmemcpy is dangerous. I'd suggest something safer like strlcpy.
Try something like this:
char buffer[7][50];
static void parseDoc(const char *docname)
{
xmlDocPtr doc;
xmlNodePtr cur;
xmlChar *key;
int i = 0;
doc = xmlParseFile(docname);
if (doc == NULL) {
fprintf(stderr, "Document not parsed successfully. \n");
return;
}
cur = xmlDocGetRootElement(doc);
if (cur == NULL) {
fprintf(stderr, "empty document\n");
xmlFreeDoc(doc);
return;
}
for (cur = cur->children; cur != NULL; cur = cur->next) {
if (cur->type != XML_ELEMENT_NODE)
continue;
key = xmlNodeGetContent(cur);
strlcpy(buffer[i], key, 50);
printf("Content : %s\n", key);
xmlFree(key);
i++;
}
xmlFreeDoc(doc);
}
You should also check that i doesn't overrun the number of strings in your array.
buffer array is not large enough. Increase buffer size to buffer[7+3][50]
wchar_t buffer[7][50]={"\0"};
...
while (cur != NULL) {
wmemcpy(buffer[i],(wchar_t*)(key),size(key)); /*segmentation fault */
printf("Content : %s\n", key);
...
i++;
}
The output is 10 lines of "Content : ...". Thus i incremented form 0 to 9. But buffer may only be indexed 0 to 6. Indexing 7 and later is undefined behavior and this eventually manifested itself as a segment fault.

to count the tag with same name in an XML using libxml2

i want to count the tag with same name in the complete xml using libxml2. please suggest if there is any inbuilt function in libxml2 for this which directly returns count.
It's easy using XPath. See XPath examples. Once you obtain the result of xmlXPathEvalExpression, you simply write:
xpathObj->nodesetval->nodeNr
Here is XPath syntax given. For example //elem, to count all elem elements.
And the full function based on the xpath1.c example:
int count(xmlDocPtr doc, const char* path)
{
int count;
xmlXPathContextPtr xpathCtx;
xmlXPathObjectPtr xpathObj;
/* Create xpath evaluation context */
xpathCtx = xmlXPathNewContext(doc);
if(xpathCtx == NULL) {
fprintf(stderr,"Error: unable to create new XPath context\n");
return(-1);
}
/* Evaluate xpath expression */
xpathObj = xmlXPathEvalExpression(path, xpathCtx);
if(xpathObj == NULL) {
fprintf(stderr,"Error: unable to evaluate xpath expression \"%s\"\n", path);
xmlXPathFreeContext(xpathCtx);
return(-1);
}
count = xpathObj->nodesetval->nodeNr;
printf("count of %s: %d\n", path, count);
xmlXPathFreeObject(xpathObj);
xmlXPathFreeContext(xpathCtx);
return count;
}

Resources