parent child indexing in apache solr - solr

I'm new to Apache Solr search. I can't figure out how to get Solr search results with child documents.
My entity in data-config.xml
<entity name="products" query="SELECT DISTINCT IDENTIFIER,PDT_NAME,PDT_DESCRIPTION FROM **PARENT_TABLE**"
deltaQuery="SELECT IDENTIFIER FROM PARENT_TABLE WHERE LAST_MODIFIED_DATE > '${dataimporter.last_index_time}'">
<field column="IDENTIFIER" name="pdtid" />
<field column="PDT_NAME" name="productname" />
<field column="PDT_DESCRIPTION" name="productdescription" />
<entity name="productVersions" child="true" query="SELECT DISTINCT child_id , child_name FROM WHERE IDENTIFIER = '${**products.IDENTIFIER**}'">
<field column="IDENTIFIER" name="productVersions.pdtesat" />
<field column="VERSION_NUMBER" name="productVersions.versionnum" />
<field column="DISPLAY_NAME" name="productVersions.displayname" />
</entity>
</entity>
field details in managed-schema file:
<field name="pdtid" type="text_general" indexed="true" stored="true" multiValued="false" />
<field name="productname" type="text_general" indexed="true" stored="true" multiValued="true" />
<field name="productnamerrr" type="text_general" indexed="true" stored="true" multiValued="false" />
<field name="productdescription" type="text_general" indexed="true" stored="true" multiValued="false" />
<field name="productVersions.childid" type="text_general" indexed="true" stored="true" multiValued="false" />
<field name="productVersions.versionnum" type="text_general" indexed="true" stored="true" multiValued="false" />
<field name="productVersions.displayname" type="text_general" indexed="true" stored="true" multiValued="false" />
I'm expecting my solr result should be :
"response":{"numFound":26,"start":0,"docs":[
{
"productdescription":" Java",
"productnamerrr":"pdtid",
"pdtid":"6591",
"child_docs" : [
"productVersions":[
"productVersions.childid":"123"
"productVersions.versionnum":"V1"
"productVersions.displayname":"disp"],
"productVersions":[
"productVersions.childid":"456"
"productVersions.versionnum":"V2"
"productVersions.displayname":"disp2"]
],
"id":"92689209-dc5f-4ae6-bd3c-d55dbd0e200c",
"_version_":1599132440456069120},
Please help me in getting the multiple child docs in json format after indexing.
May 2nd edit.
My query result from solr search like below.
"response":{"numFound":38,"start":0,"docs":[
{
"productdescription":" JIRA provides issue (bug) and project tracking
for the software development team.",
"productnamerrr":"Atlassian JIRA",
"productVersions":
["childid:6.x,versionnum:Jira 6.x,displayname :Withdrawn",
"childid:2.0.3,versionnum:Atlassian JIRA,displayname:Planning",
"childid:JIRA Server 5.0.1 - 6.3.15,versionnum:JIRA - JEditor,displayname :Withdrawn",
"childid:1.x,versionnum:Jira 1.x,displayname :Withdrawn"
],
"id":"0b5ba528-ef7a-49ba-a97b-2ea94922cbb5",
"_version_":1599297669816123392},
Edited on May 3-2018
The returned data is correct, but I'm expecting explicit parent-child documents. I'm getting the child docs as below.
"productVersions":["childid:6.x,versionnum:Jira 6.x,displayname :Withdrawn",
"childid:2.0.3,versionnum:Atlassian JIRA,displayname:Planning",
"childid:JIRA Server 5.0.1 - 6.3.15,versionnum:JIRA - JEditor,displayname :Withdrawn",
"childid:1.x,versionnum:Jira 1.x,displayname :Withdrawn"
],
Expecting like below.
"productVersions":[
"productVersions.childid":"123"
"productVersions.versionnum":"V1"
"productVersions.displayname":"disp"],
"productVersions":[
"productVersions.childid":"456"
"productVersions.versionnum":"V2"
"productVersions.displayname":"disp2"]
],
How can I change the query to get the child docs separately, each as a separate entity?

Related

Solr: Indexing child Documents via db-data-config.xml query

I am trying to index nested documents with respect to a parent document, but I do not get the expected structure in the data indexed in Solr. Please tell me what is wrong in the Solr configuration shown below.
table structure:
enter image description here
db-data-config.xml
<document>
<entity name="parent" pk="parent_id" query="SELECT parent_id, name, salary, country from parent" deltaQuery="select parent_id, name, salary, country from parent where updated_at &gt ${dataimporter.last_index_time}">
<field column="parent_id" name="id" />
<field column="parent_id" name="parent_id" />
<field column="name" name="name" />
<field column="salary" name="salary" />
<field column="country" name="country" />
<entity name="child" child="true" pk="child_id" query="select child.child_id, child.parent_id, child.child_name from child where child.parent_id='${parent.parent_id}' ">
<field column="parent_id" name="id" />
<field column="child_id" name="child_id" />
<field column="child_name" name="child_name" />
</entity>
</entity>
</document>
managed-schema:
<!-- parent table fields -->
<field name="parent_d" type="text_general" indexed="true" stored="true"/>
<field name="name" type="text_general" indexed="true" stored="true"/>
<field name="salary" type="text_general" indexed="true" stored="true"/>
<field name="country" type="text_general" indexed="true" stored="true"/>
<!-- child table fields -->
<field name="child_id" type="text_general" indexed="true" stored="true"/>
<field name="child_name" type="text_general" indexed="true" stored="true"/>
The indexed documents are not nested; the result appears to be a flat representation:
"response":{"numFound":4,"start":0,"docs":[
{
"country":"IND",
"parent_id":"1",
"name":"p1",
"salary":"11",
"_version_":1582614969479856128
},
{
"id":"1",
"child_id":"1",
"child_name":"c1",
"_version_":1582614969479856128
},
{
"country":"USA",
"parent_id":"2",
"name":"p2",
"salary":"222",
"_version_":1582614969546964992
},
{
"id":"2",
"child_id":"2",
"child_name":"c2",
"_version_":1582614969546964992
}
]
}
Expected:
"response":{"numFound":4,"start":0,"docs":[
{
"parent_id":"1",
"country":"IND",
"name":"p1",
"salary":"11",
"child":{
"parent_id":"1",
"child_id":"1",
"child_name":"c1",
},
"_version_":1582614969479856128
},
{
"parent_id":"2",
"country":"USA",
"name":"p2",
"salary":"222",
"child":{
"parent_id":"2",
"child_id":"2",
"child_name":"c2",
},
"_version_":1582614969546964992
}
]
}
Solr stores the child docs as independent docs too, so what you see is normal. But there is some plumbing so you can get them back with the parent (and query one layer and get the other etc).
Read carefully this post by Yonik, and see how you must query to get children too etc.

Indexing is returning only a few of the columns specified in the query in the data-import XML

Indexing is returning only a few of the columns specified in the query in the data-import XML.
<entity
name="All_Manuals"
query="SELECT Query........"
dataSource="JdbcDataSource">
<field column="Column1" name="id" />
<field column="Column2" name="deptId" />
<field column="Column3" name="groupId" />
<field column="Column4" name="subGrpId" />
<field column="Column5" name="manualId" />
</entity>
We are indexing all of the columns above, but when we fetch the documents, only the first two columns are returned.
You need to add all of the columns to your schema.xml, like this:
<field name="id" type="string" indexed="true" stored="true" />
<field name="deptId" type="string" indexed="true" stored="true" />
<field name="groupId" type="string" indexed="true" stored="true" />
..............................
And if you don't want a column to be indexed but still want that column in your results:
<field name="xxxxx" type="string" indexed="false" stored="true" />

How to index columns with same name but different data in solr

I have two tables, and both tables have a delete_status column, but these columns hold different data.
CODE:(data-config.xml)
<entity name="category_masters" query="SELECT
category_updated,delete_status,category_id,category_name FROM category_masters
where category_id='${type_masters.category_id}'">
category_id=${category_masters.category_id}">
<field column="category_id" name="id"/>
<field column="category_name" name="category_name" indexed="true" stored="true" />
**<field column="delete_status" name="delete_status" indexed="true" stored="true" />**
<field column="category_updated" name="category_updated" indexed="true"
stored="true" />
</entity>
<entity name="type_masters" pk="type_id" query="SELECT
type_updated,delete_status as type_masters_delte,type_id,category_id,type_name FROM type_masters
where type_id='${businessmasters.Business_Type}' ">
<field column="type_id" name="id"/>
<field column="category_id" name="category_id" indexed="true" stored="true" />
<field column="type_name" name="type_name" indexed="true" stored="true" />
**<field column="delete_status" name="delete_status" indexed="true" stored="true" />**
<field column="type_updated" name="type_updated" indexed="true" stored="true" />
How do I display the data from both columns? I tried aliasing the columns, but it does not work.
When I query, I see only one delete_status column; even if I make it multivalued, how do I tell which delete_status belongs to which table?
I want the data kept separate, and I can't make changes in the database.
In your case, I would use the DIH. With it, you can define a join in data-config.xml to merge both tables.
That file supports aliases for column names, such as table1.delete_status as type_masters_delete.

Multiple Indexes in same Solr Core..?

I am using Apache Solr..I have the following Scenario.. :
I have Two table in my PostGreSQL database. One is "Cars". Other is "Dealers"
Now i have a data-config file for Cars like the following :
<document name="offerings">
<entity name="jc_offerings" query="select * from jc_offerings" >
<field column="id" name="id" />
<field column="name" name="name" />
<field column="display_name" name="display_name" />
<field column="extra" name="extra" />
</entity>
</document>
I have a similar data-config.xml for "Dealers". It has the same fields as Cars: name, extra, etc.
Now in my schema.xml, I have defined the following fields:
<fields>
<field name="id" type="string" indexed="true" />
<field name="name" type="name" indexed="true" />
<field name="extra" type="extra" indexed="true" />
<field name="CarsText" type="text_general" indexed="true"
stored="true" multiValued="true"/>
</fields>
<uniqueKey>id</uniqueKey>
<defaultSearchField>CarsText</defaultSearchField>
<copyField source="name" dest="CarsText"/>
<copyField source="extra" dest="CarsText"/>
Now I want to search like "where name is Maruti". How will Solr know whether to search the Cars field "name" or the Dealers field "name"?
I have read the following link: http://wiki.apache.org/solr/MultipleIndexes
But I am not able to understand how it works.
After reading that link, I added another field to my Cars and Dealers data-config.xml files, something like:
<field name="type" value="car" /> : in the Cars data-config.xml
and
<field name="type" value="dealer" /> : in the Dealers data-config.xml
And then in Schema.xml i created a new field :
<field name="type" type="string" indexed="true" stored="true" />
And then i queried something like :
localhost:8983/solr/select?q=name:Maruti&fq=type:dealer
But it didn't work.
So what should I do?
if the fields are the same for both cars and dealers, you could use one index with an object defined like so:
<fields>
<field name="id" type="string" indexed="true" stored="true"/>
<field name="name" type="name" indexed="true" stored="true" />
<field name="extra" type="extra" indexed="true" stored="true" />
<field name="description_text" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="type" type="string" indexed="true" stored="true" />
</fields>
This will work for both cars and dealers (so you don't need two indexes), and you'll use the "type" field to select whether you want a "dealer" or a "car" (I'm using the same system to filter similar types of objects that have only a minor semantic difference).
Also, you'll need to add stored="true" to the fields you want to retrieve; otherwise you'll only be able to use them for searching (hence the indexed="true").
Adding a default value to the type field will ensure the type value being set to cars|dealer.
You will have to index the sources separately. Then use copy field and you can easily filter on either cars|dealer.
This does seem a bit tricky and is not explained well in the multi-indexes link referred to above.

The simplest Solr DIH indexing

I'm trying to index data from a database in Solr using the DIH.
So I have modified the two config files as follows:
solrconfig.xml :
<requestHandler name="/dataimport"
class="org.apache.solr.handler.dataimport.DataImportHandler">
<lst name="defaults">
<str name="config">data-config.xml</str>
</lst>
</requestHandler>
data-config.xml :
<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/test" user="root" password="****"/>
<document>
<entity name="source_scellee" query="select * from source_scellee">
</entity>
</document>
</dataConfig>
source_scellee being the name of my table on my test database. It contains many fields.
Obviously, I'm trying to run nothing else than a simple test. When running http://localhost:8983/solr/dataimport?command=full-import&clean=false&commit=true I get the following result :
<str name="Full Dump Started">2012-01-27 12:27:01</str><str name="">Indexing completed. Added/Updated: 4 documents. Deleted 0 documents.</str><str name="Committed">2012-01-27 12:27:02</str>
<str name="**Total Documents Failed**">4</str>
Besides, there are no warnings or errors in the server logs. 4 is the number of records in the table "source_scellee", but it says all documents failed.
If I run a query from http://localhost:8983/solr/admin/
no results appear at all! How can I solve this?
("*:*" shows no results)
Thank you for your help!!!
----edit---
I have added these lines to my schema.xml :
<field name="ID" type="int" indexed="true" stored="true" />
<field name="reference_catalogue" type="string" indexed="true" stored="true"/>
<field name="reference_capsule" type="string" indexed="true" stored="true"/>
<field name="organisme_certificateur" type="string" indexed="true" stored="true" />
<field name="reference_certificat" type="string" indexed="true" stored="true" />
<field name="duree_d_utilisation" type="string" indexed="true" stored="true" />
<field name="activite_nominale" type="string" indexed="true" stored="true"/>
<field name="activite_minimale" type="string" indexed="true" stored="true"/>
<field name="activite_maximale" type="string" indexed="true" stored="true"/>
<field name="coffret" type="boolean" indexed="true" stored="true"/>
<field name="dispositif_medical" type="boolean" indexed="true" stored="true"/>
<field name="forme_speciale" type="boolean" indexed="true" stored="true" />
<field name="exemption_cpa" type="boolean" indexed="true" stored="true"/>
<field name="marquage_ce" type="boolean" indexed="true" stored="true"/>
<field name="element_cible" type="boolean" indexed="true" stored="true"/>
However the result is still the same: no results when querying (I tried to restart solr, and to re-index all also)
------second edit---
I have tried the dynamic import
Now my data-config.xml looks like this :
<document>
<entity name="source_scellee" query="select * from source_scellee">
<field column="ID" name="ID_i" />
<field column="reference_catalogue" name="reference_catalogue_s" />
<field column="reference_capsule" name="reference_capsule_s" />
<field column="organisme_certificateur" name="organisme_certificateur_s" />
<field column="reference_certificat" name="reference_certificat_s" />
<field column="duree_d_utilisation" name="duree_d_utilisation_s" />
<field column="activite_nominale" name="activite_nominale_s" />
<field column="activite_minimale" name="activite_minimale_s" />
<field column="activite_maximale" name="activite_maximale_s" />
<field column="coffret" name="coffret_b" />
<field column="dispositif_medical" name="dispositif_medical_b" />
<field column="forme_speciale" name="forme_speciale_b" />
<field column="exemption_cpa" name="exemption_cpa_b" />
<field column="marquage_ce" name="marquage_ce_b" />
<field column="element_cible" name="element_cible_b" />
</entity>
</document>
1.) You can take a look at the statistics page to see how many docs are indexed right now:
http://localhost:8983/solr/admin/stats.jsp
2.) The result of your search depends on your schema.xml, because that is where it is defined how docs are indexed/stored, which fields are processed, and how searches are handled at query time.
Please take a look at this file, or post the field definitions from the schema.xml and also the schema/design of your table source_scellee.
Do the columns and the fields have the same names?
//Edit: This should work if the column names and field names are the same:
<document>
<entity name="source_scellee"
pk="ID"
query="select * from source_scellee">
</entity>
</document>
Is having NULL values in the data an issue?
That depends on the destination field.
Are you running Solr in Tomcat or something like that?
Take a look at the Java EE container output, such as catalina.out.
I am pretty sure the issue lies in how the DIH is trying to map fields. Thanks for adding the information from your schema file... However, I believe that what you have done is added configuration that needs to be added separately to both the schema.xml and the data-config.xml for the DIH.
Based on the Full Import Example from the Solr Wiki, I would try the following.
schema.xml
<field name="ID" type="int" indexed="true" stored="true" />
<field name="reference_catalogue" type="string" indexed="true" stored="true"/>
<field name="reference_capsule" type="string" indexed="true" stored="true"/>
<field name="date_de_creation" type="date" indexed="true" stored="true"/>
<field name="organisme_certificateur" type="string" indexed="true" stored="true" />
<field name="reference_certificat" type="string" indexed="true" stored="true" />
<field name="duree_d_utilisation" type="string" indexed="true" stored="true" />
<field name="activite_nominale" type="string" indexed="true" stored="true"/>
<field name="activite_minimale" type="string" indexed="true" stored="true"/>
<field name="activite_maximale" type="string" indexed="true" stored="true"/>
<field name="coffret" type="int" indexed="true" stored="true"/>
<field name="dispositif_medical" type="int" indexed="true" stored="true"/>
<field name="forme_speciale" type="int" indexed="true" stored="true" />
<field name="exemption_cpa" type="int" indexed="true" stored="true"/>
<field name="marquage_ce" type="int" indexed="true" stored="true"/>
<field name="element_cible" type="int" indexed="true" stored="true"/>
data-config.xml
<dataConfig>
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost:3306/test" user="root" password="****"/>
<document>
<entity name="source_scellee" query="select * from source_scellee">
<field column="ID" name="ID"/>
<field column="reference_catalogue" name="reference_catalogue"/>
<field column="reference_capsule" name="reference_capsule"/>
<field column="date_de_creation" name="date_de_creation"/>
<field column="organisme_certificateur" name="organisme_certificateur"/>
<field column="reference_certificat" name="reference_certificat"/>
<field column="duree_d_utilisation" name="duree_d_utilisation"/>
<field column="activite_nominale" name="activite_nominale"/>
<field column="activite_minimale" name="activite_minimale"/>
<field column="activite_maximale" name="activite_maximale"/>
<field column="coffret" name="coffret"/>
<field column="dispositif_medical" name="dispositif_medical"/>
<field column="forme_speciale" name="forme_speciale"/>
<field column="exemption_cpa" name="exemption_cpa"/>
<field column="marquage_ce" name="marquage_ce"/>
<field column="element_cible" name="element_cible"/>
</entity>
</document>
</dataConfig>
There is a way to setup the schema.xml to dynamically add fields that it encounters by using some naming conventions. Please see the Dynamic Fields details in the Solr Wiki for more details and some examples of how this can be done.

Resources