We need a full-text search for a db with millions of records (music meta-data) and I've only been working on Solr for 2 weeks roughly, I need some help regarding indexing. I am using DataImportHandler and have SQL query that generates result like this:
As you can see in the attached image above, the id (Integer data type) is repeated in the SQL result that is also used in DIH, and when I set uniqueKey to <uniqueKey>id</uniqueKey> Solr overwrites the values, leaving only one record/row; in fact, I think it is the last one processed, which is the one with countryCode 'TL'.
When I first had this issue, I knew why Solr was overwriting the value; it's normal. So I thought of adding a global identifier to each record in the db, a guid. Without thinking things through properly, I ended up with the same duplicates: as you can see, charGuid, which is a uuid() from MySQL, is duplicated.
But when I use the charGuid (String data type) as uniqueKey, i.e. <uniqueKey>charGuid</uniqueKey>, I get all records indexed and nothing is overwritten, but of course duplicates are inevitable. The problem I foresee here is that when I have to do an incremental update, Solr will not be able to know exactly which document to update. In fact, a quick test from the admin console revealed that the last or first record it finds with that unique key is updated. This is not acceptable.
I stumbled upon an article referencing multiValued="true", I thought making the fields that represents a JOIN column in my SQL will do the trick, but it doesn't. I was hoping a record with id:10 will be returned with a List of countryCode but no.
I am just puzzled as to how to circumvent this issue and why I did not find a similar problem posted by someone.
If I don't get a meaningful answer, I guess I will have to use charGuid as <uniqueKey> which allows duplicate and then use Solr Document Deduplication Detection to handle updates of my index but I want to believe, there is a better way.
Update
Here is my data-config.xml and schema.xml definition:
<!-- First attempt at the DIH mapping: one root entity per Album row plus six self-closed sub-entities nested directly under it. -->
<entity name="albums" query="select * from Album">
<!-- Tracks of the current album, looked up through ${albums.id}. -->
<entity name="track" query="select t.id as id, t.title as trackTitle, t.removed as trackRemovedDate, t.productState from Track t where t.albumId='${albums.id}'"/>
<!-- Sales-area ids of the current album; the only column this query returns is the alias albumSalesAreaId. -->
<entity name="albumSalesAreaId" query="select asa.salesAreaId as albumSalesAreaId from AlbumSalesArea asa where asa.albumId='${albums.id}'"/>
<!-- NOTE(review): ${albumSalesAreaId.salesAreaId} names a column that the albumSalesAreaId query aliases away, and this entity is a sibling of albumSalesAreaId rather than its child; presumably the variable resolves to nothing. Confirm. -->
<entity name="albumSalesArea" query="select sa.name as albumSalesArea from SalesArea sa where sa.id='${albumSalesAreaId.salesAreaId}'"/>
<!-- NOTE(review): no entity named salesArea is declared in this snippet, so ${salesArea.id} has nothing visible to resolve against. -->
<entity name="salesAreaCountry" query="select sac.countryId as 'salesAreaCountry' from SalesAreaCountry sac where sac.salesAreaId ='${salesArea.id}'"/>
<!-- NOTE(review): the salesAreaCountry query exposes its value under the alias 'salesAreaCountry', not countryId, so ${salesAreaCountry.countryId} presumably stays empty. Confirm. -->
<entity name="countryId" query="select c.id as 'countryId' from Country c where c.id = '${salesAreaCountry.countryId}'"/>
<!-- Same lookup key as the countryId entity, selecting the country name instead of the id. -->
<entity name="countryName" query="select c.name as 'countryName' from Country c where c.id = '${salesAreaCountry.countryId}'"/>
</entity>
**Schema.xml**
<!-- New multivalued fields: each one collects the values returned by the DIH sub-entity of the same name. -->
<!-- albumSalesArea is filled from SalesArea.name, so it is declared as text rather than int. -->
<field name="albumSalesArea" type="text_general" stored="true" indexed="true" multiValued="true"/>
<field name="albumSalesAreaId" type="int" indexed="true" stored="true" multiValued="true"/>
<field name="salesAreaCountry" type="int" stored="true" indexed="true" multiValued="true"/>
<field name="countryId" type="int" indexed="true" stored="true" multiValued="true"/>
<field name="countryName" type="text_general" indexed="true" stored="true" multiValued="true"/>
When I compare my solr response with SQL result, I see countryCode but solr has none, only returned
"albumSalesAreaId": [
1,
3
],
Not sure why country etc not showing up.
Update 2
data-config.xml
<!-- Second attempt: explicit column-to-field mappings, one Solr document per Album row. -->
<document name="content">
<entity name="albums" query="select * from Album">
<!-- Tracks of the current album; each selected column is renamed to a track-prefixed Solr field (id stays id). -->
<entity name="tracks" query="select t.id, t.title, t.removed, t.productState from Track t where t.albumId='${albums.id}'">
<field column="id" name="id" />
<field column="title" name="trackTitle" />
<field column="removed" name="trackRemovedDate" />
<field column="productState" name="trackProductState" />
</entity>
<!-- Sales-area ids linked to the current album. -->
<entity name="albumSalesAreaIds" query="select salesAreaId from AlbumSalesArea where albumId = '${albums.id}'">
<field column="salesAreaId" name="albumSalesAreaId"/>
</entity>
<!-- NOTE(review): this entity reads ${albumSalesAreaIds.salesAreaId} but is declared as a sibling of albumSalesAreaIds, not inside it. The author reports that the lookup behaves as if the variable were null and that nesting this entity inside albumSalesAreaIds fixed it. -->
<entity name="albumSalesAreaNames" query="select name from SalesArea where id = '${albumSalesAreaIds.salesAreaId}'">
<field column="name" name="albumSalesArea"/>
</entity>
<!-- NOTE(review): same pattern as albumSalesAreaNames; it depends on a variable of the sibling entity albumSalesAreaIds and presumably needs to be nested inside it as well. Confirm. -->
<entity name="salesAreaCountryIds" query="select countryId from SalesAreaCountry where salesAreaId ='${albumSalesAreaIds.salesAreaId}'">
<field column="countryId" name="countryId" />
</entity>
<!-- NOTE(review): depends on ${salesAreaCountryIds.countryId}, again a sibling entity; presumably it has to be nested inside salesAreaCountryIds. Confirm. -->
<entity name="salesAreaCountry" query="select name from Country where id ='${salesAreaCountryIds.countryId}'">
<field column="name" name="countryName" />
</entity>
<!-- Album-level columns from the root query, renamed to album-prefixed Solr fields. -->
<field column="title" name="albumTitle"/>
<field column="removed" name="albumRemovedDate"/>
<field column="productState" name="albumProductState" />
</entity>
</document>
schema.xml
<!-- Field declarations that accompany the second data-config.xml attempt. -->
<field name="catchall" type="text_general" stored="true" indexed="true" multiValued="true"/>
<field name="publisher" type="text_general" indexed="true" stored="true"/>
<!-- Stored only, not indexed. -->
<field name="uuid" type="binary" indexed="false" stored="true"/>
<!-- Track-level and album-level scalar fields (single-valued). -->
<field name="trackRemovedDate" type="tdate" indexed="true" stored="true"/>
<field name="albumRemovedDate" type="tdate" indexed="true" stored="true"/>
<field name="trackProductState" type="int" indexed="true" stored="true"/>
<field name="albumProductState" type="int" indexed="true" stored="true"/>
<field name="countryCode" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="albumTitle" type="text_general" indexed="true" stored="true"/>
<field name="trackTitle" type="text_general" indexed="true" stored="true" multiValued="true"/>
<!-- NOTE(review): guid is declared as text_general; if it is meant to be matched as an exact identifier, confirm that an analyzed text type is intended. -->
<field name="guid" type="text_general" indexed="true" stored="true"/>
<!-- New multivalued fields targeted by the sales-area and country sub-entities of the data config. -->
<field name="albumSalesAreaId" type="int" indexed="true" stored="true" multiValued="true"/>
<field name="salesAreaCountry" type="int" stored="true" indexed="true" multiValued="true"/>
<field name="countryId" type="int" indexed="true" stored="true" multiValued="true"/>
<field name="countryName" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="albumSalesArea" type="text_general" indexed="true" stored="true" multiValued="true"/>
sample solr response for id:5
{
"responseHeader": {
"status": 0,
"QTime": 1,
"params": {
"indent": "true",
"q": "id:5",
"_": "1383221233535",
"wt": "json"
}
},
"response": {
"numFound": 1,
"start": 0,
"docs": [
{
"id": "5",
"catchall": [
"5",
"Test Album 5",
"2011-10-21 00:00:00.0",
"[B#261ca3cb",
"Test Track 1",
"Ya man 2",
"2011-10-17 16:21:29.0",
"1",
"1450412569164513280"
],
"albumTitle": "Test Album 5",
"albumRemovedDate": "2011-10-21T00:00:00Z",
"uuid": "6oT/MMl+RDaPyKpGK1KN0w==",
"trackTitle": [
"Test Track 1",
"Ya man 2"
],
"trackRemovedDate": "2011-10-17T16:21:29Z",
"albumSalesAreaId": [
1
],
"_version_": 1450412569164513300
}
]
}
}
SQL result for id:5
trackTitle and albumSalesAreaId seem to be correct, but I am not sure why the others have not been included. However, if I hard-code the albumSalesAreaNames entity with from SalesArea where id = 1, then I get the albumSalesArea field added to the result, so it seems like from SalesArea where id = '${albumSalesAreaIds.salesAreaId}' is returning null, which was also confirmed by my 'IN' test earlier.
This looks really a problem simply solved with a multivalued field.
If you use multivalued field in this structure what you will obtain is one document with ID=10, all the duplicated values will just be there once and all other fields will be multivalued. For example the NAME field will contain 4 different countries and so the country_code.
have a look at this article on how to structure your dataimportHandler to achieve this:
http://wiki.apache.org/solr/DataImportHandler#Full_Import_Example
basically you need one query for each multivalued field:
<!-- Example DIH config: one root entity per item plus one sub-entity (one query) per multivalued field. -->
<dataConfig>
  <dataSource driver="org.hsqldb.jdbcDriver" url="jdbc:hsqldb:/temp/example/ex" user="sa" />
  <document name="products">
    <!-- Root entity: each item row becomes one Solr document. -->
    <entity name="item" query="select * from item">
      <field column="ID" name="id" />
      <field column="code" name="code" />
      <!-- Every row returned here adds one value to the multivalued countryName field. -->
      <!-- The mapped column must be one the query actually selects (name). -->
      <entity name="countryName" query="select name from countrytable where item_id='${item.ID}'">
        <field column="name" name="countryName" />
      </entity>
      <!-- Same pattern for the country codes; the mapping is spelled out instead of relying on implicit name matching. -->
      <entity name="countryCode" query="select countryCode from countrytable where item_id='${item.ID}'">
        <field column="countryCode" name="countryCode" />
      </entity>
    </entity>
  </document>
</dataConfig>
(Posted on behalf of the OP).
SOLUTION
<!-- Fix: albumSalesAreaNames is nested inside albumSalesAreaIds, so ${albumSalesAreaIds.salesAreaId} is resolved once per sales-area row of the parent entity. -->
<entity name="albumSalesAreaIds" query="select salesAreaId from AlbumSalesArea where albumId = '${albums.id}'">
  <entity name="albumSalesAreaNames" query="select name from SalesArea where id = '${albumSalesAreaIds.salesAreaId}'">
    <field column="name" name="albumSalesArea"/>
  </entity>
  <field column="salesAreaId" name="albumSalesAreaId"/>
</entity>
Related
I am trying to create an index with non-anonymous nested classes. My desired output from solr is:
"responseHeader":{
"status":0,
"QTime":8,
"params":{
"q":"discriminator:project",
"indent":"true",
"fl":"*,[child]",
"q.op":"OR",
"_":"1660714908720"}},
"response":{"numFound":1003,"start":0,"numFoundExact":true,"docs":[
{"name":"Project 1",
"id":"315500",
"discriminator":"project",
"_version_":1741444763087798272,
"publicContacts":[
{
"name":"Gurney Halleck",
"id":"315520",
"discriminator":"publicContact",
"_version_":1741444763087798272},
{
"name":"Thufir Hawat",
"id":"315530",
"discriminator":"publicContact",
"_version_":1741444763087798272}]},
I have read and followed: https://solr.apache.org/guide/8_0/indexing-nested-documents.html
and https://solr.apache.org/guide/8_11/indexing-nested-documents.html#indexing-nested-documents
If I add /just/
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
to my schema.xml I can perform a query and get a result with anonymous nested documents returned as childDocuments
"responseHeader":{
"status":0,
"QTime":8,
"params":{
"q":"discriminator:project",
"indent":"true",
"fl":"*,[child]",
"q.op":"OR",
"_":"1660714908720"}},
"response":{"numFound":1003,"start":0,"numFoundExact":true,"docs":[
"name":"Project 1",
"id":"315500",
"discriminator":"project",
"_version_":1741444763087798272,
"_childDocuments_":[
{
"name":"Gurney Halleck",
"id":"315520",
"discriminator":"publicContact",
"_version_":1741444763087798272},
{
"name":"Thufir Hawat",
"id":"315530",
"discriminator":"publicContact",
"_version_":1741444763087798272}]
},
However, if I add
<fieldType name="_nest_path_" class="solr.NestPathField" />
<field name="_nest_path_" type="_nest_path_" stored="true" />
the nesting relationships are not created at all (not even anonymous childDocuments!), but my nested documents are put in the index.
I am using DIH to index the documents:
<!-- Project rows become parent documents; their public contacts are attached as child documents. -->
<entity name="project"
        query="select * from project"
        transformer="RegexTransformer">
  <!-- universal fields -->
  <field column="discriminator" />
  <field column="id" />
  <field column="name" />
  <!-- One child document per contact row belonging to the current project. -->
  <entity name="publicContacts"
          child="true"
          query="select * from project_public_contacts where project_id='${project.id}'">
    <field column="discriminator" />
    <field column="id" />
    <field column="name" />
  </entity>
</entity>
What am I doing wrong?
After digging into this, I have found this is a defect in Solr's DIH. As of 8/29/20, Apache has determined that this defect will not be fixed due to the deprecation of DIH.
https://issues.apache.org/jira/browse/SOLR-14490?page=com.atlassian.jira.plugin.system.issuetabpanels%3Aall-tabpanel
I did find a work around, which is to populate the nest_path yourself in db-data-config.xml. For example:
<!-- Workaround config: each child entity maps a column onto _nest_path_ by hand, because (per this post) DIH does not populate that field itself. -->
<entity name="project" query="select * from project">
<!-- universal fields -->
<field column="discriminator"/>
<field column="id"/>
<field column="name"/>
<!-- Child documents for the public contacts of the current project. -->
<entity child="true" name="publicContacts" query="select * from project_public_contacts where project_id='${project.id}'">
<field column="discriminator"/>
<field column="id"/>
<field column="name"/>
<!-- The nest column carries the nest path value, e.g. /publicContacts. NOTE(review): there is no explicit select list, so presumably select * supplies this column from the table; confirm. -->
<field column="nest" name="_nest_path_"/>
</entity>
<!-- Child documents for the private contacts, wired up the same way. -->
<entity child="true" name="privateContacts" query="select * from project_private_contacts where project_id='${project.id}'">
<field column="discriminator"/>
<field column="id"/>
<field column="name"/>
<!-- Same manual nest path mapping as for publicContacts. -->
<field column="nest" name="_nest_path_"/>
</entity>
</entity>
where the value looks like:
/publicContacts
or whatever you want the property to be named. For more details about how/what the _nest_path_ field should be set to, you can set the field to be stored in schema.xml and then populate the data with the SOLR REST endpoints or other means that are not DIH to see how it's populated. This is how I debugged this issue.
<field name="_nest_path_" type="_nest_path_" stored="true"/>
I also noted that the documentation is incorrect in the SOLR links I provided in my original post. You DO need to have fields defined in schema.xml for the named child documents. I received errors when trying to index through REST endpoints without them. My definitions are:
<field name="publicContacts" type="string" indexed="true" stored="true" required="false" multiValued="true"/>
<field name="privateContacts" type="string" indexed="true" stored="true" required="false" multiValued="true"/>
I am trying to index my database for a question answer website. To start off, I want to index the questions and answers table which has a one to many relationship. I would expect solr to return documents like:
{
'question_id': 1,
'question': 'Is this a question?',
'answers' : [
{
'answer_id': 1,
'answer': 'Maybe'
},
{
'answer_id': 2,
'answer': 'yes it is'
}
]
}
What configuration do I need to achieve this?
I've gone through Configuring the DIH Configuration File tutorial.
Below are the configurations I've tried:
CONFIG 1
<!-- CONFIG 1: one root entity per question row, with the answers fetched by a nested entity keyed on the question id. -->
<dataConfig>
<!-- NOTE(review): connects as root with an empty password; presumably a local test setup only. -->
<dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost/questionsdb" user="root" password=""/>
<document>
<!-- Root entity: the id column is indexed as question_id, title keeps its name. -->
<entity name="questions"
pk="id"
query="SELECT id, title FROM questions">
<field column="id" name="question_id"/>
<field column="title" name="title"/>
<!-- NOTE(review): this sub-entity does not set child="true", so presumably its rows are merged into the question document as plain field values instead of being indexed as nested child documents; confirm against the DIH documentation. -->
<entity name="answers"
pk="id"
query="select id, answer from answers where qid='${questions.id}'">
<field name="answer_id" column="id" />
<field name="answer" column="answer" />
</entity>
</entity>
</document>
</dataConfig>
QUERY OUTPUT:
CONFIG 2
<!-- CONFIG 2: a single entity over a questions/answers JOIN, one result row per answer. -->
<dataConfig>
  <!-- NOTE(review): connects as root with an empty password; presumably a local test setup only. -->
  <dataSource type="JdbcDataSource" driver="com.mysql.jdbc.Driver" url="jdbc:mysql://localhost/questionsdb" user="root" password=""/>
  <document>
    <entity name="questions"
            query="SELECT questions.id as question_id, questions.title as question, answers.id as answer_id, answers.answer as answer FROM questions JOIN answers ON questions.id = answers.qid">
      <!-- The column attributes must use the aliases produced by the query. The result set has no id or title column, only question_id and question. -->
      <field column="question_id" name="question_id"/>
      <field column="question" name="question"/>
      <field column="answer" name="answer"/>
      <field column="answer_id" name="answer_id"/>
    </entity>
  </document>
</dataConfig>
QUERY OUTPUT:
I'm using solr 8.6.
EDIT 1:
Updated my managed-schema file to use multiValued="true":
<field name="question" type="string" indexed="true" stored="true" multiValued="false"/>
<field name="question_id" type="pint" indexed="false" stored="true" multiValued="false"/>
<field name="answer" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="answer_id" type="pint" indexed="false" stored="true" multiValued="true"/>
The output indexes the answers now but the answer and answer_id come up as a list.
Is it possible to restructure them to be returned as a list of dictionaries as given in the example structure above?
I am trying to index nested documents with respect to their parent document, but I do not find the expected structure of indexed data in Solr. Please tell me what is going wrong in the Solr configuration shown below.
table structure:
enter image description here
db-data-config.xml
<!-- Parent rows become Solr documents; each parent's child rows are attached as child documents (child="true"). -->
<document>
  <!-- The last-index timestamp is substituted as text, so it must be quoted to be a valid SQL literal. -->
  <!-- The comparison operator is written as &gt; inside the attribute value. -->
  <!-- NOTE(review): no deltaImportQuery is declared; presumably DIH derives one from query. Confirm that this is intended. -->
  <entity name="parent"
          pk="parent_id"
          query="SELECT parent_id, name, salary, country from parent"
          deltaQuery="select parent_id, name, salary, country from parent where updated_at &gt; '${dataimporter.last_index_time}'">
    <field column="parent_id" name="id" />
    <field column="parent_id" name="parent_id" />
    <field column="name" name="name" />
    <field column="salary" name="salary" />
    <field column="country" name="country" />
    <!-- NOTE(review): the child maps parent_id onto id, so a child document carries the same id value as its parent; confirm whether children need an id of their own. -->
    <entity name="child"
            child="true"
            pk="child_id"
            query="select child.child_id, child.parent_id, child.child_name from child where child.parent_id='${parent.parent_id}' ">
      <field column="parent_id" name="id" />
      <field column="child_id" name="child_id" />
      <field column="child_name" name="child_name" />
    </entity>
  </entity>
</document>
managed-schema:
<!-- parent table fields -->
<!-- The name must be parent_id to match the field that the data config populates (column parent_id, name parent_id). -->
<field name="parent_id" type="text_general" indexed="true" stored="true"/>
<field name="name" type="text_general" indexed="true" stored="true"/>
<field name="salary" type="text_general" indexed="true" stored="true"/>
<field name="country" type="text_general" indexed="true" stored="true"/>
<!-- child table fields -->
<field name="child_id" type="text_general" indexed="true" stored="true"/>
<field name="child_name" type="text_general" indexed="true" stored="true"/>
Result of indexed documents are not nested, it seems flat representation:
"response":{"numFound":4,"start":0,"docs":[
{
"country":"IND",
"parent_id":"1",
"name":"p1",
"salary":"11",
"_version_":1582614969479856128
},
{
"id":"1",
"child_id":"1",
"child_name":"c1",
"_version_":1582614969479856128
},
{
"country":"USA",
"parent_id":"2",
"name":"p2",
"salary":"222",
"_version_":1582614969546964992
},
{
"id":"2",
"child_id":"2",
"child_name":"c2",
"_version_":1582614969546964992
}
]
}
Expected:
"response":{"numFound":4,"start":0,"docs":[
{
"parent_id":"1",
"country":"IND",
"name":"p1",
"salary":"11",
"child":{
"parent_id":"1",
"child_id":"1",
"child_name":"c1",
},
"_version_":1582614969479856128
},
{
"parent_id":"2",
"country":"USA",
"name":"p2",
"salary":"222",
"child":{
"parent_id":"2",
"child_id":"2",
"child_name":"c2",
},
"_version_":1582614969546964992
}
]
}
Solr stores the child docs as independent docs too, so what you see is normal. But there is some plumbing so you can get them back with the parent (and query one layer and get the other etc).
Read carefully this post by Yonik, and see how you must query to get children too etc.
I am using Solr DataImportHandler module. Here is my config;
<!-- Imports Person.Person rows from AdventureWorks2008, with full-import and delta-import queries. -->
<dataConfig>
  <dataSource type="JdbcDataSource"
              name="sql"
              driver="com.microsoft.sqlserver.jdbc.SQLServerDriver"
              url="jdbc:sqlserver://localhost;databaseName=AdventureWorks2008;integratedSecurity=true;"/>
  <document>
    <!-- deltaQuery returns the primary keys of the changed rows. -->
    <!-- deltaImportQuery then reloads each changed row by that key, so it must filter on the real key column (BusinessEntityID) and read the matching ${dih.delta.BusinessEntityID} variable. -->
    <!-- The comparison operator is written as &gt; inside the attribute value. -->
    <entity name="Person" dataSource="sql"
            pk="BusinessEntityID"
            query="select BusinessEntityID,FirstName,LastName FROM [Person].[Person]"
            deltaImportQuery="select BusinessEntityID,FirstName,LastName FROM [Person].[Person] WHERE BusinessEntityID='${dih.delta.BusinessEntityID}'"
            deltaQuery="SELECT BusinessEntityID FROM [Person].[Person] WHERE ModifiedDate &gt; '${dih.last_index_time}'">
      <field column="BusinessEntityID" name="id"/>
      <field column="FirstName" name="firstname"/>
      <field column="LastName" name="lastname"/>
    </entity>
  </document>
</dataConfig>
for some reason, only id field is importing but not the rest.
What would be the reason? Am I missing something?
You might have missed the below entries in the schema.xml file
<field name="id" type="string" indexed="true" stored="true" required="true"/>
<field name="firstname" type="string" indexed="true" stored="true"/>
<field name="lastname" type="string" indexed="true" stored="true"/>
Here type for id can be int. Just check what you want.
<field name="id" type="int" indexed="true" stored="true" required="true"/>
Make sure your id and unique field are set up properly.
I was facing the same issue; I changed the pk and the unique field name, and now it's working fine.
Hi can anybody point me in the right direction for using Solr's Data Import Handler (DIH) to create an array of strings based on the SQL query.
My Solr DIH config looks like this:
<!-- Builds one document per table1 row and collects the matching table2 addresses into ip_address. -->
<dataConfig>
  <dataSource url="jdbc:postgresql://localhost:5432/data"
              driver="org.postgresql.Driver"
              user="xxxxx"
              password="xxxxxx"/>
  <document>
    <!-- Root entity: id and subject come straight from table1. -->
    <entity query="select id, subject from table1" name="item">
      <field name="id" column="id"/>
      <field name="subject" column="subject"/>
      <!-- Sub-entity executed for each item, keyed on the item id. -->
      <entity query="select ip_address from table2 where id='${item.id}'" name="ip_address">
        <field name="ip_address" column="ip_address"/>
      </entity>
    </entity>
  </document>
</dataConfig>
The query on table2 actually returns multiple items so I need this to be reflected in my documents.
e.g. :
{
"numFound": 1,
"start": 0,
"docs": [
{
"id": "29331109",
"subject": "Test document",
"ip_address": [
"88.103.210.139",
"88.103.210.144",
"88.103.210.133"
],
"_version_": 1468439879154139100
}
]
}
This is almost working for me except that Solr is only populating the first ip_address in my documents.
Here's the relevant part of my Schema:
<!-- Custom Field names -->
<field name="serial_number" type="string" indexed="true" stored="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="ip_address" type="string" indexed="true" stored="true" multiValued="true"/>
How is the "ip_address" field defined in schema.xml? It should be multiValued field.