Solr delta import Query exception - solr

This is my DB configuration file. I am able to run a full import properly,
but I am having a problem with the delta import.
There are two exceptions:
a data source exception related to MySQL, and
a DIH (DataImportHandler) exception.
<dataConfig>
  <!-- One JDBC connection to the MySQL "artemis" database, used by every entity below. -->
  <dataSource driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://10.30.2.32:3306/artemis"
              batchSize="-1"
              user="username"
              password="password" />
  <document>
    <!--
      Root entity: one Solr document per row of the job table.
      NOTE(review): deltaQuery selects the column job_id, while deltaImportQuery
      reads ${dataimporter.delta.id}. The delta namespace is addressed by column
      name, so this looks like it should be ${dataimporter.delta.job_id} (confirm).
    -->
    <entity name="job"
            pk="job_id"
            query="SELECT * FROM job"
            deltaImportQuery="SELECT * FROM job WHERE job_id = '${dataimporter.delta.id}'"
            deltaQuery="SELECT job_id FROM job WHERE updated_date > convert_tz('${dataimporter.last_index_time}','+00:00','-05:30')">
      <field column="job_id" name="jobId"/>
      <field column="keywords" name="keywords"/>
      <field column="speaciality" name="speaciality"/>
      <field column="salary_min" name="salaryMin"/>
      <field column="salary_max" name="salaryMax"/>
      <field column="created_date" name="createdDate"/>
      <field column="updated_date" name="updatedDate"/>
      <field column="updated_date" name="updatedDate"/>
      <field column="experience_from" name="experienceFrom"/>
      <field column="experience_to" name="experienceTo"/>
      <field column="job_title" name="jobTitle"/>

      <!--
        Location chain, nested city / district / state / country / region.
        Each level selects its row by a key taken from its parent entity and
        declares a parentDeltaQuery that maps a changed row back to the parent.
      -->
      <entity name="city"
              query="SELECT * FROM city where city_id ='${job.place_of_interview_id}'"
              deltaQuery="select city_id from city where updated_date >convert_tz('${dataimporter.last_index_time}','+00:00','-05:30')"
              parentDeltaQuery="select job_id from job where place_of_interview_id='${city.city_id}'">
        <field column="name" name="city"/>
        <entity name="district"
                query="SELECT * FROM district where district_id='${city.district_id}'"
                deltaQuery="select district_id from district where updated_date > '${dataimporter.last_index_time}'"
                parentDeltaQuery="select city_id from city where district_id='${district.district_id}'">
          <field column="name" name="district"/>
          <entity name="state"
                  query="SELECT * FROM state where state_id='${district.state_id}'"
                  deltaQuery="select state_id from state where updated_date > '${dataimporter.last_index_time}'"
                  parentDeltaQuery="select district_id from district where state_id='${state.state_id}'">
            <field column="name" name="state"/>
            <entity name="country"
                    query="SELECT * FROM country where country_id='${state.country_id}'"
                    deltaQuery="select country_id from country where updated_date > '${dataimporter.last_index_time}'"
                    parentDeltaQuery="select state_id from state where country_id='${country.country_id}'">
              <field column="name" name="country"/>
              <entity name="region"
                      query="SELECT * FROM region where region_id='${country.region_id}'"
                      deltaQuery="select region_id from region where updated_date > '${dataimporter.last_index_time}'"
                      parentDeltaQuery="select country_id from country where region_id='${region.region_id}'">
                <field column="name" name="region"/>
              </entity>
            </entity>
          </entity>
        </entity>
      </entity>

      <!-- Job function of the job, with its category looked up in master_data. -->
      <entity name="jobFunction"
              query="SELECT * FROM job_function where job_function_id='${job.job_function_id}'"
              deltaQuery="select job_function_id from job_function where updated_date > '${dataimporter.last_index_time}'"
              parentDeltaQuery="select job_id from job where job_function_id='${jobFunction.job_function_id}'">
        <field column="name" name="jobFunction"/>
        <entity name="jobCategory"
                query="SELECT * FROM master_data where id='${jobFunction.job_category_id}'"
                deltaQuery="select id from master_data where updated_date > '${dataimporter.last_index_time}'"
                parentDeltaQuery="select job_function_id from job_function where job_category_id='${jobCategory.id}'">
          <field column="name" name="jobCategory"/>
        </entity>
      </entity>

      <!-- Employer row of the job; only company_name is mapped to a Solr field. -->
      <entity name="companyName"
              query="SELECT * FROM employer where employer_id='${job.employer_id}'"
              deltaQuery="select employer_id from employer where updated_date > '${dataimporter.last_index_time}'"
              parentDeltaQuery="select job_id from job where employer_id='${companyName.employer_id}'">
        <field column="company_name" name="companyName"/>
      </entity>
    </entity>
  </document>
</dataConfig>
Exceptions I am getting on delta import:
3/29/2017, 6:25:47 PM
ERROR true
JdbcDataSource
Ignoring Error when closing connection
java.sql.SQLException: Streaming result set com.mysql.jdbc.RowDataDynamic#15196a0f is still active. No statements may be issued when any streaming result sets are open and in use on a given connection. Ensure that you have called .close() on any active streaming result sets before attempting more queries.
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:880)
at com.mysql.jdbc.SQLError.createSQLException(SQLError.java:876)
at com.mysql.jdbc.MysqlIO.checkForOutstandingStreamingData(MysqlIO.java:3111)
at com.mysql.jdbc.MysqlIO.sendCommand(MysqlIO.java:2366)
at com.mysql.jdbc.MysqlIO.sqlQueryDirect(MysqlIO.java:2594)
at com.mysql.jdbc.ConnectionImpl.execSQL(ConnectionImpl.java:2541)
at com.mysql.jdbc.ConnectionImpl.rollbackNoChecks(ConnectionImpl.java:4738)
at com.mysql.jdbc.ConnectionImpl.rollback(ConnectionImpl.java:4630)
at com.mysql.jdbc.ConnectionImpl.realClose(ConnectionImpl.java:4271)
at com.mysql.jdbc.ConnectionImpl.close(ConnectionImpl.java:1515)
at org.apache.solr.handler.dataimport.JdbcDataSource.closeConnection(JdbcDataSource.java:507)
at org.apache.solr.handler.dataimport.JdbcDataSource.close(JdbcDataSource.java:492)
at org.apache.solr.handler.dataimport.DocBuilder.closeEntityProcessorWrappers(DocBuilder.java:288)
at org.apache.solr.handler.dataimport.DocBuilder.closeEntityProcessorWrappers(DocBuilder.java:290)
at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:277)
at org.apache.solr.handler.dataimport.DataImporter.doDeltaImport(DataImporter.java:444)
at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:482)
at org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:461)
3/29/2017, 6
java.lang.RuntimeException: java.lang.IllegalArgumentException: deltaQuery has no column to resolve to declared primary key pk='jobId'
at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:270)
at org.apache.solr.handler.dataimport.DataImporter.doDeltaImport(DataImporter.java:444)
at org.apache.solr.handler.dataimport.DataImporter.runCmd(DataImporter.java:482)
at org.apache.solr.handler.dataimport.DataImporter$1.run(DataImporter.java:461)
Caused by: java.lang.IllegalArgumentException: deltaQuery has no column to resolve to declared primary key pk='jobId'
at org.apache.solr.handler.dataimport.DocBuilder.findMatchingPkColumn(DocBuilder.java:755)
at org.apache.solr.handler.dataimport.DocBuilder.collectDelta(DocBuilder.java:808)
at org.apache.solr.handler.dataimport.DocBuilder.collectDelta(DocBuilder.java:789)
at org.apache.solr.handler.dataimport.DocBuilder.doDelta(DocBuilder.java:344)
at org.apache.solr.handler.dataimport.DocBuilder.execute(DocBuilder.java:224)
... 3 more

Can you replace '${dataimporter.delta.id}' with ${dataimporter.delta.job_id} in the deltaImportQuery and test again?
From the DataImportHandler wiki: "deltaImportQuery: (Only used in delta-import). There is a namespace ${dih.delta.column-name} which can be used in this query, e.g.: select * from tbl where id=${dih.delta.id}" (Solr 1.4).
As the line above says, the delta namespace only accepts a column name, and your deltaQuery returns the column job_id, not id.
reference:
https://wiki.apache.org/solr/DataImportHandler
https://wiki.apache.org/solr/DataImportHandler#Configuration_in_data-config.xml

Related

Solr DataImport set field to specific value [duplicate]

I'm making an index in Solr from a DB in the following way:
<document name="Index">
  <!-- Rows of table C; the Name column is mapped to the Solr field "name". -->
  <entity name="c" query="SELECT * FROM C">
    <field column="Name" name="name"/>
  </entity>
  <!-- Rows of table P, mapped the same way. -->
  <entity name="p" query="SELECT * FROM P">
    <field column="Name" name="name"/>
  </entity>
</document>
Is it possible to have a static field, set for each row, that signifies which type is returned to the client, so that one can make a call to the right database table based on that information from the JSON result?
That is a field that has no column in the table
<field name="id" value="1"/>
Or is there another way to solve this?
<document name="Index">
  <!-- TemplateTransformer fills the "id" column from the literal template value. -->
  <entity name="c" transformer="TemplateTransformer" query="SELECT * FROM C">
    <field column="Name" name="name"/>
    <field column="id" template="1"/>
  </entity>
  <!-- Same arrangement for table P. -->
  <entity name="p" transformer="TemplateTransformer" query="SELECT * FROM P">
    <field column="Name" name="name"/>
    <field column="id" template="1"/>
  </entity>
</document>
You can add a column to your SQL query that contains static data like this:
<document name="Index">
  <!-- The constant 'foo' is selected as an extra column and mapped to "name". -->
  <entity name="c" query="SELECT *, 'foo' as NameFromC FROM C">
    <field column="NameFromC" name="name"/>
  </entity>
  <!-- The constant 'bar' plays the same role for table P. -->
  <entity name="p" query="SELECT *, 'bar' as NameFromP FROM P">
    <field column="NameFromP" name="name"/>
  </entity>
</document>
If you try to add a field with only name and template attributes, Solr will throw an error saying Field must have a column attribute.

Solr dataimport change dataSource dynamically

I have done the following settings for dataimport from about 20 mdb files using ucanaccess:
<?xml version="1.0" encoding="UTF-8" ?>
<dataConfig>
  <!-- "a" is the main database; a1, a2, ... each point at one .mdb file under E:/feqh/A. -->
  <dataSource name="a" driver="net.ucanaccess.jdbc.UcanaccessDriver" type="JdbcDataSource" url="jdbc:ucanaccess://E:/feqh/main.mdb;memory=false" />
  <dataSource name="a1" driver="net.ucanaccess.jdbc.UcanaccessDriver" type="JdbcDataSource" url="jdbc:ucanaccess://E:/feqh/A/1.mdb;memory=false" />
  <dataSource name="a2" driver="net.ucanaccess.jdbc.UcanaccessDriver" type="JdbcDataSource" url="jdbc:ucanaccess://E:/feqh/A/2.mdb;memory=false" />
  <dataSource name="a3" driver="net.ucanaccess.jdbc.UcanaccessDriver" type="JdbcDataSource" url="jdbc:ucanaccess://E:/feqh/A/3.mdb;memory=false" />
  <dataSource name="a4" driver="net.ucanaccess.jdbc.UcanaccessDriver" type="JdbcDataSource" url="jdbc:ucanaccess://E:/feqh/A/4.mdb;memory=false" />
  <!-- and so on -->
  <document>
    <!-- One document per book, read from the main database. -->
    <entity name="Book"
            dataSource="a"
            query="select bkid AS id, bkid AS BookID,bk AS BookTitle, betaka AS BookInfo, cat as cat from 0bok">
      <field column="id" name="id"/>
      <field column="BookID" name="BookID"/>
      <field column="BookTitle" name="BookTitle"/>
      <field column="cat" name="cat"/>

      <!-- Category row of the book, also read from the main database. -->
      <entity name="Category"
              dataSource="a"
              query="select name as CatName, catord as CatWeight, Lvl as CatLevel from 0cat where id = ${Book.CAT}">
        <field column="CatName" name="CatName"/>
        <field column="CatWeight" name="CatWeight"/>
        <field column="CatLevel" name="CatLevel"/>
      </entity>

      <!--
        Pages of the book; the table name is built from the book id.
        NOTE(review): dataSource "a5" is not among the declarations shown above;
        presumably it is one of those covered by the "and so on" placeholder.
      -->
      <entity name="Pages"
              dataSource="a5"
              onError="continue"
              query="SELECT nass AS PageContent, page AS pageNum FROM b${Book.ID} ORDER BY page">
        <field column="PageContent" name="PageContent" />
        <field column="PageNum" name="PageNum" />

        <!-- Titles that belong to the current page, read from the same data source. -->
        <entity name="Titles"
                dataSource="a5"
                onError="continue"
                query="SELECT * FROM t${Book.ID} WHERE id = ${Pages.PAGE} ORDER BY sub">
          <field column="ID" name="TitleID"/>
          <field column="TIT" name="PageTitle"/>
          <field column="SUB" name="TitleWeight"/>
          <field column="LVL" name="TitleLevel"/>
        </entity>
      </entity>
    </entity>
  </document>
</dataConfig>
Every time I wanted to import from a different dataSource, I had to change the dataSource attribute manually for both the Pages and Titles entities, then perform a dataimport without clean. Now, with more than 600 mdb files, that is not a wise option. Is there any way to loop inside the config? In other words: there is a main mdb file that handles all book titles and categories, and every book has its own mdb file named after its id (for example, 245.mdb for the book with id 245), so I need to change the dataSource for Pages and Titles dynamically.
You cannot create dataSources in a loop, but I believe you can pass dataSource information in a parameter variable. So, perhaps, you can loop over your collection outside of Solr and then trigger DIH with the correct source as a parameter variable.
Just ensure to run DIH in sync mode to avoid different calls stepping on each other (I think the param is syncMode)

Solr - DataImportHandler Not Working

I have a simple DataImportHandler that works on my local system but not on my server. Both run the same version of Solr, i.e. Solr 4.6.0.
I've tried these configurations for DataImportHandler:
Configuration 1:
<dataConfig>
  <dataSource type="JdbcDataSource"
              driver="org.postgresql.Driver"
              url="jdbc:postgresql://HOST:5432/mydb"
              user="admin"
              password="admin" />
  <!-- Script transformer: adds a resource_uri value built from the row's id. -->
  <script><![CDATA[
    function generate_resource_uri(row) {
      row.put('resource_uri', '/api/v1/product/' + row.get('id') + '/');
      return row;
    }
  ]]></script>
  <document>
    <!-- Root entity: one document per products_product row, passed through the script above. -->
    <entity name="products_product"
            query="SELECT id, image_url, impression_url, product_url, manufacturer_name, discount_percentage, short_description, merchant_name, product_name, sku, long_description, date_modified, merchant_id, commission, keywords, product_id, retail_price, date_created FROM products_product"
            transformer="script:generate_resource_uri" >
      <!-- The three child lookups below all use CachedSqlEntityProcessor. -->
      <entity name="source"
              query="select title from products_source where id = '${products_product.id}'"
              processor="CachedSqlEntityProcessor">
        <field column="title" name="source"/>
      </entity>
      <entity name="currency"
              query="select code from products_currency where id = '${products_product.id}'"
              processor="CachedSqlEntityProcessor">
        <field column="code" name="currency"/>
      </entity>
      <entity name="category"
              query="select title from products_category where id = '${products_product.id}'"
              processor="CachedSqlEntityProcessor">
        <field column="title" name="category"/>
      </entity>
    </entity>
  </document>
</dataConfig>
Configuration 2:
<dataConfig>
  <dataSource type="JdbcDataSource"
              driver="org.postgresql.Driver"
              url="jdbc:postgresql://HOST:5432/mydb"
              user="admin"
              password="123456"/>
  <!-- Script transformer: adds a resource_uri value built from the row's id. -->
  <script><![CDATA[
    function generate_resource_uri(row) {
      row.put('resource_uri', '/api/v1/product/' + row.get('id') + '/');
      return row;
    }
  ]]></script>
  <document>
    <!-- Root entity: one document per products_product row, passed through the script above. -->
    <entity name="products_product"
            query="SELECT id, image_url, impression_url, product_url, manufacturer_name, discount_percentage, short_description, merchant_name, product_name, sku, long_description, date_modified, merchant_id, commission, keywords, product_id, retail_price, date_created FROM products_product"
            transformer="script:generate_resource_uri" >
      <!-- The three child lookups below use the cachePk / cacheLookup / cacheImpl attributes. -->
      <entity name="source"
              query="select title from products_source where id = '${products_product.id}'"
              cachePk="id"
              cacheLookup="products_product.id"
              cacheImpl="SortedMapBackedCache">
        <field column="title" name="source"/>
      </entity>
      <entity name="currency"
              query="select code from products_currency where id = '${products_product.id}'"
              cachePk="id"
              cacheLookup="products_product.id"
              cacheImpl="SortedMapBackedCache">
        <field column="code" name="currency"/>
      </entity>
      <entity name="category"
              query="select title from products_category where id = '${products_product.id}'"
              cachePk="id"
              cacheLookup="products_product.id"
              cacheImpl="SortedMapBackedCache">
        <field column="title" name="category"/>
      </entity>
    </entity>
  </document>
</dataConfig>
Locally I have approx. 2K rows, which are indexed properly, and all the child entities show up.
On the server, the fields from the child entities are not showing up, i.e. source, category and currency. The server has approx. 6M rows; it may be a silly doubt, but I hope that memory is not the issue. My server is running on an m1.medium EC2 instance, Ubuntu 12.04 LTS.
Thanks in Advance :)

Static field for document in Data Import Handler for Solr

I'm making an index in Solr from a DB in the following way:
<document name="Index">
  <!-- Rows of table C; the Name column is mapped to the Solr field "name". -->
  <entity name="c" query="SELECT * FROM C">
    <field column="Name" name="name"/>
  </entity>
  <!-- Rows of table P, mapped the same way. -->
  <entity name="p" query="SELECT * FROM P">
    <field column="Name" name="name"/>
  </entity>
</document>
Is it possible to have a static field, set for each row, that signifies which type is returned to the client, so that one can make a call to the right database table based on that information from the JSON result?
That is a field that has no column in the table
<field name="id" value="1"/>
Or is there another way to solve this?
<document name="Index">
  <!-- TemplateTransformer fills the "id" column from the literal template value. -->
  <entity name="c" transformer="TemplateTransformer" query="SELECT * FROM C">
    <field column="Name" name="name"/>
    <field column="id" template="1"/>
  </entity>
  <!-- Same arrangement for table P. -->
  <entity name="p" transformer="TemplateTransformer" query="SELECT * FROM P">
    <field column="Name" name="name"/>
    <field column="id" template="1"/>
  </entity>
</document>
You can add a column to your SQL query that contains static data like this:
<document name="Index">
  <!-- The constant 'foo' is selected as an extra column and mapped to "name". -->
  <entity name="c" query="SELECT *, 'foo' as NameFromC FROM C">
    <field column="NameFromC" name="name"/>
  </entity>
  <!-- The constant 'bar' plays the same role for table P. -->
  <entity name="p" query="SELECT *, 'bar' as NameFromP FROM P">
    <field column="NameFromP" name="name"/>
  </entity>
</document>
If you try to add a field with only name and template attributes, Solr will throw an error saying Field must have a column attribute.

Struggling with learning solr

I am in the process of redesigning one of our companies site. My boss wants to play around with the idea of replacing all of our navigation with a search box.. the search box should be able to query any of our tables of unrelated data.
So right now I am trying it with 5 tables.
Products
Manufacturers
Category
Ingredients
Uses
So should be able to lookup a product name, a manufacturer name, a category name, an ingredient name, or a use name
When I retrieve the results. if the user clicked on a manufacturer search result.. It will take them to a manufacturer page that lookups all products for that manufacturer.
When clicks on a product page.. link will take them to that actual product information.
Ingredient will take them to a page that will show all products containing that ingredient.
Anyways here is my data config
<dataConfig>
  <dataSource driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://localhost:3306/xxx"
              user="xxx"
              password="xxx" />
  <document>
    <!--
      Five independent root entities. Each one sets a constant "type" value and
      a templated "id" carrying a type-specific prefix (MAN-, PROD-, ING-, USE-,
      CATEGORY-).
    -->
    <entity name="manufacturer"
            transformer="TemplateTransformer"
            pk="manNum"
            query="SELECT manNum, manName FROM manufacturer WHERE active = 1">
      <field column="id" name="id" template="MAN-${manNum}" />
      <field column="type" template="manufacturer" name="type"/>
      <field column="manName" name="text"/>
      <field column="manNum" name="manNum"/>
    </entity>

    <entity name="product"
            transformer="TemplateTransformer"
            query="SELECT products.prodNum, products.prodName as text, m.manName FROM products JOIN man m USING (manNum) WHERE products.active = 1 AND (hideWeb = 0 or hideWeb IS NULL)">
      <field column="id" template="PROD-${products.prodNum}" name="id"/>
      <field column="type" template="product" name="type"/>
      <field column="text" name="text"/>
      <field column="manName" name="manName"/>
    </entity>

    <entity name="ingredients"
            transformer="TemplateTransformer"
            pk="id"
            query="SELECT id, text FROM inglist WHERE sort != ''">
      <field column="id" name="id" template="ING-${inglist.id}"/>
      <field column="type" template="ingredient" name="type"/>
      <field column="text" name="text" />
    </entity>

    <entity name="uses"
            transformer="TemplateTransformer"
            pk="id"
            query="SELECT id, text FROM useslist">
      <field column="id" name="id" template="USE-${id}"/>
      <field column="type" template="use" name="type"/>
      <field column="text" name="text"/>
    </entity>

    <entity name="categories"
            transformer="TemplateTransformer"
            pk="id"
            query="SELECT id, textShow as text FROM categorylist">
      <field column="id" name="id" template="CATEGORY-${id}"/>
      <field column="type" template="category" name="type"/>
      <field column="text" name="text"/>
    </entity>
  </document>
</dataConfig>
And my schema..
<fields>
  <!-- Only "id" and "text" are indexed; the other fields are stored or import-only. -->
  <field name="id" type="string" indexed="true" stored="true"/>
  <field name="text" type="text" indexed="true" stored="true"/>
  <field name="type" type="string" indexed="false" stored="true"/>
  <field name="manName" type="text" indexed="false" stored="true"/>
  <field name="manNum" type="string" indexed="false" stored="false"/>
</fields>
Now perhaps I am not doing this the right way... and there may be a better way to handle this.
Anyway, the problem I am running into right now is that I am getting the error: missing required field "id". The products query and the manufacturer query do not have an id column in the select, but I thought the transformer should take care of it? If I do select prodNum as id, then all the ids overwrite each other.
Now I could probably concat it in the actual query.. and will do so as a last resort, but would like to know what I am doing wrong with this solution.
EDIT
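Never mind, it was just a noob issue: for some reason I was thinking that the template variable referred to the table name in the SQL, not the entity name.
So I replaced all of the table-name references (the original snippet was lost; presumably ones such as ${products.prodNum})
With entity-name references (presumably ${product.prodNum})
And it worked.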
Prefixing the table-specific ID with a distinct character or string is a good idea. I do it in the SQL, which allows me to check the behavior outside of Solr.
select
concat('b',cast(b.id as char)) as id,
...
It Was a noob issue,
for some reason I was thinking that the template variable was refering to the table name in the SQL not the entity name.
I do it like this:
<!-- Every template variable below is prefixed with the entity name. -->
<entity name="GG-Boryslaw-1939-Phonebook"
        transformer="TemplateTransformer,DateFormatTransformer"
        pk="id"
        query="SELECT * FROM boryslaw_1939_phonebook">
  <field column="record_id" template="GG-Boryslaw-1939-Phonebook-${GG-Boryslaw-1939-Phonebook.id}" />
  <field column="record_type" template="phonebook" />
  <field column="record_source" template="Boryslaw Phonebook (1939)" />
  <field column="record_date" template="${GG-Boryslaw-1939-Phonebook.Year}" dateTimeFormat="yyyy" />
  ...etc...
</entity>

Resources