Elasticsearch: set up parent/child using jdbc-river - SQL Server

I am currently reading data from a SQL Server database/table using the jdbc-river. So far I have created an individual type for each table in my database. As the next step in my implementation, I would like to use parent/child types so that I can translate the relationships between my SQL tables and store them.
Table1
Col_id| name| prop1|prop2|prop3
child_table1
col_id| table_id| child_prop1|child_prop2|child_prop3
curl -XPUT 'localhost:9200/_river/parent/_meta' -d '{
  "type" : "jdbc",
  "jdbc" : {
    "driver" : "com.mysql.jdbc.Driver",
    "url" : "jdbc:mysql://localhost:3306/test",
    "user" : "",
    "password" : "",
    "sql" : "select * from table1",
    "index" : "index1",
    "type" : "parent"
  }
}'
curl -XPUT 'localhost:9200/_river/child/_meta' -d '{
  "type" : "jdbc",
  "jdbc" : {
    "driver" : "com.mysql.jdbc.Driver",
    "url" : "jdbc:mysql://localhost:3306/test",
    "user" : "",
    "password" : "",
    "sql" : "select * from child_table1",
    "index" : "index1",
    "type" : "child"
  }
}'
curl -XPOST 'localhost:9200/_river/child/_mapping' -d '{
  "child": {
    "_parent": { "type": "parent" }
  }
}'
I would like to store my data in the following format:
{
  "id": "1",
  "name": "A leading wordsmith",
  "prop1": "data",
  "prop2": "data",
  "prop3": "data",
  "child": [
    {
      "child_prop1": "data",
      "child_prop2": "data",
      "child_prop3": "data"
    },
    {
      "child_prop1": "data1",
      "child_prop2": "data1",
      "child_prop3": "data1"
    }
  ]
}
Can anyone comment on how I can use the jdbc-river to store my data as parent/child types for the above scenario?
UPDATE
Based on feedback, the following is the updated mapping & meta.
curl -XPOST 'http://localhost:9200/library' -d '{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "person": {
      "properties": {
        "person_id": {
          "type": "integer"
        },
        "name": {
          "type": "string"
        }
      }
    },
    "work": {
      "_parent": {
        "type": "person"
      },
      "properties": {
        "person_id": {
          "type": "integer",
          "index": "not_analyzed"
        },
        "name": {
          "type": "string"
        },
        "genre": {
          "type": "string"
        },
        "publisher": {
          "type": "string"
        }
      }
    }
  }
}'
curl -XPUT localhost:9200/_river/person/_meta -d '{
  "type": "jdbc",
  "jdbc": {
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    "url": "jdbc:sqlserver://127.0.0.1:1433;databaseName=blogcontext",
    "user": "sa",
    "password": "password",
    "sql": "select person_id as _id, name from person",
    "poll": "30s"
  },
  "index": {
    "index": "library",
    "type": "person",
    "bulk_size": 500,
    "autocommit": true
  }
}'
curl -XPUT localhost:9200/_river/work/_meta -d '{
  "type": "jdbc",
  "jdbc": {
    "driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
    "url": "jdbc:sqlserver://127.0.0.1:1433;databaseName=blogcontext",
    "user": "sa",
    "password": "password",
    "sql": "select person_id as _parent, name, genre, publisher from work",
    "poll": "30s"
  },
  "index": {
    "index": "library",
    "type": "work",
    "bulk_size": 500,
    "autocommit": true
  }
}'
Log file
[2014-01-14 07:10:35,488][ERROR][OneShotRiverMouth ] bulk [1] error
org.elasticsearch.ElasticSearchIllegalArgumentException: Can't specify parent if no parent field has been configured
at org.elasticsearch.action.index.IndexRequest.process(IndexRequest.java:597)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:165)
at org.elasticsearch.action.bulk.TransportBulkAction.doExecute(TransportBulkAction.java:140)
at org.elasticsearch.action.bulk.TransportBulkAction.doExecute(TransportBulkAction.java:63)
at org.elasticsearch.action.support.TransportAction.execute(TransportAction.java:63)
at org.elasticsearch.client.node.NodeClient.execute(NodeClient.java:92)
at org.elasticsearch.client.support.AbstractClient.bulk(AbstractClient.java:149)
at org.elasticsearch.action.bulk.BulkProcessor.execute(BulkProcessor.java:283)
at org.elasticsearch.action.bulk.BulkProcessor.access$400(BulkProcessor.java:46)
at org.elasticsearch.action.bulk.BulkProcessor$Flush.run(BulkProcessor.java:336)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRunAndReset(FutureTask.java:351)
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:178)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:178)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:724)
Thanks.

Assuming that your tables look like:
table1
table_id| name| prop1|prop2|prop3
child_table1
child_id| table_id| child_prop1|child_prop2|child_prop3
You will need to select your primary row id and alias it as "_id", and your parent id and alias it as "_parent":
curl -XPUT 'localhost:9200/_river/parent/_meta' -d '{
  "type" : "jdbc",
  "jdbc" : {
    "driver" : "com.mysql.jdbc.Driver",
    "url" : "jdbc:mysql://localhost:3306/test",
    "user" : "",
    "password" : "",
    "sql" : "select table_id as _id, name, prop1, prop2, prop3 from table1",
    "index" : "index1",
    "type" : "parent"
  }
}'
curl -XPUT 'localhost:9200/_river/child/_meta' -d '{
  "type" : "jdbc",
  "jdbc" : {
    "driver" : "com.mysql.jdbc.Driver",
    "url" : "jdbc:mysql://localhost:3306/test",
    "user" : "",
    "password" : "",
    "sql" : "select child_id as _id, table_id as _parent, child_prop1, child_prop2, child_prop3 from child_table1",
    "index" : "index1",
    "type" : "child"
  }
}'
Then define the parent/child mapping as you did, and it's done. You can now use parent/child queries on the data.
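For example, a minimal has_child query (a sketch assuming the index1/parent/child names above and the 0.90-era query DSL) to find parents whose children match a value:
curl -XPOST 'localhost:9200/index1/parent/_search' -d '{
  "query" : {
    "has_child" : {
      "type" : "child",
      "query" : {
        "match" : { "child_prop1" : "data" }
      }
    }
  }
}'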
UPDATE:
I used your newest mapping and created a sample database to import data. Everything works fine; I can index parent/child documents without any errors.
I'm using ES 0.9.5 and jdbc-river 2.2.2.
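Given the "no parent field has been configured" error above, one quick check worth doing is confirming the _parent mapping is actually present on the work type before the river runs (assuming the library index from the update):
curl -XGET 'localhost:9200/library/work/_mapping?pretty'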

Related

Druid - descending timestamps with groupBy query

What I'm asking for should be very simple, but the Druid docs have little to no info about this.
I am making a groupBy query, and the data is very large, so I'm "paging" it by increasing limitSpec.limit on each subsequent query.
By default, the returned array starts from the earliest timestamp and moves forward in time. I want the results to start from the latest timestamp and move backwards in time from there.
Does anyone know how to do that?
In other words, by default a groupBy result looks like this:
[
  {
    "version" : "v1",
    "timestamp" : "2012-01-01T00:00:00.000Z",
    "event" : {
      "total_usage" : <some_value_one>
    }
  },
  {
    "version" : "v1",
    "timestamp" : "2012-01-02T00:00:00.000Z",
    "event" : {
      "total_usage" : <some_value_two>
    }
  }
]
Whereas I want it to look like this:
[
  {
    "version" : "v1",
    "timestamp" : "2012-01-02T00:00:00.000Z",
    "event" : {
      "total_usage" : <some_value_two>
    }
  },
  {
    "version" : "v1",
    "timestamp" : "2012-01-01T00:00:00.000Z",
    "event" : {
      "total_usage" : <some_value_one>
    }
  }
]
You can achieve the ordering by using the "columns" attribute in the limitSpec; see the example below.
{
  "type" : "default",
  "limit" : <integer_value>,
  "columns" : [list of OrderByColumnSpec]
}
For more details, refer to the Druid doc: http://druid.io/docs/latest/querying/limitspec.html
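As a filled-in sketch (the dimension name here is made up), ordering by one dimension descending would look like:
{
  "type" : "default",
  "limit" : 1000,
  "columns" : [
    { "dimension" : "some_dimension", "direction" : "descending" }
  ]
}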
You can add the timestamp as a dimension, truncated to date (assuming you use day granularity in your query), and force Druid to sort the result first by dimension values and then by timestamp.
Example Query:
{
  "dataSource": "your_datasource",
  "queryType": "groupBy",
  "dimensions": [
    {
      "type": "default",
      "dimension": "some_dimension_in",
      "outputName": "some_dimension_out",
      "outputType": "STRING"
    },
    {
      "type": "extraction",
      "dimension": "__time",
      "outputName": "__timestamp",
      "extractionFn": {
        "type": "timeFormat",
        "format": "yyyy-MM-dd"
      }
    }
  ],
  "aggregations": [
    {
      "type": "doubleSum",
      "name": "some_metric",
      "fieldName": "some_metric_field"
    }
  ],
  "limitSpec": {
    "type": "default",
    "limit": 1000,
    "columns": [
      {
        "dimension": "__timestamp",
        "direction": "descending",
        "dimensionOrder": "numeric"
      },
      {
        "dimension": "some_metric",
        "direction": "descending",
        "dimensionOrder": "numeric"
      }
    ]
  },
  "intervals": [
    "2019-09-01/2019-10-01"
  ],
  "granularity": "day",
  "context": {
    "sortByDimsFirst": "true"
  }
}

How to include imported fields in the search results?

I'm using document references to import parent fields into a child document. While searches against the parent fields work, the parent fields themselves do not seem to be included in the search results; only child fields are.
To use the example in the documentation, salesperson_name does not appear in the fields entry for id:test:ad::1 when using query=John, or indeed when retrieving id:test:ad::1 via GET directly.
Here's a simplified configuration for my document model:
search definitions
person.sd - the parent
search person {
  document person {
    field name type string {
      indexing: summary | attribute
    }
  }
  fieldset default {
    fields: name
  }
}
event.sd - the child
search event {
  document event {
    field code type string {
      indexing: summary | attribute
    }
    field speaker type reference<person> {
      indexing: summary | attribute
    }
  }
  import field speaker.name as name {}
  fieldset default {
    fields: code
  }
}
documents
p1 - person
{
  "fields": {
    "name": "p1"
  }
}
e1 - event
{
  "fields": {
    "code": "e1",
    "speaker": "id:n1:person::1"
  }
}
query result
curl -s "http://localhost:8080/search/?yql=select%20*%20from%20sources%20*where%20name%20contains%20%22p1%22%3B" | python -m json.tool
This returns both e1 and p1, as you would expect, given that name is present in both. But the fields of e1 do not include the name.
{
  "root": {
    "children": [
      {
        "fields": {
          "documentid": "id:n1:person::1",
          "name": "p1",
          "sddocname": "person"
        },
        "id": "id:n1:person::1",
        "relevance": 0.0017429193899782135,
        "source": "music"
      },
      {
        "fields": {
          "code": "e1",
          "documentid": "id:n1:event::1",
          "sddocname": "event",
          "speaker": "id:n1:person::1"
        },
        "id": "id:n1:event::1",
        "relevance": 0.0017429193899782135,
        "source": "music"
      }
    ],
    ...
    "fields": {
      "totalCount": 2
    }
  }
}
Currently you'll need to add the imported 'name' field to the default summary:
import field speaker.name as name {}
document-summary default {
  summary name type string {}
}
More about explicit document summaries: http://docs.vespa.ai/documentation/document-summaries.html
The result of your query will then be:
"children": [
{
"fields": {
"documentid": "id:n1:person::1",
"name": "p1",
"sddocname": "person"
},
"id": "id:n1:person::1",
"relevance": 0.0017429193899782135,
"source": "stuff"
},
{
"fields": {
"code": "e1",
"documentid": "id:n1:event::1",
"name": "p1",
"sddocname": "event",
"speaker": "id:n1:person::1"
},
"id": "id:n1:event::1",
"relevance": 0.0017429193899782135,
"source": "stuff"
}
],
We'll improve the documentation on this. Thanks for the very detailed write-up.
Add "summary" to the indexing statement of the imported field in the parent document type.
E.g in the documentation example change the "name" field in the "salesperson" document type to say "indexing: attribute | summary".
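A sketch of that change, using the salesperson schema from the documentation example (field and type names come from that example, not from this question):
search salesperson {
  document salesperson {
    field name type string {
      indexing: attribute | summary
    }
  }
}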

mongodb, update array element in array

I have a problem: I need to update a value in a nested array (an array inside an array).
For example, I have a document like this:
{
  "_id" : ObjectId("59eccf5ea7f6ff30be74d8ce"),
  "name" : "some name",
  "description" : "some description",
  "users" : [
    {
      "id" : ObjectId("59d1549f4f5c6f6e0f1d6576"),
      "technologies" : [
        { "id": ObjectId("59450bc718fda360fdf4a719") }
      ]
    },
    {
      "id": ObjectId("59d1549e4f5c6f6e0f1d6571"),
      "technologies": [
        { "id": ObjectId("59450f8318fda360fdf4a78b") },
        { "id": ObjectId("59450bc718fda360fdf4a719") },
        { "id": ObjectId("59450e3f18fda360fdf4a767") }
      ]
    },
    {
      "id": ObjectId("59d154a44f5c6f6e0f1d65af"),
      "technologies": [
        ObjectId("59450f8318fda360fdf4a78b")
      ]
    }
  ]
}
I need to delete an exact technology from an exact user. I know only:
_id - the global document id
userId - the 'users.id' element
technologyId - the 'users.$.technologies.$.id' of the technology item that should be deleted
The MongoDB documentation says I can't use two $ positional operators in an update statement, but maybe there is some way to avoid this?
Try the following:
db.yourColl.update(
  {
    "_id": ObjectId("59eccf5ea7f6ff30be74d8ce"),
    "users.id": ObjectId("59d1549e4f5c6f6e0f1d6571")
  },
  {
    "$pull": {
      "users.$.technologies": {
        "id": ObjectId("59450bc718fda360fdf4a719")
      }
    }
  }
)
The result should be:
{
  "_id" : ObjectId("59eccf5ea7f6ff30be74d8ce"),
  "name" : "some name",
  "description" : "some description",
  "users" : [
    {
      "id" : ObjectId("59d1549f4f5c6f6e0f1d6576"),
      "technologies" : [
        {
          "id" : ObjectId("59450bc718fda360fdf4a719")
        }
      ]
    },
    {
      "id" : ObjectId("59d1549e4f5c6f6e0f1d6571"),
      "technologies" : [
        {
          "id" : ObjectId("59450f8318fda360fdf4a78b")
        },
        {
          "id" : ObjectId("59450e3f18fda360fdf4a767")
        }
      ]
    },
    {
      "id" : ObjectId("59d154a44f5c6f6e0f1d65af"),
      "technologies" : [
        ObjectId("59450f8318fda360fdf4a78b")
      ]
    }
  ]
}
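As a hedged alternative: if you are on MongoDB 3.6+, arrayFilters sidesteps the single-$ limitation entirely. A sketch against the same collection (collection name and ids are from the question):
db.yourColl.update(
  { "_id": ObjectId("59eccf5ea7f6ff30be74d8ce") },
  {
    "$pull": {
      // $[u] addresses whichever element of "users" matches the filter below
      "users.$[u].technologies": { "id": ObjectId("59450bc718fda360fdf4a719") }
    }
  },
  { "arrayFilters": [ { "u.id": ObjectId("59d1549e4f5c6f6e0f1d6571") } ] }
)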

Exact string search in array in Elasticsearch

I want to search for an exact string in an array.
My data in ES looks like this:
{ category": [
"abc test"
],
"es_flag": false,
"bullet_points": [],
"content": "",
"description": false }
I have multiple categories like "abc test", "new abc test", etc.
I am trying the query below, but I am getting results from multiple categories: I searched for "abc test", but the "new abc test" category also comes back in the results.
{
  "from": 0,
  "size": 30,
  "query": {
    "bool" : {
      "must": [
        { "match_phrase": { "category": "abc test" } }
      ]
    }
  },
  "sort": [ { "createdAt": { "order": "desc" } } ]
}
Help will be appreciated.
I'm assuming you are using the default analyzer. In that case, match_phrase against "abc test" will match all documents whose field contains the adjacent tokens abc and test, including:
new abc test
abc test new
foo abc test bar
And it will not match:
abc new test - query tokens are not adjacent
test abc - query tokens are adjacent, but in the wrong order
What would actually help you is using the keyword analyzer for your field (you either need to build a new index from scratch or update your mappings). If you're building from scratch:
curl -XPUT http://localhost:9200/my_index -d '
{
  "mappings": {
    "categories": {
      "properties": {
        "category": {
          "type": "text",
          "analyzer": "keyword"
        }
      }
    }
  }
}'
Afterwards you just need a simple query, e.g. like this (either match or term will do):
curl -XGET http://localhost:9200/my_index/_search -d '
{
  "query": {
    "match" : {
      "category" : "abc test"
    }
  }
}'
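For completeness, the term variant (a sketch under the same keyword-analyzer mapping assumption) would be:
curl -XGET http://localhost:9200/my_index/_search -d '
{
  "query": {
    "term" : {
      "category" : "abc test"
    }
  }
}'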
My version of elasticsearch is 6.0.1. I am using this approach:
GET <your index>/_search
{
  "query": {
    "bool": {
      "must": [{
        "query_string": {
          "query": "category:abc OR category:test"
        }
      }]
    }
  },
  "sort": [{
    "createdAt": {
      "order": "desc"
    }
  }]
}

Join elasticsearch indices while matching fields in nested/inner objects

I am trying to join 2 Elasticsearch indices by using a terms filter lookup. I referred to http://www.elasticsearch.org/blog/terms-filter-lookup/ and http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-terms-filter.html. These examples look up against an array of values like "followers" : ["1", "3"], and the join works fine for data of that shape.
My requirement is to join on a field inside an array of objects. When I extend the above example to use an array of objects, my query fails.
Following is the sample data:
PUT /users/user/2
{
  "followers" : [
    {
      "userId":"1",
      "username":"abc",
      "location":"xyz"
    },
    {
      "userId":"3",
      "username":"def",
      "location":"xyz"
    }
  ]
}
PUT /tweets/tweet/1
{
  "user" : "2"
}
PUT /tweets/tweet/2
{
  "user" : "1"
}
I am now trying to find tweets that are created by the followers of user 2:
POST /tweets/_search
{
  "query" : {
    "filtered" : {
      "filter" : {
        "terms" : {
          "user" : {
            "index" : "users",
            "type" : "user",
            "id" : "2",
            "path" : "followers.userId"
          },
          "_cache_key" : "user_2_friends"
        }
      }
    }
  }
}
I get 0 results for the above query. I tried two other approaches as well: 1) declaring the followers object as a nested object in the mapping and using a "nested" query, and 2) adding a match query for followers.userId after giving the path as "followers". Neither yielded results.
Does the terms filter lookup support arrays of objects? Any pointers to solving my problem would be a great help.
What you're trying to do worked for me, unless I'm missing something. What version of Elasticsearch are you using? I'm using 1.3.4.
So I created both indices and added the docs you listed:
curl -XPUT "http://localhost:9200/users"
curl -XPUT "http://localhost:9200/users/user/2 " -d '
{
"followers" : [
{
"userId":"1",
"username":"abc",
"location":"xyz"
},
{
"userId":"3",
"username":"def",
"location":"xyz"
}
]
}'
curl -XPUT "http://localhost:9200/tweets"
curl -XPUT "http://localhost:9200/tweets/tweet/1 " -d'
{
"user" : "2"
}'
curl -XPUT "http://localhost:9200/tweets/tweet/2 " -d'
{
"user" : "1"
}'
then ran your search query:
curl -XPOST "http://localhost:9200/tweets/_search " -d'
{
"query": {
"filtered": {
"filter": {
"terms": {
"user": {
"index": "users",
"type": "user",
"id": "2",
"path": "followers.userId"
},
"_cache_key": "user_2_friends"
}
}
}
}
}'
and got back this result:
{
  "took": 2,
  "timed_out": false,
  "_shards": {
    "total": 5,
    "successful": 5,
    "failed": 0
  },
  "hits": {
    "total": 1,
    "max_score": 1,
    "hits": [
      {
        "_index": "tweets",
        "_type": "tweet",
        "_id": "2",
        "_score": 1,
        "_source": {
          "user": "1"
        }
      }
    ]
  }
}
Here is the code I used:
http://sense.qbox.io/gist/4a2a2d77d0b6f4502ff6c5022b268acfa65ee6d2
Clear the indices if you have any
curl -XDELETE "http://example.com:9200/currencylookup/"
curl -XDELETE "http://example.com:9200/currency/"
Create the lookup table
curl -XPUT http://example.com:9200/currencylookup/type/2 -d '
{
  "conv" : [
    { "currency":"usd", "username":"abc", "location":"USA" },
    { "currency":"inr", "username":"def", "location":"India" },
    { "currency":"IDR", "username":"def", "location":"Indonesia" }
  ]
}'
Let's put in some dummy docs:
curl -XPUT "http://example.com:9200/currency/type/USA" -d '{ "amount":"100", "currency":"usd", "location":"USA" }'
curl -XPUT "http://example.com:9200/currency/type/JPY" -d '{ "amount":"50", "currency":"JPY", "location":"JAPAN" }'
curl -XPUT "http://example.com:9200/currency/type/INR" -d '{ "amount":"50", "currency":"inr", "location":"INDIA" }'
curl -XPUT "http://example.com:9200/currency/type/IDR" -d '{ "amount":"30", "currency" : "IDR", "location": "Indonesia" }'
Time to check the output
curl http://example.com:9200/currency/_search?pretty -d '{
  "query" : {
    "filtered" : {
      "filter" : {
        "terms" : {
          "currency" : {
            "index" : "currencylookup",
            "type" : "type",
            "id" : "2",
            "path" : "conv.currency"
          },
          "_cache_key" : "currencyexchange"
        }
      }
    }
  }
}'
Results
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 2,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "currency",
      "_type" : "type",
      "_id" : "INR",
      "_score" : 1.0,
      "_source" : { "amount":"50", "currency":"inr", "location":"INDIA" }
    }, {
      "_index" : "currency",
      "_type" : "type",
      "_id" : "USA",
      "_score" : 1.0,
      "_source" : { "amount":"100", "currency":"usd", "location":"USA" }
    } ]
  }
}
Conclusion
Capital letters are the culprit here.
You can see 'IDR' is in caps, so the match failed for it. 'JPY' is not in the lookup, and even if it were, it would not have matched because it is in caps.
Cross-matched values must be lowercase letters or numbers, e.g.:
abc
1abc
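A hedged fix, assuming the default (lowercasing) analyzer on the currency field: store the lookup values in lowercase and 'IDR' will match as well:
curl -XPUT http://example.com:9200/currencylookup/type/2 -d '
{
  "conv" : [
    { "currency":"usd", "username":"abc", "location":"USA" },
    { "currency":"inr", "username":"def", "location":"India" },
    { "currency":"idr", "username":"def", "location":"Indonesia" }
  ]
}'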
