Join elasticsearch indices while matching fields in nested/inner objects - arrays

I am trying to join 2 elasticsearch indices by using terms filter lookup. I referred to http://www.elasticsearch.org/blog/terms-filter-lookup/ and http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/query-dsl-terms-filter.html. These Examples lookup on an array of fields like "followers" : ["1", "3"] and join works fine for similar data.
My requirement is to join with a field inside an array of objects. When I extend the above example to include an array of objects, my query fails.
Following is the sample data:
PUT /users/user/2 {
"followers" : [
{
"userId":"1",
"username":"abc",
"location":"xyz"
},
{
"userId":"3",
"username":"def",
"location":"xyz"
}
}
]
}
PUT /tweets/tweet/1 {
"user" : "2"
}
PUT /tweets/tweet/2 {
"user" : "1"
}
I am now trying to find tweets that are created by followers of user 2
POST /tweets/_search {
"query" : {
"filtered" : {
"filter" : {
"terms" : {
"user" : {
"index" : "users",
"type" : "user",
"id" : "2",
"path" : "followers.userId"
},
"_cache_key" : "user_2_friends"
}
}
}
}
}
My search results are 0 for above query. I tried 2 other approaches as well 1)declare the followers object as a nested object during mapping and use "nested" in the query, 2)tried to add a match query for followers.userId after giving path as "followers". None yielded results.
Does terms filter lookup support array of objects? Any pointers to solving my problem would be of great help

What you're trying to do worked for me, unless I'm missing something. What version of Elasticsearch are you using? I'm using 1.3.4.
So I created both indices and added the docs you have listed:
curl -XPUT "http://localhost:9200/users"
curl -XPUT "http://localhost:9200/users/user/2 " -d '
{
"followers" : [
{
"userId":"1",
"username":"abc",
"location":"xyz"
},
{
"userId":"3",
"username":"def",
"location":"xyz"
}
]
}'
curl -XPUT "http://localhost:9200/tweets"
curl -XPUT "http://localhost:9200/tweets/tweet/1 " -d'
{
"user" : "2"
}'
curl -XPUT "http://localhost:9200/tweets/tweet/2 " -d'
{
"user" : "1"
}'
then ran your search query:
curl -XPOST "http://localhost:9200/tweets/_search " -d'
{
"query": {
"filtered": {
"filter": {
"terms": {
"user": {
"index": "users",
"type": "user",
"id": "2",
"path": "followers.userId"
},
"_cache_key": "user_2_friends"
}
}
}
}
}'
and got back this result:
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 1,
"max_score": 1,
"hits": [
{
"_index": "tweets",
"_type": "tweet",
"_id": "2",
"_score": 1,
"_source": {
"user": "1"
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/4a2a2d77d0b6f4502ff6c5022b268acfa65ee6d2

Clear the indices if you have any
curl -XDELETE "http://example.com:9200/currencylookup/"
curl -XDELETE "http://example.com:9200/currency/"
Create the lookup table
curl -XPUT http://example.com:9200/currencylookup/type/2 -d '
{ "conv" : [
{ "currency":"usd","username":"abc", "location":"USA" },
{ "currency":"inr", "username":"def", "location":"India" },
{ "currency":"IDR", "username":"def", "location":"Indonesia" }]
}'
Lets put some dummy docs
curl -XPUT "http://example.com:9200/currency/type/USA" -d '{ "amount":"100", "currency":"usd", "location":"USA" }'
curl -XPUT "http://example.com:9200/currency/type/JPY" -d '{ "amount":"50", "currency":"JPY", "location":"JAPAN" }'
curl -XPUT "http://example.com:9200/currency/type/INR" -d '{ "amount":"50", "currency":"inr", "location":"INDIA" }'
curl -XPUT "http://example.com:9200/currency/type/IDR" -d '{ "amount":"30", "currency" : "IDR", "location": "Indonesia" }'
Time to check the output
curl http://example.com:9200/currency/_search?pretty -d '{
"query" : {
"filtered" : {
"filter" : {
"terms" : {
"currency" : {
"index" : "currencylookup",
"type" : "type",
"id" : "2",
"path" : "conv.currency"
},
"_cache_key" : "currencyexchange"
}
}
}
}
}'
Results
# curl http://example.com:9200/currency/_search?pretty -d '{
"query" : {
"filtered" : {
"filter" : {
"terms" : {
"currency" : {
"index" : "currencylookup",
"type" : "type",
"id" : "2",
"path" : "conv.currency"
},
"_cache_key" : "currencyexchange"
}
}
}
}
}'
{
"took" : 2,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 1.0,
"hits" : [ {
"_index" : "currency",
"_type" : "type",
"_id" : "INR",
"_score" : 1.0,
"_source":{ "amount":"50", "currency":"inr", "location":"INDIA" }
}, {
"_index" : "currency",
"_type" : "type",
"_id" : "USA",
"_score" : 1.0,
"_source":{ "amount":"100", "currency":"usd", "location":"USA" }
} ]
}
}
Conclusion
Capital letters are culprit here.
You can see 'IDR' is in caps so the match is failed for it and 'JPY' is not in look up even if it was there it would not have got matched because it is in caps.
cross matching values must be in small letters or numbers like
eg:
abc
1abc

Related

How to remove a document inside an array in mongodb using $pull

I have the following document structure
{
"_id" : ObjectId("5ffef283f1f06ff8524aa2c2"),
"applicationName" : "TestApp",
"pName" : "",
"environments" : [],
"stages" : [],
"createdAt" : ISODate("2021-01-15T09:51:35.546Z"),
"workflows" : [
[
{
"pName" : "Test1",
"wName" : "TestApp_Test1",
"agent" : ""
},
{
"pName" : "Test2",
"wName" : "TestApp_Test2",
"agent" : ""
}
],
[
{
"pName" : "Test1",
"wName" : "TestApp_Test1",
"agent" : ""
}
]
],
"updatedAt" : Date(-62135596800000)
}
I wish to remove the occurrences of
{
"pName" : "Test1",
"wName" : "TestApp_Test1",
"agent" : ""
}
The resultant document should look like
{
"_id" : ObjectId("5ffef283f1f06ff8524aa2c2"),
"applicationName" : "TestApp",
"pName" : "",
"environments" : [],
"stages" : [],
"createdAt" : ISODate("2021-01-15T09:51:35.546Z"),
"workflows" : [
[
{
"pName" : "Test2",
"wName" : "TestApp_Test2",
"agent" : ""
}
]
],
"updatedAt" : Date(-62135596800000)
}
I've tried the below mongo query
db.getCollection('workflows').update({_id:ObjectId('5ffef283f1f06ff8524aa2c2')},
{$pull:{workflows: { $elemMatch: {pipelineName: 'Test1'}}}} )
This is removing all the documents from workflows field including Test2 since Test1 is matched.
How can we remove only the entries for Test1 and keep the others?
You can do it using the positional operator "$[]" :
db.getCollection('workflows').update({_id: ObjectId("5ffef283f1f06ff8524aa2c2") }, {$pull: {"workflows.$[]":{pName:"Test1" } } } )
but the schema looks abit strange and after the update you will have empty arrays inside workflows if all elements got deleted in the sub-array.
To fix the empty sub-arrays you will need to perform second operation to remove them:
db.getCollection('workflows').update({_id: ObjectId("5ffef283f1f06ff8524aa2c2") }, {$pull: {"workflows":[] } } )
You cannot use $elemMatch as it returns the first matching element in the array.
I am not sure there is another best way to do this with the provided schema design.
play
db.collection.aggregate({
"$unwind": "$workflows"
},
{
"$unwind": "$workflows"
},
{
"$match": {
"workflows.pName": {
"$ne": "Test1"
}
}
},
{
"$group": {
"_id": "$_id",
workflows: {
$push: "$workflows"
},
applicationName: {
"$first": "$applicationName"
}
}
},
{
"$group": {
"_id": "$_id",
workflows: {
$push: "$workflows"
},
applicationName: {
"$first": "$applicationName"
}
}
})
unwind twice required to de-normalize the data
match to filter out the unnecessary doc
group twice required to bring the required output
You can save this to a collection using $out as last stage.

Can $in and $or replace each other in MongoDB?

Can $in and $or replace each other in MongoDB?
db.restaurants.find(
{
"borough" :{$in :["Staten Island","Queens","Bronx","Brooklyn"]}},
{
"restaurant_id" : 1,
"name":1,"borough":1,
"cuisine" :1
}
);
db.restaurants.find(
{
"borough": "Bronx" ,
$or : [
{ "cuisine" : "American " },
{ "cuisine" : "Chinese" }
]
}
);
Here I observe that both these queries require us to choose from some options:
Does it make sense to replace $in in the first query with $or as follows:
db.restaurants.find(
{ $or: [{ borough: 'Staten Island',
borough: 'Queens',
borough: 'Bronx',
borough: 'Brooklyn' }],
{ _id : 1,
name: 1,
borough : 1,
cuisine : 1
}
})
Are $in and $or replaceable?
Update:
I tried to use two queries in a hope to get identical results:
Why is the second query selecting two rows of status 'D' only?
> db.inventory.find( {status : { $in: [ 'A', 'D'] }}, {item:1, status: 1})
{ "_id" : ObjectId("5eb67598bee5213484d45087"), "item" : "journal", "status" : "A" }
{ "_id" : ObjectId("5eb67598bee5213484d45088"), "item" : "notebook", "status" : "A" }
{ "_id" : ObjectId("5eb67598bee5213484d45089"), "item" : "paper", "status" : "D" }
{ "_id" : ObjectId("5eb67598bee5213484d4508a"), "item" : "planner", "status" : "D" }
{ "_id" : ObjectId("5eb67598bee5213484d4508b"), "item" : "postcard", "status" : "A" }
>
>
> db.inventory.find( {$or: [ {status: 'A', status: 'D'} ] }, {item:1, status: 1})
{ "_id" : ObjectId("5eb67598bee5213484d45089"), "item" : "paper", "status" : "D" }
{ "_id" : ObjectId("5eb67598bee5213484d4508a"), "item" : "planner", "status" : "D" }
>
From their official documentation itself or-versus-in :
When using $or with that are equality checks for the
value of the same field, use the $in operator instead of the $or
operator.
If you've docs like below :
[
{
"price": 100
},
{
"price": 200
},
{
"price": 300
},
{
"price": 400
},
{
"price": 500
}
]
If you wanted to get docs where price is equal to 100 or 500, query like :
db.collection.find({ price: { $in: [ 100, 500 ] } })
By doing like above, query is simple & clean. You can also use $or instead of $in but why would you loose shorthand notation and try to make your query look bulky by adding more objects of same field again and again ?
By default if you wanted to do logical OR on two different operators you would use $or, But when to use $or on same field :
db.collection.find({ $or: [ { price: { $lt: 200 } }, { price: { $gt: 400 } } ] })
As like above when you've multiple different conditions to match on same field you'll use it.
These two queries yield same result when executed but when you use $in - if input values are straight numbers or can be strings or other types where input values will exactly match with values of price field in docs, but when you use $or you're checking for different conditions on same field.
Test : mongoplayground

mongodb, update array element in array

i have a trouble.
i need to update value in nected array (array in array).
For example i have document like this:
{
"_id" : ObjectId("59eccf5ea7f6ff30be74d8ce"),
"name" : "some name",
"description" : "some description",
"users" : [
{
"id" : ObjectId("59d1549f4f5c6f6e0f1d6576"),
"technologies" : [
{"id": ObjectId("59450bc718fda360fdf4a719")},
]
},
{
"id": ObjectId("59d1549e4f5c6f6e0f1d6571"),
"technologies": [
{"id": ObjectId("59450f8318fda360fdf4a78b")},
{"id": ObjectId("59450bc718fda360fdf4a719")},
{"id": ObjectId("59450e3f18fda360fdf4a767")}
]
},
{
"id": ObjectId("59d154a44f5c6f6e0f1d65af"),
"technologies": [
ObjectId("59450f8318fda360fdf4a78b")
]
}
]
}
i need to delete exact technology from exact user. i know only:
_id - global document id
userId: 'users.id' element
technologyId: 'users.$.technologies.$.id' id of technology item that should be deleted
documentation of mongo says that i cant use two $ in update statement, but maybe is exists some actions to awoid this?
Try the following:
db.yourColl.update(
{
"_id": ObjectId("59eccf5ea7f6ff30be74d8ce"),
"users.id": ObjectId("59d1549e4f5c6f6e0f1d6571")
},
{
"$pull": {
"users.$.technologies": {
"id": ObjectId("59450bc718fda360fdf4a719")
}
}
}
)
The result should be:
{
"_id" : ObjectId("59eccf5ea7f6ff30be74d8ce"),
"name" : "some name",
"description" : "some description",
"users" : [
{
"id" : ObjectId("59d1549f4f5c6f6e0f1d6576"),
"technologies" : [
{
"id" : ObjectId("59450bc718fda360fdf4a719")
}
]
},
{
"id" : ObjectId("59d1549e4f5c6f6e0f1d6571"),
"technologies" : [
{
"id" : ObjectId("59450f8318fda360fdf4a78b")
},
{
"id" : ObjectId("59450e3f18fda360fdf4a767")
}
]
},
{
"id" : ObjectId("59d154a44f5c6f6e0f1d65af"),
"technologies" : [
ObjectId("59450f8318fda360fdf4a78b")
]
}
]
}

Finding a document with a specific term on an array

I'm trying to find a document containing a specific term in an array of strings.
I have a schema like this:
{
"pages": {
"mappings":{
"site":{
"properties":{
"urls":{"type":"string"}
}
}
}
}
}
And the following data indexed on it:
% curl -XPOST 'http://local.dev:9200/pages/site/_search?pretty
{
...
"hits" : {
"total" : 1,
"max_score" : 1.0,
"hits" : [ {
"_index" : "pages",
"_type" : "site",
"_id" : "ae634fea-878f-42ca-8239-c67cca007a38",
"_score" : 1.0,
"_source":{ "urls":["https://github.com/fulano","http://fulano.com"] }
}
}
I'm trying to search for sites whose urls array contains a specific url, but I can't make it work. I tried using terms - exactly as described here but I never get any results:
% curl -XPOST 'http://local.dev:9200/pages/site/_search?pretty' -d '
{
"query": {
"filtered": {
"filter": {
"term": { "urls": "https://github.com/fulano" }
}
}
}
}'
{
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
Using terms (that gets expanded into a series of bool operations by elastic):
% curl -XPOST 'http://local.dev:9200/pages/site/_search?pretty' -d '
{
"query": {
"terms" : {
"urls" : ["https://github.com/fulano"]
}
}
}'
{
"hits" : {
"total" : 0,
"max_score" : null,
"hits" : [ ]
}
}
I'm guessing this is something really silly, but I can't spot the problem. :(
This is the problem with the analyzer you are using. You need to use not_analyzed or keyword tokenizer as outlined here.

Elasticsearch : set up parent/child using jdbc-rivers

I am reading data from Sql Server database/table using jdbc-river currently. As of now I have created a individual type for each of the table in my database. As next step in my implementation I would like to use parent/child types so that I can translate the relationship between my sql tables and store them.
Table1
Col_id| name| prop1|prop2|prop3
child_table1
col_id| table_id| child_prop1|child_prop2|child_prop3
curl -XPUT 'localhost:9200/_river/parent/_meta' -d '{
"type" : "jdbc",
"jdbc" : {
"driver" : "com.mysql.jdbc.Driver",
"url" : "jdbc:mysql://localhost:3306/test",
"user" : "",
"password" : "",
"sql" : "select * from table1",
"index" : "index1",
"type" : "parent"
}
}'
curl -XPUT 'localhost:9200/_river/child/_meta' -d '{
"type" : "jdbc",
"jdbc" : {
"driver" : "com.mysql.jdbc.Driver",
"url" : "jdbc:mysql://localhost:3306/test",
"user" : "",
"password" : "",
"sql" : "select * from child_table1",
"index" : "index1",
"type" : "child"
}
}'
curl -XPOST 'localhost:9200/_river/child/_mapping' -d '{
"child":{
"_parent": {"type": "parent"}
}
}'
I would like to store my data in the following format
{
"id": "1",
"name": "A leading wordsmith",
"prop1": "data",
"prop2": "data",
"prop3": "data",
"child": [
{
"child_prop1": "data",
"child_prop2": "data",
"child_prop3": "data",
}
{
"child_prop1": "data1",
"child_prop2": "data1",
"child_prop3": "data1",
}
]
}
Can anyone comment on how can I use jdbc-rivers to store my data as parent/child type for above scenario.
UPDATE
Based on feedback following is the updated mapping & meta.
curl -XPOST 'http://localhost:9200/library' -d '{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"person": {
"properties": {
"person_id": {
"type": "integer"
},
"name": {
"type": "string"
}
}
},
"work": {
"_parent": {
"type": "person"
},
"properties": {
"person_id": {
"type": "integer",
"index": "not_analyzed"
},
"name": {
"type": "string"
},
"genre": {
"type": "string"
},
"publisher": {
"type": "string"
}
}
}
}
}'
curl -XPUT localhost:9200/_river/person/_meta -d '{
"type": "jdbc",
"jdbc": {
"driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
"url": "jdbc:sqlserver://127.0.0.1:1433;databaseName=blogcontext",
"user": "sa",
"password": "password",
"sql": "select person_id as _id, name from person",
"poll": "30s"
},
"index": {
"index": "library",
"type": "person",
"bulk_size": 500,
"autocommit": true
}
}'
curl -XPUT localhost:9200/_river/work/_meta -d '{
"type": "jdbc",
"jdbc": {
"driver": "com.microsoft.sqlserver.jdbc.SQLServerDriver",
"url": "jdbc:sqlserver://127.0.0.1:1433;databaseName=blogcontext",
"user": "sa",
"password": "password",
"sql": "select person_id as _parent,name,genre,publisher from work",
"poll": "30s"
},
"index": {
"index": "library",
"type": "work",
"bulk_size": 500,
"autocommit": true
}
}'
Log file
[2014-01-14 07:10:35,488][ERROR][OneShotRiverMouth ] bulk [1] error
org.elasticsearch.ElasticSearchIllegalArgumentException: Can't specify parent if no parent field has been configured
at org.elasticsearch.action.index.IndexRequest.process(IndexRequest.java:597)
at org.elasticsearch.action.bulk.TransportBulkAction.executeBulk(TransportBulkAction.java:165)
at org.elasticsearch.action.bulk.TransportBulkAction.doExecute(TransportBulkAction.java:140)
at org.elasticsearch.action.bulk.TransportBulkAction.doExecute(TransportBulkAction.java:63)
at org.elasticsearch.action.support.TransportAction.execute(TransportAction.java:63)
at org.elasticsearch.client.node.NodeClient.execute(NodeClient.java:92)
at org.elasticsearch.client.support.AbstractClient.bulk(AbstractClient.java:149)
at org.elasticsearch.action.bulk.BulkProcessor.execute(BulkProcessor.java:283)
at org.elasticsearch.action.bulk.BulkProcessor.access$400(BulkProcessor.java:46)
at org.elasticsearch.action.bulk.BulkProcessor$Flush.run(BulkProcessor.java:336)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471)
at java.util.concurrent.FutureTask$Sync.innerRunAndReset(FutureTask.java:351)
at java.util.concurrent.FutureTask.runAndReset(FutureTask.java:178)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$301(ScheduledThreadPoolExecutor.java:178)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:724)
thanks
Assumed that your tables look like:
table1
table_id| name| prop1|prop2|prop3
child_table1
child_id| table_id| child_prop1|child_prop2|child_prop3
You will need to select your primary row id and named it as "_id", your parent id and named it as "_parent"
curl -XPUT 'localhost:9200/_river/parent/_meta' -d '{
"type" : "jdbc",
"jdbc" : {
"driver" : "com.mysql.jdbc.Driver",
"url" : "jdbc:mysql://localhost:3306/test",
"user" : "",
"password" : "",
"sql" : "select table_id as _id, name, prop1, prop2, prop3 from table1",
"index" : "index1",
"type" : "parent"
}
}'
curl -XPUT 'localhost:9200/_river/child/_meta' -d '{
"type" : "jdbc",
"jdbc" : {
"driver" : "com.mysql.jdbc.Driver",
"url" : "jdbc:mysql://localhost:3306/test",
"user" : "",
"password" : "",
"sql" : "select child_id as _id, table_id as _parent, child_prop1, child_prop2, child_prop3 from child_table1",
"index" : "index1",
"type" : "child"
}
}'
And define the mapping parent/child as you did, then it's done. You can use parent/child queries to query the parent/child data now.
UPDATE:
I already use your newest mapping and create a sample database to import data. Everything work fine, I can index parent/child without any errors.
I'm using ES 0.9.5, jdbc-river 2.2.2.

Resources