How to replace a value of an array in MongoDB aggregation? - arrays

I'm trying to replace the value only in the aggregation result, not in the stored document, thats why I use an aggregation and not a update.
Suppose these are the documents stored in the collection:
{ "_id": 1, "first_name": "foo1", items: [
{ "item": "gloves", "body": "hands" },
{ "item": "gloves", "body": "hands" },
{ "item": "shirts" , "body": "torso" }
]
},
{ "_id": 2, "first_name": "foo2", items: [
{ "item": "skirts", "where": "legs" },
{ "item": "jeans", "body": "legs" },
{ "item": "socks" , "body": "feet" },
{ "item": "gloves" , "body": "hands" }
]
},
{ "_id": 3, "first_name": "foo3", items: [
{ "item": "jacket", "where": "torso" },
{ "item": "socks", "body": "feet" },
{ "item": "shirts" , "body": "torso" }
]
}
The aggregation must replace "gloves" by "sandals", and it correspondent attribute "body" from "hands" to "feet".
The expected result is:
{ "_id": 1, "first_name": "foo1", items: [
{ "item": "sandals", "body": "feet" },
{ "item": "sandals", "body": "feet" },
{ "item": "shirts" , "body": "torso" }
]
},
{ "_id": 2, "first_name": "foo2", items: [
{ "item": "skirts", "where": "legs" },
{ "item": "jeans", "body": "legs" },
{ "item": "socks" , "body": "feet" },
{ "item": "sandals" , "body": "feet" }
]
},
{ "_id": 3, "first_name": "foo3", items: [
{ "item": "jacket", "where": "torso" },
{ "item": "socks", "body": "feet" },
{ "item": "shirts" , "body": "torso" }
]
}
So far, I've tried using "$unwind" and $project with "$switch", among other approaches, but none of them work.

Related

Performance issue running mongodb aggregation

I need to run a query that joins documents from two collections, I wrote an aggregation query but it takes too much time when running in the production database with many documents. Is there any way to write this query in a more efficient way?
Query in Mongo playground: https://mongoplayground.net/p/dLb3hsJHNYt
There are two collections users and activities. I need to run a query to get some users (from users collection), and also their last activity (from activities collection).
Database:
db={
"users": [
{
"_id": 1,
"email": "user1#gmail.com",
"username": "user1",
"country": "BR",
"creation_date": 1646873628
},
{
"_id": 2,
"email": "user2#gmail.com",
"username": "user2",
"country": "US",
"creation_date": 1646006402
}
],
"activities": [
{
"_id": 1,
"email": "user1#gmail.com",
"activity": "like",
"timestamp": 1647564787
},
{
"_id": 2,
"email": "user1#gmail.com",
"activity": "comment",
"timestamp": 1647564834
},
{
"_id": 3,
"email": "user2#gmail.com",
"activity": "like",
"timestamp": 1647564831
}
]
}
Inefficient Query:
db.users.aggregate([
{
// Get users using some filters
"$match": {
"$expr": {
"$and": [
{ "$not": { "$in": [ "$country", [ "AR", "CA" ] ] } },
{ "$gte": [ "$creation_date", 1646006400 ] },
{ "$lte": [ "$creation_date", 1648684800 ] }
]
}
}
},
{
// Get the last activity within the time range
"$lookup": {
"from": "activities",
"as": "last_activity",
"let": { "cur_email": "$email" },
"pipeline": [
{
"$match": {
"$expr": {
"$and": [
{ "$eq": [ "$email", "$$cur_email" ] },
{ "$gte": [ "$timestamp", 1647564787 ] },
{ "$lte": [ "$timestamp", 1647564834 ] }
]
}
}
},
{ "$sort": { "timestamp": -1 } },
{ "$limit": 1 }
]
}
},
{
// Remove users with no activity
"$match": {
"$expr": {
"$gt": [ { "$size": "$last_activity" }, 0 ] }
}
}
])
Result:
[
{
"_id": 1,
"country": "BR",
"creation_date": 1.646873628e+09,
"email": "user1#gmail.com",
"last_activity": [
{
"_id": 2,
"activity": "comment",
"email": "user1#gmail.com",
"timestamp": 1.647564788e+09
}
],
"username": "user1"
},
{
"_id": 2,
"country": "US",
"creation_date": 1.646006402e+09,
"email": "user2#gmail.com",
"last_activity": [
{
"_id": 3,
"activity": "like",
"email": "user2#gmail.com",
"timestamp": 1.647564831e+09
}
],
"username": "user2"
}
]
I'm more familiar with relational databases, so I'm struggling a little to run this query efficiently.
Thanks!

DataTables - Load data from pre-defined JSON

I have a problem pointing dataTable to the right spot in the JSON. I receive a nested array:
{
"status": "ok",
"count": "7",
"msg ": "Operation Successful",
"data": [{
"contactHasServiceArea": true,
"issueCategories": [{
"id": "8",
"description": "Finance"
},
{
"id": "9",
"description": "Housing"
},
{
"id": "10",
"description": "International"
}
],
"cases": [{
"id": 31645,
"client_name": "Matthew",
"issue": "Assessment Completion",
"referral": null,
"opened_date": "10\/07\/2017",
"case_status": "Open"
}, {
"id": 31668,
"client_name": "Fanky ",
"issue": "Complex",
"referral": null,
"opened_date": "01\/07\/2017",
"case_status": "Open"
}]
}]
}
How do I point to the "cases" object? I'm sure this is simply, but I'm confused by the many options in the dataTables config.
I tried variations of data, dataSrc as well as data.cases or just cases, etc.
Thanks
$('#cases_table').DataTable( {
"ajax": "ajax/getCases",
"dataSrc" : "data.cases",
"data" : "cases",
"columns": [
{ "data": "client_name" },
{ "data": "issue" },
{ "data": "referral" },
{ "data": "opened_date" },
{ "data": "case_status" }
]
} );
You can configure like this:
$('#cases_table').DataTable( {
"ajax": {
"url": "ajax/getCases",
"dataSrc" : "data.cases"
},
"columns": [
{ "data": "client_name" },
{ "data": "issue" },
{ "data": "referral" },
{ "data": "opened_date" },
{ "data": "case_status" }
]
} );
datasrc points into the returns json. Remove the data option.

How to score by max relevance match in array elements in ElasticSearch?

I have an autocomplete analyser for a field("keywords"). This field is an array of strings. When I query with a search string I want to show first the documents where a single element of the array keywords matches best. The problem is that if a part of the string matches with more elements of the array "keywords", then this document appears before another that has less but better matches. For example, if I have a query with the word "gas station" the returning documents' keywords are these:
"hits": [
{
"_index": "locali_v3",
"_type": "categories",
"_id": "5810767ddc536a03b4761acd",
"_score": 3.1974547,
"_source": {
"keywords": [
"Radio Station",
"Radio Station"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "581076d8dc536a03b4761cc3",
"_score": 3.0407648,
"_source": {
"keywords": [
"Stationery Store",
"Stationery Store"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "5810767ddc536a03b4761ace",
"_score": 2.903595,
"_source": {
"keywords": [
"TV Station",
"TV Station"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "581076cddc536a03b4761c87",
"_score": 2.517158,
"_source": {
"keywords": [
"Praktoreio Ugrwn Kausimwn/Gkaraz",
"Praktoreio Ygrwn Kaysimwn/Gkaraz",
"Praktoreio Ugron Kausimon/Gkaraz",
"Praktoreio Ygron Kaysimon/Gkaraz",
"Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
"Gas Station"
]
}
}
The "Gas Station" is fourth, although it has the best single element matching. Is there a way to tell ElasticSearch that I do not care about how many times "gas" or "station" appears in keywords? I want the max element of the array keywords match as the score factor.
My settings are:
{
"locali": {
"settings": {
"index": {
"creation_date": "1480937810266",
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"keywords": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"char_filter": [
"my_char_filter"
],
"type": "custom",
"tokenizer": "standard"
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"ί => ι",
"Ί => Ι",
"ή => η",
"Ή => Η",
"ύ => υ",
"Ύ => Υ",
"ά => α",
"Ά => Α",
"έ => ε",
"Έ => Ε",
"ό => ο",
"Ό => Ο",
"ώ => ω",
"Ώ => Ω",
"ϊ => ι",
"ϋ => υ",
"ΐ => ι",
"ΰ => υ"
]
}
}
},
"number_of_shards": "1",
"number_of_replicas": "1",
"uuid": "TJjOt9L9QE2HrsUFHM6zJg",
"version": {
"created": "2040099"
}
}
}
}
}
And the mappings:
{
"locali": {
"mappings": {
"places": {
"properties": {
"formattedCategories": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"loc": {
"properties": {
"coordinates": {
"type": "geo_point"
}
}
},
"location": {
"properties": {
"formattedAddress": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"locality": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"neighbourhood": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
}
}
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"rating": {
"properties": {
"rating": {
"type": "long"
}
}
},
"seenDetails": {
"type": "long"
},
"verified": {
"type": "long"
}
}
},
"regions": {
"properties": {
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"loc": {
"properties": {
"coordinates": {
"type": "geo_point"
}
}
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"type": {
"type": "long"
},
"weight": {
"type": "long"
}
}
},
"categories": {
"properties": {
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"weight": {
"type": "long"
}
}
}
}
}
}
Can you post your query here that you are trying here as well.
I tried your example with the following query
{
"query": {"match": {
"keywords": "gas station"
}
}
}
And i got your desired result.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.081366636,
"hits": [
{
"_index": "stack",
"_type": "type",
"_id": "AVjP6QnpdNp-z_ybGd-L",
"_score": 0.081366636,
"_source": {
"keywords": [
"Praktoreio Ugrwn Kausimwn/Gkaraz",
"Praktoreio Ygrwn Kaysimwn/Gkaraz",
"Praktoreio Ugron Kausimon/Gkaraz",
"Praktoreio Ygron Kaysimon/Gkaraz",
"Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
"Gas Station"
]
}
},
{
"_index": "stack",
"_type": "type",
"_id": "AVjP5-u5dNp-z_ybGd-I",
"_score": 0.03182549,
"_source": {
"keywords": [
"Radio Station",
"Radio Station"
]
}
},
{
"_index": "stack",
"_type": "type",
"_id": "AVjP6KiKdNp-z_ybGd-K",
"_score": 0.03182549,
"_source": {
"keywords": [
"TV Station",
"TV Station"
]
}
}
]
}
}
Try this query to see if you are getting desired result. Also you can reply with your mappings, query and ES version if this does't work for you.
Hope this solves your problem. Thanks

Array included in array search with elasticsearch

I have users indexed with categories as follows
{
id: 1
name: John
categories: [
{
id: 1
name: Category 1
},
{
id: 2
name: Category 2
}
]
},
{
id: 2
name: Mark
categories: [
{
id: 1
name: Category 1
}
]
}
And I'm trying to get all the documents with Category 1 or Category 2 with
{
filter:
{
bool: {
must: [
{
terms: {user.categories.id: [1, 2]}
}
]
}
}
}
But It only returns the first document that has the two categories, what I am doing wrong?
As I understood, terms search that one of the values is contained in the field, so for user 1
user.categories.id: [1, 2]
user 2
user.categories.id: [1]
Categoy id 1 is contained in both documents
The best way to handle this is probably with a nested filter. You'll have to specify the "nested" type in your mapping, though.
I can set up an index like this:
PUT /test_index
{
"settings": {
"number_of_shards": 1,
"number_of_replicas": 0
},
"mappings": {
"doc": {
"properties": {
"categories": {
"type": "nested",
"properties": {
"id": {
"type": "long"
},
"name": {
"type": "string"
}
}
},
"id": {
"type": "long"
},
"name": {
"type": "string"
}
}
}
}
}
then add some docs:
PUT /test_index/doc/1
{
"id": 1,
"name": "John",
"categories": [
{ "id": 1, "name": "Category 1" },
{ "id": 2, "name": "Category 2" }
]
}
PUT /test_index/doc/2
{
"id": 2,
"name": "Mark",
"categories": [
{ "id": 1, "name": "Category 1" }
]
}
PUT /test_index/doc/3
{
"id": 3,
"name": "Bill",
"categories": [
{ "id": 3, "name": "Category 3" },
{ "id": 4, "name": "Category 4" }
]
}
Now I can use a nested terms filter like this:
POST /test_index/doc/_search
{
"query": {
"constant_score": {
"filter": {
"nested": {
"path": "categories",
"filter": {
"terms": {
"categories.id": [1, 2]
}
}
}
},
"boost": 1.2
}
}
}
...
{
"took": 3,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": 1,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "1",
"_score": 1,
"_source": {
"id": 1,
"name": "John",
"categories": [
{
"id": 1,
"name": "Category 1"
},
{
"id": 2,
"name": "Category 2"
}
]
}
},
{
"_index": "test_index",
"_type": "doc",
"_id": "2",
"_score": 1,
"_source": {
"id": 2,
"name": "Mark",
"categories": [
{
"id": 1,
"name": "Category 1"
}
]
}
}
]
}
}
Here is the code I used:
http://sense.qbox.io/gist/668aefe910643b52a3a10d40aca67104491668fc

Aggregating array of values in elasticsearch

I need to aggregate an array as follows
Two document examples:
{
"_index": "log",
"_type": "travels",
"_id": "tnQsGy4lS0K6uT3Hwzzo-g",
"_score": 1,
"_source": {
"state": "saopaulo",
"date": "2014-10-30T17",
"traveler": "patrick",
"registry": "123123",
"cities": {
"saopaulo": 1,
"riodejaneiro": 2,
"total": 2
},
"reasons": [
"Entrega de encomenda"
],
"from": [
"CompraRapida"
]
}
},
{
"_index": "log",
"_type": "travels",
"_id": "tnQsGy4lS0K6uT3Hwzzo-g",
"_score": 1,
"_source": {
"state": "saopaulo",
"date": "2014-10-31T17",
"traveler": "patrick",
"registry": "123123",
"cities": {
"saopaulo": 1,
"curitiba": 1,
"total": 2
},
"reasons": [
"Entrega de encomenda"
],
"from": [
"CompraRapida"
]
}
},
I want to aggregate the cities array, to find out all the cities the traveler has gone to. I want something like this:
{
"traveler":{
"name":"patrick"
},
"cities":{
"saopaulo":2,
"riodejaneiro":2,
"curitiba":1,
"total":3
}
}
Where the total is the length of the cities array minus 1. I tried the terms aggregation and the sum, but couldn't output the desired output.
Changes in the document structure can be made, so if anything like that would help me, I'd be pleased to know.
in the document posted above "cities" is not a json array , it is a json object.
If changing the document structure is a possibility I would change cities in the document to be an array of object
example document:
cities : [
{
"name" :"saopaulo"
"visit_count" :"2",
},
{
"name" :"riodejaneiro"
"visit_count" :"1",
}
]
You would then need to set cities to be of type nested in the index mapping
"mappings": {
"<type_name>": {
"properties": {
"cities": {
"type": "nested",
"properties": {
"city": {
"type": "string"
},
"count": {
"type": "integer"
},
"value": {
"type": "long"
}
}
},
"date": {
"type": "date",
"format": "dateOptionalTime"
},
"registry": {
"type": "string"
},
"state": {
"type": "string"
},
"traveler": {
"type": "string"
}
}
}
}
After which you could use nested aggregation to get the city count per user.
The query would look something on these lines :
{
"query": {
"match": {
"traveler": "patrick"
}
},
"aggregations": {
"city_travelled": {
"nested": {
"path": "cities"
},
"aggs": {
"citycount": {
"cardinality": {
"field": "cities.city"
}
}
}
}
}
}

Resources