How to score by max relevance match in array elements in ElasticSearch? - arrays

I have an autocomplete analyser for a field("keywords"). This field is an array of strings. When I query with a search string I want to show first the documents where a single element of the array keywords matches best. The problem is that if a part of the string matches with more elements of the array "keywords", then this document appears before another that has less but better matches. For example, if I have a query with the word "gas station" the returning documents' keywords are these:
"hits": [
{
"_index": "locali_v3",
"_type": "categories",
"_id": "5810767ddc536a03b4761acd",
"_score": 3.1974547,
"_source": {
"keywords": [
"Radio Station",
"Radio Station"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "581076d8dc536a03b4761cc3",
"_score": 3.0407648,
"_source": {
"keywords": [
"Stationery Store",
"Stationery Store"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "5810767ddc536a03b4761ace",
"_score": 2.903595,
"_source": {
"keywords": [
"TV Station",
"TV Station"
]
}
},
{
"_index": "locali_v3",
"_type": "categories",
"_id": "581076cddc536a03b4761c87",
"_score": 2.517158,
"_source": {
"keywords": [
"Praktoreio Ugrwn Kausimwn/Gkaraz",
"Praktoreio Ygrwn Kaysimwn/Gkaraz",
"Praktoreio Ugron Kausimon/Gkaraz",
"Praktoreio Ygron Kaysimon/Gkaraz",
"Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
"Gas Station"
]
}
}
The "Gas Station" is fourth, although it has the best single element matching. Is there a way to tell ElasticSearch that I do not care about how many times "gas" or "station" appears in keywords? I want the max element of the array keywords match as the score factor.
My settings are:
{
"locali": {
"settings": {
"index": {
"creation_date": "1480937810266",
"analysis": {
"filter": {
"autocomplete_filter": {
"type": "edge_ngram",
"min_gram": "1",
"max_gram": "20"
}
},
"analyzer": {
"keywords": {
"filter": [
"lowercase",
"autocomplete_filter"
],
"char_filter": [
"my_char_filter"
],
"type": "custom",
"tokenizer": "standard"
}
},
"char_filter": {
"my_char_filter": {
"type": "mapping",
"mappings": [
"ί => ι",
"Ί => Ι",
"ή => η",
"Ή => Η",
"ύ => υ",
"Ύ => Υ",
"ά => α",
"Ά => Α",
"έ => ε",
"Έ => Ε",
"ό => ο",
"Ό => Ο",
"ώ => ω",
"Ώ => Ω",
"ϊ => ι",
"ϋ => υ",
"ΐ => ι",
"ΰ => υ"
]
}
}
},
"number_of_shards": "1",
"number_of_replicas": "1",
"uuid": "TJjOt9L9QE2HrsUFHM6zJg",
"version": {
"created": "2040099"
}
}
}
}
}
And the mappings:
{
"locali": {
"mappings": {
"places": {
"properties": {
"formattedCategories": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"loc": {
"properties": {
"coordinates": {
"type": "geo_point"
}
}
},
"location": {
"properties": {
"formattedAddress": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"locality": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"neighbourhood": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
}
}
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"rating": {
"properties": {
"rating": {
"type": "long"
}
}
},
"seenDetails": {
"type": "long"
},
"verified": {
"type": "long"
}
}
},
"regions": {
"properties": {
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"loc": {
"properties": {
"coordinates": {
"type": "geo_point"
}
}
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"type": {
"type": "long"
},
"weight": {
"type": "long"
}
}
},
"categories": {
"properties": {
"keywords": {
"type": "string",
"analyzer": "keywords"
},
"name": {
"properties": {
"english": {
"type": "string"
},
"greek": {
"type": "string"
}
}
},
"weight": {
"type": "long"
}
}
}
}
}
}

Can you post your query here that you are trying here as well.
I tried your example with the following query
{
"query": {"match": {
"keywords": "gas station"
}
}
}
And i got your desired result.
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 5,
"successful": 5,
"failed": 0
},
"hits": {
"total": 3,
"max_score": 0.081366636,
"hits": [
{
"_index": "stack",
"_type": "type",
"_id": "AVjP6QnpdNp-z_ybGd-L",
"_score": 0.081366636,
"_source": {
"keywords": [
"Praktoreio Ugrwn Kausimwn/Gkaraz",
"Praktoreio Ygrwn Kaysimwn/Gkaraz",
"Praktoreio Ugron Kausimon/Gkaraz",
"Praktoreio Ygron Kaysimon/Gkaraz",
"Πρακτορείο Υγρών Καυσίμων/Γκαράζ",
"Gas Station"
]
}
},
{
"_index": "stack",
"_type": "type",
"_id": "AVjP5-u5dNp-z_ybGd-I",
"_score": 0.03182549,
"_source": {
"keywords": [
"Radio Station",
"Radio Station"
]
}
},
{
"_index": "stack",
"_type": "type",
"_id": "AVjP6KiKdNp-z_ybGd-K",
"_score": 0.03182549,
"_source": {
"keywords": [
"TV Station",
"TV Station"
]
}
}
]
}
}
Try this query to see if you are getting desired result. Also you can reply with your mappings, query and ES version if this does't work for you.
Hope this solves your problem. Thanks

Related

MongoDB How to search in nested objects?

How i can search for the value "20044" in all fields "Barcode" and just in field "Barcode" in a nested object without specifying an absolute path e.g. "Item.Item.Item.Barcode" in MongoDB?
My current solutions:
Search in all text fields, not only in "Barcode" fields
find({$text: {$search: '20044'}})
Search in one specifying absolute path and not in all "Barcode" fields
find({'Item.Item.Item.Barcode': '20044'})
This is my databse object:
{
"_id": {
"$oid": "633d7cc238d7f8dafeace6f5"
},
"Number": "2",
"Item": [
{
"Type": "FrameElement",
"Item": [
{
"Type": "Frame",
"Barcode": "20011"
},
{
"Type": "Frame",
"Barcode": "20012"
},
{
"Type": "SashElement",
"Item": [
{
"Type": "Sash",
"Barcode": "20021"
},
{
"Type": "Sash",
"Barcode": "20022"
},
{
"Type": "GlassBarElement",
"Item": [
{
"Type": "GlassBar",
"Barcode": "20031"
},
{
"Type": "GlassBar",
"Barcode": "20032"
}
]
}
]
},
{
"Type": "Glass",
"Barcode": "20016"
},
{
"Type": "GlassBarElement",
"Item": [
{
"Type": "GlassBar",
"Barcode": "20043"
},
{
"Type": "GlassBar",
"Barcode": "20044"
}
]
}
]
}
]
}

Removing and printing name/value pair from json using jolt

I want to remove a name/value pair from inside a json array and print it outside. I started by trying this and then expanding the whole request to be a json array. The solution mentioned above does not seem to be working.
Input :
[
{
"createdBy": "Admin",
"createdDate": "2022-09-08",
"modifiedBy": "Admin",
"attrs": [
{
"name": "Type",
"value": "Postpaid"
},
{
"name": "subscriber",
"value": "Paid"
},
{
"name": "Details",
"value": {
"createdDate": "today",
"description": "offer",
"id": null
}
}
],
"relatedInfo": [
{
"type": "Number",
"name": "000000"
},
{
"type": "Type",
"name": "Post"
}
]
},
{
"createdBy": "Admin",
"createdDate": "2022-09-08",
"modifiedBy": "Admin",
"attrs": [
{
"name": "Type",
"value": "Postpaid"
},
{
"name": "subscriber",
"value": "Paid"
},
{
"name": "Details",
"value": {
"createdDate": "today",
"description": "offer",
"id": null
}
}
],
"relatedInfo": [
{
"type": "Number",
"name": "000000"
},
{
"type": "Type",
"name": "Post"
}
]
}
]
Desired Output :
[
{
"createdBy": "Admin",
"createdDate": "2022-09-08",
"modifiedBy": "Admin",
"attrs": [
{
"name": "Type",
"value": "Postpaid"
},
{
"name": "subscriber",
"value": "Paid"
}
],
"Details": {
"createdDate": "today",
"description": "offer",
"id": null
},
"relatedInfo": [
{
"type": "Number",
"name": "000000"
},
{
"type": "Type",
"name": "Post"
}
]
},
{
"createdBy": "Admin",
"createdDate": "2022-09-08",
"modifiedBy": "Admin",
"attrs": [
{
"name": "Type",
"value": "Postpaid"
},
{
"name": "subscriber",
"value": "Paid"
}
],
"Details": {
"createdDate": "today",
"description": "offer",
"id": null
},
"relatedInfo": [
{
"type": "Number",
"name": "000000"
},
{
"type": "Type",
"name": "Post"
}
]
}
]
Current Jolt spec:
[
{
"operation": "shift",
"spec": {
"*": "[&]",
"attrs": {
"*": {
"name": {
"*": { "#2": "&4" },
"Details": {
"#(2,value)": "&1"
}
}
}
}
}
}
]
I can't seem to figure out how the jolt spec would change in case of the array
So far so good, just need to combine the attributes at a common node. To do this, I've used the identifiers [&1] and [&5] in order to reach the level of the outermost index within the tree such as
[
{
"operation": "shift",
"spec": {
"*": {
"*": "[&1].&",
"attrs": {
"*": {
"name": {
"*": {
"#2": "[&5].&4"
},
"Details": {
"#(2,value)": "[&5].&1"
}
}
}
}
}
}
}
]

JQ: Remove json objects from array

I have this json file with hundreds of entries that I need to strip from data I do not need.
Snippet:
{
"entries": [
{
"metadata": {
"tags": [
]
},
"sys": {
"space": {
"sys": {
"type": "Link",
"linkType": "Space",
"id": "9kn72w8zc6fh"
}
},
"id": "vcLKKhJ3mZNfGMvVZZi07",
"type": "Entry",
"createdAt": "2021-05-20T15:14:01.358Z",
"updatedAt": "2021-09-20T15:28:30.799Z",
"environment": {
"sys": {
"id": "production",
"type": "Link",
"linkType": "Environment"
}
},
"publishedVersion": 47,
"publishedAt": "2021-09-20T15:28:30.799Z",
"firstPublishedAt": "2021-05-25T10:26:56.722Z",
"createdBy": {
"sys": {
"type": "Link",
"linkType": "User",
"id": "6F84RwUIY9cXNNXBoQemqX"
}
},
"updatedBy": {
"sys": {
"type": "Link",
"linkType": "User",
"id": "6F84RwUIY9cXNNXBoQemqX"
}
},
"publishedCounter": 4,
"version": 48,
"publishedBy": {
"sys": {
"type": "Link",
"linkType": "User",
"id": "6F84RwUIY9cXNNXBoQemqX"
}
},
"contentType": {
"sys": {
"type": "Link",
"linkType": "ContentType",
"id": "page"
}
}
},
"fields": {
"title": {
"de-DE": "Startseite",
"en-US": "Home"
},
"description": {
"en-US": "foo"
},
"keywords": {
"en-US": "bar"
},
"stageModules": {
"en-US": [
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "11AfBBuNK8bx3EygAS3WTY"
}
}
]
},
"contentModules": {
"en-US": [
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "7uyuyIBsXWApHqpR7Pgkac"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "4HILHPLjqQkP2H1hA2FeBG"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "QuwRHL3XMSkguqrL1hUzC"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "4ZyVef5oWhQWXK9V1lr3vz"
}
}
]
},
"layout": {
"en-US": "Wide"
}
}
}
]
}
From the entries array, I actually only need:
entries.sys.id
entries.sys.contentType.sys.id
entries.fields
I came up with:
jq \
'.entries | .[] .sys, .[] .fields | del(.createdAt, .createdBy, .environment, .firstPublishedAt, .metadata, .publishedAt, .publishedBy, .publishedCounter, .publishedVersion, .space, .type, .updatedAt, .updatedBy, .version)' \
$infile >| $outfile
However, this changes the structure of the document. The entries node is missing (due to the .entries filter):
{
"id": "vcLKKhJ3mZNfGMvVZZi07",
"contentType": {
"sys": {
"type": "Link",
"linkType": "ContentType",
"id": "page"
}
}
}
{
"id": "1UgOmHIvsWrFEf1VCa84kz",
"contentType": {
"sys": {
"type": "Link",
"linkType": "ContentType",
"id": "moduleText"
}
}
}
{
"title": {
"de-DE": "Startseite",
"en-US": "Home"
},
"description": {
"en-US": "Foo"
},
"keywords": {
"en-US": "Bar"
},
"stageModules": {
"en-US": [
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "11AfBBuNK8bx3EygAS3WTY"
}
}
]
},
"contentModules": {
"en-US": [
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "7uyuyIBsXWApHqpR7Pgkac"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "4HILHPLjqQkP2H1hA2FeBG"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "QuwRHL3XMSkguqrL1hUzC"
}
},
{
"sys": {
"type": "Link",
"linkType": "Entry",
"id": "4ZyVef5oWhQWXK9V1lr3vz"
}
}
]
},
"layout": {
"en-US": "Wide"
}
}
I have 2 questions:
How can I delete deeper objects, eg. .entries.sys.space.sys.linkType?
How can I keep the .entries node in the outfile?
Thank you for your help.
If you want full control over the output, I'd just re-create the desired format.
It sounds like you're trying to accieve the following format:
{
"entries": [
{
"sys": {
"id": ...
},
"contentType": {
"sys": {
"id": ...
}
},
"fields": ...
}
}
]
}
We can achieve this by using the following JQ selector:
.entries |= map({ "sys": { "id": .sys.id }, "contentType": { "sys": { "id": .sys.contentType.sys.id } }, fields })
Try it online!

How to construct a JSON based query on nested object having array items with ES search?

[
{
"name": "Document 1",
"tags": {
"typeATags": ["a1"],
"typeBTags": ["b1"],
"typeCTags": ["c1"],
"typeDTags": ["d1"]
}
},
{
"name": "Document 2",
"tags": {
"typeATags": ["a2"],
"typeBTags": ["b1", "b2"],
"typeCTags": ["c2"],
"typeDTags": ["d1", "d2"]
}
},
{
"name": "Document 3",
"tags": {
"typeATags": ["a1", "a2", "a3"],
"typeBTags": ["b1", "b2", "b3"],
"typeCTags": ["c3"],
"typeDTags": ["d1", "d2", "d3"]
}
}
]
How to build a query on ES 6.0,
That will return all the records that has 'a1' and 'b1' tags ? // should return 1,3
That will return all the records that has 'a1' and 'a2' tags combined? // should return 3
That will return all the records that has 'a1' or 'a2' tags ? //should return 1,2,3
That will return all the records that has 'a1' AND ( 'c1' OR 'c3') tags ? //should return 1,2
Thanks #mickl for the Answer
Edit 1:
Here is my actual Schema,
{
"cmslocal": {
"mappings": {
"video": {
"properties": {
"assetProps": {
"properties": {
"assetType": {
"type": "string"
},
"configPath": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
},
"analyzer": "standard"
},
"contentSha1": {
"type": "string"
},
"originalPath": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
},
"analyzer": "standard"
},
"path": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
},
"analyzer": "standard"
},
"thumbnailPath": {
"type": "string",
"fields": {
"raw": {
"type": "string",
"index": "not_analyzed"
}
},
"analyzer": "standard"
}
}
},
"channel": {
"type": "string"
},
"configProps": {
"properties": {
"events": {
"type": "nested",
"include_in_root": true,
"properties": {
"Desc": {
"type": "string"
},
"Tags": {
"type": "string"
},
"UUID": {
"type": "string"
}
}
},
"roiUUID": {
"type": "string"
}
}
},
"contentSha1": {
"type": "string"
},
"eventDesc": {
"type": "string"
},
"ext": {
"type": "string"
},
"format": {
"type": "string"
},
"fovProps": {
"properties": {
"description": {
"type": "string"
},
"width": {
"type": "float"
}
}
},
"locationProps": {
"type": "nested",
"properties": {
"address": {
"type": "string"
},
"city": {
"type": "string"
},
"country": {
"type": "string"
},
"county": {
"type": "string"
},
"location": {
"type": "geo_point"
},
"postcode": {
"type": "string"
},
"state": {
"type": "string"
}
}
},
"nodeid": {
"type": "string"
},
"poleHeight": {
"type": "float"
},
"query": {
"properties": {
"bool": {
"properties": {
"filter": {
"properties": {
"term": {
"properties": {
"nodeid": {
"type": "string"
}
}
}
}
}
}
}
}
},
"retentionPolicy": {
"type": "string"
},
"siteScopeID": {
"type": "string"
},
"tagProps": {
"type": "nested",
"properties": {
"conditions": {
"type": "string"
},
"environment": {
"type": "string"
},
"events": {
"type": "string"
},
"lighting": {
"type": "string"
},
"objects": {
"type": "string"
},
"other": {
"type": "string"
},
"scenes": {
"type": "string"
},
"useCases": {
"type": "string"
},
"weather": {
"type": "string"
}
}
},
"test": {
"type": "string"
},
"title": {
"type": "string"
},
"uploadTime": {
"type": "date",
"format": "strict_date_optional_time||epoch_millis"
},
"videoProps": {
"properties": {
"bitrate": {
"type": "float"
},
"datetime": {
"type": "date",
"format": "date_hour_minute_second_millis"
},
"daySegments": {
"type": "string"
},
"duration": {
"type": "long"
},
"framerate": {
"type": "float"
},
"height": {
"type": "integer"
},
"overlaysOn": {
"type": "boolean"
},
"width": {
"type": "integer"
}
}
}
}
}
}
}
}
Please help to build the query so I can search for
Only nodeId, Only Channel, Date Range
Any of the Tags
I'm able to search for nodeId, Channel Id using
{
"query": {
"bool": {
"filter": [
{ "match": { "nodeid": "N02cff15a" } },
{ "match": { "channel": "1" } }
]
}
}
}
and I can able to Search for tagProps using
{
"nested": {
"path": "tagProps",
"query": {
"bool": {
"must": [
{ "match": { "tagProps.objects": "car" } },
{ "match": { "tagProps.objects": "truck" } }
]
}
}
}
}
Help me combine two queries so I can search for NodeId with Combination of Tags.
Since tags is a nested field you should define nested mapping before indexing your documents.
{
"mappings": {
"your_type": {
"properties": {
"tags": {
"type": "nested"
}
}
}
}
}
Now you can index your data and then you can utilize nested query syntax, for your first use case it's like:
{
"query": {
"nested": {
"path": "tags",
"query": {
"bool": {
"must": [
{ "match": { "tags.typeATags": "a1" }},
{ "match": { "tags.typeBTags": "b1" }}
]
}
}
}
}
}
Next queries can be composed of must and should, like for the last one:
{
"query": {
"nested": {
"path": "tags",
"query": {
"bool": {
"must": [
{ "match": { "tags.typeATags": "a1" }}
],
"should": [
{"match": {"tags.typeCTags": "c1"}},
{"match": {"tags.typeCTags": "c3"}}
]
}
}
}
}
}

Aggregating array of values in elasticsearch

I need to aggregate an array as follows
Two document examples:
{
"_index": "log",
"_type": "travels",
"_id": "tnQsGy4lS0K6uT3Hwzzo-g",
"_score": 1,
"_source": {
"state": "saopaulo",
"date": "2014-10-30T17",
"traveler": "patrick",
"registry": "123123",
"cities": {
"saopaulo": 1,
"riodejaneiro": 2,
"total": 2
},
"reasons": [
"Entrega de encomenda"
],
"from": [
"CompraRapida"
]
}
},
{
"_index": "log",
"_type": "travels",
"_id": "tnQsGy4lS0K6uT3Hwzzo-g",
"_score": 1,
"_source": {
"state": "saopaulo",
"date": "2014-10-31T17",
"traveler": "patrick",
"registry": "123123",
"cities": {
"saopaulo": 1,
"curitiba": 1,
"total": 2
},
"reasons": [
"Entrega de encomenda"
],
"from": [
"CompraRapida"
]
}
},
I want to aggregate the cities array, to find out all the cities the traveler has gone to. I want something like this:
{
"traveler":{
"name":"patrick"
},
"cities":{
"saopaulo":2,
"riodejaneiro":2,
"curitiba":1,
"total":3
}
}
Where the total is the length of the cities array minus 1. I tried the terms aggregation and the sum, but couldn't output the desired output.
Changes in the document structure can be made, so if anything like that would help me, I'd be pleased to know.
in the document posted above "cities" is not a json array , it is a json object.
If changing the document structure is a possibility I would change cities in the document to be an array of object
example document:
cities : [
{
"name" :"saopaulo"
"visit_count" :"2",
},
{
"name" :"riodejaneiro"
"visit_count" :"1",
}
]
You would then need to set cities to be of type nested in the index mapping
"mappings": {
"<type_name>": {
"properties": {
"cities": {
"type": "nested",
"properties": {
"city": {
"type": "string"
},
"count": {
"type": "integer"
},
"value": {
"type": "long"
}
}
},
"date": {
"type": "date",
"format": "dateOptionalTime"
},
"registry": {
"type": "string"
},
"state": {
"type": "string"
},
"traveler": {
"type": "string"
}
}
}
}
After which you could use nested aggregation to get the city count per user.
The query would look something on these lines :
{
"query": {
"match": {
"traveler": "patrick"
}
},
"aggregations": {
"city_travelled": {
"nested": {
"path": "cities"
},
"aggs": {
"citycount": {
"cardinality": {
"field": "cities.city"
}
}
}
}
}
}

Resources