Find records that start with specific digits MongoDB from other collection - database

I have two collections that look like this:
Ship_Data
{
_id: ObjectId("63d19d7a1991a09011aa35ef"),
sourcemmsi: 228051000,
navigationalstatus: 0,
course: {
rateofturn: -127,
speedoverground: 0,
courseoverground: 291.5,
trueheading: 511
},
coordinates: {
longitude: -4.4850965,
latitude: 48.38132,
timestamp: 1443650424
}
}
SourceMMSI
{
_id: ObjectId("63d19f671991a09011e4eba4"),
mmsi_code: 228,
country: 'France'
}
The first one (Ship_Data) has information about specific ships while the second one (SourceMMSI) shows what the first digits of the source mmsi code represent. For example, in our case, since the sourcemmsi field in Ship_Data starts with 228, based on the second collection, the ship might be from France.
What I want to accomplish is to create a query that uses those two collections and returns records based on the first three digits of the sourcemmsi field. Is there any way I can accomplish that?

You can convert the code into strings by $toString (sourcemmsi needs an extra conversion to Long first before converting to string). Then, use $indexOfCP to perform a substring check and see whether the index is 0 (i.e. at the start of the sourcemmsi).
db.SourceMMSI.aggregate([
{
"$lookup": {
"from": "Ship_Data",
"let": {
code: {
"$toString": "$mmsi_code"
}
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
0,
{
"$indexOfCP": [
{
$toString: {
$toLong: "$sourcemmsi"
}
},
"$$code"
]
}
]
}
}
}
],
"as": "shipDataLookup"
}
}
])
Mongo Playground

Related

MongoDB Aggregation: How to return only the values that don't exist in all documents

Let's say I have an array ['123', '456', '789']
I want to Aggregate and look through every document with the field books and only return the values that are NOT in any documents. For example if '123' is in a document, and '456' is, but '789' is not, it would return an array with ['789'] as it's not included in any books fields in any document.
.aggregate( [
{
$match: {
books: {
$in: ['123', '456', '789']
}
}
},
I don't want the documents returned, but just the actual values that are not in any documents.
Here's one way to scan the entire collection to look for missing book values.
db.collection.aggregate([
{ // "explode" books array to docs with individual book values
"$unwind": "$books"
},
{ // scan entire collection creating set of book values
"$group": {
"_id": null,
"allBooksSet": {
"$addToSet": "$books" // <-- generate set of book values
}
}
},
{
"$project": {
"_id": 0, // don't need this anymore
"missing": { // use $setDifference to find missing values
"$setDifference": [
[ "123", "456", "789" ], // <-- your values go here
"$allBooksSet" // <-- the entire collection's set of book values
]
}
}
}
])
Example output:
[
{
"missing": [ "789" ]
}
]
Try it on mongoplayground.net.
Based on @rickhg12hs's answer, there is another variation replacing $unwind with $reduce, which is considered less costly. Two out of three steps are the same:
db.collection.aggregate([
{
$group: {
_id: null,
allBooks: {$push: "$books"}
}
},
{
$project: {
_id: 0,
allBooksSet: {
$reduce: {
input: "$allBooks",
initialValue: [],
in: {$setUnion: ["$$value", "$$this"]}
}
}
}
},
{
$project: {
missing: {
$setDifference: [["123","456", "789"], "$allBooksSet"]
}
}
}
])
Try it on mongoplayground.net.

Mongo updateMany statement with an inner array of objects to manipulate

I'm struggling to write a Mongo UpdateMany statement that can reference and update an object within an array.
Here I create 3 documents. Each document has an array called innerArray always containing a single object, with a single date field.
use test;
db.innerArrayExample.insertOne({ _id: 1, "innerArray": [ { "originalDateTime" : ISODate("2022-01-01T01:01:01Z") } ]});
db.innerArrayExample.insertOne({ _id: 2, "innerArray": [ { "originalDateTime" : ISODate("2022-01-02T01:01:01Z") } ]});
db.innerArrayExample.insertOne({ _id: 3, "innerArray": [ { "originalDateTime" : ISODate("2022-01-03T01:01:01Z") } ]});
I want to add a new date field, based on the original date field, to end up with this:
{ _id: 1, "innerArray": [ { "originalDateTime" : ISODate("2022-01-01T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-01T12:01:01Z") } ]}
{ _id: 2, "innerArray": [ { "originalDateTime" : ISODate("2022-01-02T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-02T12:01:01Z") } ]}
{ _id: 3, "innerArray": [ { "originalDateTime" : ISODate("2022-01-03T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-03T12:01:01Z") } ]}
In pseudo code I am saying take the originalDateTime, run it through a function and add a related copiedDateTime value.
For my specific use-case, the function I want to run strips the timezone from originalDateTime, then overwrites it with a new one, equivalent to the Java ZonedDateTime function withZoneSameLocal. Aka 9pm UTC becomes 9pm Brussels (therefore effectively 7pm UTC). The technical justification and methodology were answered in another Stack Overflow question here.
The part of the query I'm struggling with, is the part that updates/selects data from an element inside an array. In my simplistic example, for example I have crafted this query, but unfortunately it doesn't work:
This query puts copiedDateTime in the correct place... but doesn't evaluate the commands to manipulate the date:
db.innerArrayExample.updateMany({ "innerArray.0.originalDateTime" : { $exists : true }}, { $set: { "innerArray.0.copiedDateTime" : { $dateFromString: { dateString: { $dateToString: { "date" : "$innerArray.0.originalDateTime", format: "%Y-%m-%dT%H:%M:%S.%L" }}, format: "%Y-%m-%dT%H:%M:%S.%L", timezone: "Europe/Paris" }}});
// output
{
_id: 1,
innerArray: [
{
originalDateTime: ISODate("2022-01-01T01:01:01.000Z"),
copiedDateTime: {
'$dateFromString': {
dateString: { '$dateToString': [Object] },
format: '%Y-%m-%dT%H:%M:%S.%L',
timezone: 'Europe/Paris'
}
}
}
]
}
This simplified query, also has the same issue:
db.innerArrayExample.updateMany({ "innerArray.0.originalDateTime" : { $exists : true }}, { $set: { "innerArray.0.copiedDateTime" : "$innerArray.0.originalDateTime" }});
//output
{
_id: 1,
innerArray: [
{
originalDateTime: ISODate("2022-01-01T01:01:01.000Z"),
copiedDateTime: '$innerArray.0.originalDateTime'
}
]
}
As you can see, this issue looks to be separate from the other Stack Overflow question. Instead of being about changing timezones, it's about getting things inside arrays to update.
I plan to take this query, create 70,000 variations of it with different location/timezone combinations and run it against a database with millions of records, so I would prefer something that uses updateMany instead of using Javascript to iterate over each row in the database... unless that's the only viable solution.
I have tried putting $set in square brackets. This changes the way it interprets everything, evaluating the right side, but causing other problems:
test> db.innerArrayExample.updateMany({ "_id" : 1 }, [{ $set: { "innerArray.0.copiedDateTime" : "$innerArray.0.originalDateTime" }}]);
//output
{
_id: 1,
innerArray: [
{
'0': { copiedDateTime: [] },
originalDateTime: ISODate("2022-01-01T01:01:01.000Z")
}
]
}
Above it seems to interpret .0. as a literal rather than an array element. (For my needs I know the array only has 1 item at all times). I'm at a loss finding an example that meets my needs.
I have also tried experimenting with arrayFilters, documented in the MongoDB updateMany documentation, but I cannot fathom how it works with objects:
test> db.innerArrayExample.updateMany(
... { },
... { $set: { "innerArray.$[element].copiedDateTime" : "$innerArray.$[element].originalDateTime" } },
... { arrayFilters: [ { "originalDateTime": { $exists: true } } ] }
... );
MongoServerError: No array filter found for identifier 'element' in path 'innerArray.$[element].copiedDateTime'
test> db.innerArrayExample.updateMany(
... { },
... { $set: { "innerArray.$[0].copiedDateTime" : "$innerArray.$[element].originalDateTime" } },
... { arrayFilters: [ { "0.originalDateTime": { $exists: true } } ] }
... );
MongoServerError: Error parsing array filter :: caused by :: The top-level field name must be an alphanumeric string beginning with a lowercase letter, found '0'
If someone can help me understand the subtleties of the Mongo syntax and help me back on to the right path I'd be very grateful.
You want to be using pipelined updates, the issue you're having with the syntax you're using is that it does not allow the usage of aggregation operators and document field values.
Here is a quick example on how to do it:
db.collection.updateMany({},
[
{
"$set": {
"innerArray": {
$map: {
input: "$innerArray",
in: {
$mergeObjects: [
"$$this",
{
copiedDateTime: "$$this.originalDateTime"
}
]
}
}
}
}
}
])
Mongo Playground

Mongodb find distinct value with range

I have a mongodb database where I have two collections - elements and properties. The elements look something like this -
{ _id: "someElementId", name: "Iron", type: "metal" }
And the properties look like this -
{ _id: "somePropertyId", propName: "molecular weight", propType: "number", unit: null }
Each element can have multiple properties and a value corresponding to the property. For example iron can have properties molecular weight, color, atomic weight etc.
For that I created another collection where I stored the element id and corresponding property id and its value -
{ elementId: "someElementId", propId: "somePropertyId", value: 55.845 }
Now I want to find the names of all the unique properties and the range of their values that occurs in the database. So for example, if 1 is the lowest value corresponding the above property and 100 is the highest value, I want something like -
[ { name: "molecular weight", range: { min: 1, max: 100 } } ]
I can get the distinct properties and iterate over them to get the range, but I was wondering if there's a better way. Or maybe this table structure is not efficient enough?
Please check if this would work for you:
db.collection.aggregate([
{
$group: {
_id: "$propId",
min: {
$min: "$value"
},
max: {
$max: "$value"
}
}
},
{
$lookup: {
from: "properties",
localField: "_id",
foreignField: "_id",
as: "name"
}
},
{
$unwind: "$name"
},
{
$project: {
_id: 0,
"name": "$name.propName",
"range": {
"min": "$min",
"max": "$max"
}
}
}
])
Mongo Playground

MongoDB sorting data fails

I'm trying to sort around 40k objects in Mongo. What I have is two collections: one of comics and another of characters. Characters have a field with an array of comic ids where they appear. What I want is a pipeline for the aggregation framework that retrieves the comic with the strongest characters (sum of the strength of each character). I am capable of getting the list of comics with the sum of the strength of each character; however, when I try to sort it, the database keeps waiting and everything ends up in a timeout. What am I doing wrong?
Characters model:
{
_id: number,
name: string,
info: {
alignment: string // can be "good" or "bad"
}
stats: {
strength: number
},
comics: [] //array of numbers referencing the id of the comic
}
Comics model:
{
_id: number,
name: string
}
And here my query:
db.comics.aggregate(
{
$lookup: {
from: 'characters',
let: {
comic_id: '$_id',
},
as: 'total_comic_str',
pipeline: [
{
$match: {
$expr: {
$and: [
{$in: ['$$comic_id', '$comics']}, // the character is from this comic
{$eq: ['$info.alignment', 'good']} // the character is a hero
]
}
}
},
{
$group: { // group by comic id and accumulate strength of each hero
_id: '$$comic_id',
str: {
$sum: '$stats.strength'
}
}
}
]
}
},
{
$unwind: {
path: '$total_comic_str',
preserveNullAndEmptyArrays: false
}
},
{
$sort: {
'total_comic_str.str': -1
}
},
{
$limit: 1
}
)
You are facing a cursor timeout.
When you have a query cursor (like what returns by find()) you can set noCursorTimeout() (which is generally not a good practice) to prevent the issue.
But when using an aggregation, the Cursor type is different so there is no noCursorTimeout.
As a solution, you can use the $out pipeline stage to store the aggregation result into a temporary collection, then work with the generated collection as you wish.
$lookup with pipeline has been shown to have performance issues for large collections
So I would suggest using just the $lookup without a pipeline. This will work for your particular dataset, which has a relatively large characters collection and presumably smaller comics arrays
First, it's better to index what you are going to use in $lookup, so you should add an index for the field comics for this to have a meaningful improvement.
Since the characters will be in a subdocument array, we are going to use $reduce instead of $group to calculate total strength
Your aggregation pipeline should look like this
[
{
$lookup: {
from: "characters",
localField: "_id", // lookup with _id only we will filter out alignment later
foreignField: "comics",
as: "characters"
}
},
{
$project: {
name: true,
total_strength: {
$reduce: {
input: "$characters",
initialValue: 0,
in: {
$add: [
"$$value",
{
$cond: [
{ $eq: [ "$$this.info.alignment", "good"] }, // calculating only "good" character here
"$$this.stats.strength",
0
]
}
]
}
}
}
}
},
{
$sort: { total_strength: -1 }
},
{
$limit: 1
}
]

MongoDB remove duplicate subdocuments inside array based on a specific field

My documents have the following structure:
{
_id: ObjectId("59303aa1bad1081d4b98d636"),
clear_number: "83490",
items: [
{
name: "83490_1",
file_id: "e7209bbb",
hash: "2f568bb196f74263c64b7cf273f8ceaa",
},
{
name: "83490_2",
file_id: "9a56a935",
hash: "9c6230f7bf19d3f3186c6c3231ac2055",
},
{
name: "83490_2",
file_id: "ce5f6773",
hash: "9c6230f7bf19d3f3186c6c3231ac2055",
}
],
group_id: null
}
How to remove one of two subdocuments with the same items hash?
The following should do the trick if I understand your question correctly:
collection.aggregate({
$unwind: "$items" // flatten the items array
}, {
$group: {
"_id": { "_id": "$_id", "clear_number": "$clear_number", "group_id": "$group_id", "hash": "$items.hash" }, // per each document group by hash value
"items": { $first: "$items" } // keep only the first of all matching ones per group
}
}, {
$group: {
"_id": { "_id": "$_id._id", "clear_number": "$_id.clear_number", "group_id": "$_id.group_id" }, // now let's group everything again without the hashes
"items": { $push: "$items" } // push all single items into the "items" array
}
}, {
$project: { // this is just to restore the original document layout
"_id": "$_id._id",
"clear_number": "$_id.clear_number",
"group_id": "$_id.group_id",
"items": "$items"
}
})
In response to your comment I would suggest the following query to get the list of all document ids that contain duplicate hashes:
collection.aggregate({
$addFields: {
"hashes": {
$setUnion: [
[ { $size: "$items.hash" } ], // total number of hashes
[ { $size: { $setUnion: "$items.hash" } } ] // number of distinct hashes
]
}
}
}, {
$match:
{
"hashes.1": { $exists: true } // find all documents with a different value for distinct vs total number of hashes
}
}, {
$project: { _id: 1 } // only return _id field
})
There might be different approaches but this one seems pretty straight forward:
Basically, in the $addFields part, for each document, we first create an array consisting of two numbers:
the total number of hashes
the number of distinct hashes
Then we drive this array of two numbers through a $setUnion. After this step there can
either be two different numbers left in the array in which case the hash field does contain duplicates
or there is only one element left, in which case the number of distinct hashes equals the total number of hashes (so there are no duplicates).
We can check if there are two items in the array by testing if the element at position 1 (arrays are zero-based!) exists. That's what the $match stage does.
And the final $project stage is just to limit the output to the _id field only.

Resources