Mongo DB find value in array of multiple nested arrays - arrays

I need to check if an ObjectId exists in a non nested array and in multiple nested arrays, I've managed to get very close using the aggregation framework, but got stuck in the very last step.
My documents have this structure:
{
"_id" : ObjectId("605ce5f063b1c2eb384c2b7f"),
"name" : "Test",
"attrs" : [
ObjectId("6058e94c3994d04d28639616"),
ObjectId("6058e94c3994d04d28639627"),
ObjectId("6058e94c3994d04d28639622"),
ObjectId("6058e94c3994d04d2863962e")
],
"variations" : [
{
"varName" : "Var1",
"attrs" : [
ObjectId("6058e94c3994d04d28639616"),
ObjectId("6058e94c3994d04d28639627"),
ObjectId("6058e94c3994d04d28639622"),
ObjectId("60591791d4d41d0a6817d23f")
],
},
{
"varName" : "Var2",
"attrs" : [
ObjectId("60591791d4d41d0a6817d22a"),
ObjectId("60591791d4d41d0a6817d255"),
ObjectId("6058e94c3994d04d28639622"),
ObjectId("60591791d4d41d0a6817d23f")
],
},
],
"storeId" : "9acdq9zgke49pw85"
}
Let´s say I need to check if this if this _id exists "6058e94c3994d04d28639616" in all arrays named attrs.
My aggregation query goes like this:
db.product.aggregate([
{
$match: {
storeId,
},
},
{
$project: {
_id: 0,
attrs: 1,
'variations.attrs': 1,
},
},
{
$project: {
attrs: 1,
vars: '$variations.attrs',
},
},
{
$unwind: '$vars',
},
{
$project: {
attr: {
$concatArrays: ['$vars', '$attrs'],
},
},
},
]);
which results in this:
[
{
attr: [
6058e94c3994d04d28639616,
6058e94c3994d04d28639627,
6058e94c3994d04d28639622,
6058e94c3994d04d2863962e,
6058e94c3994d04d28639616,
6058e94c3994d04d28639627,
6058e94c3994d04d28639622,
60591791d4d41d0a6817d23f,
60591791d4d41d0a6817d22a,
60591791d4d41d0a6817d255,
6058e94c3994d04d28639622,
60591791d4d41d0a6817d23f
]
},
{
attr: [
60591791d4d41d0a6817d22a,
60591791d4d41d0a6817d255,
6058e94c3994d04d28639622,
60591791d4d41d0a6817d23f,
6058e94c3994d04d28639624,
6058e94c3994d04d28639627,
6058e94c3994d04d28639628,
6058e94c3994d04d2863963e
]
}
]
Assuming I have two products in my DB, I get this result. Each element in the outermost array is a different product.
The last bit, which is checking for this key "6058e94c3994d04d28639616", I could not find a way to do it with $group, since I dont have keys to group on.
Or with $match, adding this to the end of the aggregation:
{
$match: {
attr: "6058e94c3994d04d28639616",
},
},
But that results in an empty array. I know that $match does not query arrays like this, but could not find a way to do it with $in as well.
Is this too complicated of a Schema? I cannot have the original data embedded, since it is mutable and I would not be happy to change all products if something changed.
Will this be very expensive if I had like 10000 products?
Thanks in advance

You are trying to compare string 6058e94c3994d04d28639616 with ObjectId. Convert the string to ObjectId using $toObjectId operator when perform $match operation like this:
{
$match: {
$expr: {
$in: [{ $toObjectId: "6058e94c3994d04d28639616" }, "$attr"]
}
}
}

Related

Finding documents in mongodb collection by order of elements index of array field

Array field in collection:
"fruits": [ "fruits": [ "fruits": [
{"fruit1": "banana"}, {"fruit2": "apple"}, {"fruit3": "pear"},
{"fruit2": "apple"}, {"fruit4": "orange"}, {"fruit2": "apple"},
{"fruit3": "pear"}, {"fruit1": "banana"}, {"fruit4": "orange"},
{"fruit4": "orange"} {"fruit3": "pear"} {"fruit1": "banana"}
]
I need to find those documents in collections, where "banana" signed before "apple". Does mongodb allows to compare elements in array just like :
if (fruits.indexOf('banana') < fruits.indexOf('apple')) return true;
Or maybe there is any other method to get result i need?
MongoDB's array query operations do not support any positional search as you want.
You can, however, write a $where query to do what you want:
db.yourCollection.find({
$where: function() {
return (this.fruits.indexOf('banana') < this.fruits.indexOf('apple'))
}
})
Be advised though, you won't be able to use indexes here and the performance will be a problem.
Another approach you can take is to rethink the database design, if you can specify what it is you're trying to build, someone can give you specific advise.
One more approach: pre-calculate the boolean value before persisting to DB as a field and query on true / false.
Consider refactoring your schema if possible. The dynamic field names(i.e. fruit1, fruit2...) make it unnecessarily complicated to construct a query. Also, if you require frequent queries by array index, you should probably store your array entries in individual documents with some sort keys to facilitate sorting with index.
Nevertheless, it is achievable through $unwind and $group the documents again. With includeArrayIndex clause, you can get the index inside array.
db.collection.aggregate([
{
"$unwind": {
path: "$fruits",
includeArrayIndex: "idx"
}
},
{
"$addFields": {
fruits: {
"$objectToArray": "$fruits"
}
}
},
{
"$addFields": {
"bananaIdx": {
"$cond": {
"if": {
$eq: [
"banana",
{
$first: "$fruits.v"
}
]
},
"then": "$idx",
"else": "$$REMOVE"
}
},
"appleIdx": {
"$cond": {
"if": {
$eq: [
"apple",
{
$first: "$fruits.v"
}
]
},
"then": "$idx",
"else": "$$REMOVE"
}
}
}
},
{
$group: {
_id: "$_id",
fruits: {
$push: {
"$arrayToObject": "$fruits"
}
},
bananaIdx: {
$max: "$bananaIdx"
},
appleIdx: {
$max: "$appleIdx"
}
}
},
{
$match: {
$expr: {
$lt: [
"$bananaIdx",
"$appleIdx"
]
}
}
},
{
$unset: [
"bananaIdx",
"appleIdx"
]
}
])
Mongo Playground

Mongo updateMany statement with an inner array of objects to manipulate

I'm struggling to write a Mongo UpdateMany statement that can reference and update an object within an array.
Here I create 3 documents. Each document has an array called innerArray always containing a single object, with a single date field.
use test;
db.innerArrayExample.insertOne({ _id: 1, "innerArray": [ { "originalDateTime" : ISODate("2022-01-01T01:01:01Z") } ]});
db.innerArrayExample.insertOne({ _id: 2, "innerArray": [ { "originalDateTime" : ISODate("2022-01-02T01:01:01Z") } ]});
db.innerArrayExample.insertOne({ _id: 3, "innerArray": [ { "originalDateTime" : ISODate("2022-01-03T01:01:01Z") } ]});
I want to add a new date field, based on the original date field, to end up with this:
{ _id: 1, "innerArray": [ { "originalDateTime" : ISODate("2022-01-01T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-01T12:01:01Z") } ]}
{ _id: 2, "innerArray": [ { "originalDateTime" : ISODate("2022-01-02T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-02T12:01:01Z") } ]}
{ _id: 3, "innerArray": [ { "originalDateTime" : ISODate("2022-01-03T01:01:01Z"), "copiedDateTime" : ISODate("2022-01-03T12:01:01Z") } ]}
In pseudo code I am saying take the originalDateTime, run it through a function and add a related copiedDateTime value.
For my specific use-case, the function I want to run strips the timezone from originalDateTime, then overwrites it with a new one, equivalent to the Java ZonedDateTime function withZoneSameLocal. Aka 9pm UTC becomes 9pm Brussels (therefore effectively 7pm UTC). The technical justification and methodology were answered in another Stack Overflow question here.
The part of the query I'm struggling with, is the part that updates/selects data from an element inside an array. In my simplistic example, for example I have crafted this query, but unfortunately it doesn't work:
This function puts copiedDateTime in the correct place... but doesn't evaluate the commands to manipulate the date:
db.innerArrayExample.updateMany({ "innerArray.0.originalDateTime" : { $exists : true }}, { $set: { "innerArray.0.copiedDateTime" : { $dateFromString: { dateString: { $dateToString: { "date" : "$innerArray.0.originalDateTime", format: "%Y-%m-%dT%H:%M:%S.%L" }}, format: "%Y-%m-%dT%H:%M:%S.%L", timezone: "Europe/Paris" }}});
// output
{
_id: 1,
innerArray: [
{
originalDateTime: ISODate("2022-01-01T01:01:01.000Z"),
copiedDateTime: {
'$dateFromString': {
dateString: { '$dateToString': [Object] },
format: '%Y-%m-%dT%H:%M:%S.%L',
timezone: 'Europe/Paris'
}
}
}
]
}
This simplified query, also has the same issue:
b.innerArrayExample.updateMany({ "innerArray.0.originalDateTime" : { $exists : true }}, { $set: { "innerArray.0.copiedDateTime" : "$innerArray.0.originalDateTime" }});
//output
{
_id: 1,
innerArray: [
{
originalDateTime: ISODate("2022-01-01T01:01:01.000Z"),
copiedDateTime: '$innerArray.0.originalDateTime'
}
]
}
As you can see this issue looks to be separate from the other stack overflow question. Instead of being able changing timezones, it's about getting things inside arrays to update.
I plan to take this query, create 70,000 variations of it with different location/timezone combinations and run it against a database with millions of records, so I would prefer something that uses updateMany instead of using Javascript to iterate over each row in the database... unless that's the only viable solution.
I have tried putting $set in square brackets. This changes the way it interprets everything, evaluating the right side, but causing other problems:
test> db.innerArrayExample.updateMany({ "_id" : 1 }, [{ $set: { "innerArray.0.copiedDateTime" : "$innerArray.0.originalDateTime" }}]);
//output
{
_id: 1,
innerArray: [
{
'0': { copiedDateTime: [] },
originalDateTime: ISODate("2022-01-01T01:01:01.000Z")
}
]
}
Above it seems to interpret .0. as a literal rather than an array element. (For my needs I know the array only has 1 item at all times). I'm at a loss finding an example that meets my needs.
I have also tried experimenting with the arrayFilters, documented on my mongo updateMany documentation but I cannot fathom how it works with objects:
test> db.innerArrayExample.updateMany(
... { },
... { $set: { "innerArray.$[element].copiedDateTime" : "$innerArray.$[element].originalDateTime" } },
... { arrayFilters: [ { "originalDateTime": { $exists: true } } ] }
... );
MongoServerError: No array filter found for identifier 'element' in path 'innerArray.$[element].copiedDateTime'
test> db.innerArrayExample.updateMany(
... { },
... { $set: { "innerArray.$[0].copiedDateTime" : "$innerArray.$[element].originalDateTime" } },
... { arrayFilters: [ { "0.originalDateTime": { $exists: true } } ] }
... );
MongoServerError: Error parsing array filter :: caused by :: The top-level field name must be an alphanumeric string beginning with a lowercase letter, found '0'
If someone can help me understand the subtleties of the Mongo syntax and help me back on to the right path I'd be very grateful.
You want to be using pipelined updates, the issue you're having with the syntax you're using is that it does not allow the usage of aggregation operators and document field values.
Here is a quick example on how to do it:
db.collection.updateMany({},
[
{
"$set": {
"innerArray": {
$map: {
input: "$innerArray",
in: {
$mergeObjects: [
"$$this",
{
copiedDateTime: "$$this.originalDateTime"
}
]
}
}
}
}
}
])
Mongo Playground

Mongodb: Query the size of nested arrays

I have the following Schema:
Schema({
caller_address: {
type: String,
required: true,
},
traces: [[{
type: mongoose.Schema.Types.ObjectId,
ref: 'Call',
}]]
});
And I would like to retrieve only the objects that have traces with the Calls amount bigger than a specified number. In other words, the size of at least one nested array of traces should be bigger than a specified number.
I'm trying to use $elemMatch and $size, but no success. For now, I have this code:
CallerTraces.find({ 'traces' : { $elemMatch: { $size : { $gt: minTraceSize } }}})
Where minTraceSize is an int.
Could you guys help me?
I would really appreciate it!
Thanks for the sample data. My answer will be a raw MQL solution, not a mongoose solution, so some translation will be required.
I was able to insert two documents based on your comments in your post. I had to change the ObjectId of one of the two sample documents because your samples had the same primary key value and was generating a duplicate key exception.
Insert Sample Data
db.CallerTraces.insert(
{
"_id": ObjectId("6175e7ecc62cff004462d4a6"),
"traces": [
[
ObjectId("6175e7ecc62cff004462d4a4")
]
],
"caller_address": "0x4e204793bc4b8acee32edaf1fbba1f3ea45f7990"
})
db.CallerTraces.insert(
{
"_id": ObjectId("6175e7ecc62cff004462d4a7"),
"traces": [
[
ObjectId("6175e7ecc62cff004462d4a4"),
ObjectId("6175e7ecc62cff004462d4a4")
],
[
ObjectId("6175e7ecc62cff004462d4a4")
]
],
"caller_address": "0x4e204793bc4b8acee32edaf1fbba1f3ea45f7990"
})
If I want to find records having more than 0 items in the array traces I can issue the following:
Find more than zero traces
db.CallerTraces.find({ $expr: { $gt: [ { $size: "$traces" }, 0 ] } })
This returns the following:
Enterprise replSet [primary] barrydb> db.CallerTraces.find({ $expr: { $gt: [ { $size: "$traces" }, 0 ] } })
[
{
_id: ObjectId("6175e7ecc62cff004462d4a6"),
traces: [ [ ObjectId("6175e7ecc62cff004462d4a4") ] ],
caller_address: '0x4e204793bc4b8acee32edaf1fbba1f3ea45f7990'
},
{
_id: ObjectId("6175e7ecc62cff004462d4a7"),
traces: [
[
ObjectId("6175e7ecc62cff004462d4a4"),
ObjectId("6175e7ecc62cff004462d4a4")
],
[ ObjectId("6175e7ecc62cff004462d4a4") ]
],
caller_address: '0x4e204793bc4b8acee32edaf1fbba1f3ea45f7990'
}
]
Find more than 1 trace
If instead I want to find more than one trace I simply alter the query slightly:
db.CallerTraces.find({ $expr: { $gt: [ { $size: "$traces" }, 1 ] } })
... and this returns with the following results:
Enterprise replSet [primary] barrydb> db.CallerTraces.find({ $expr: { $gt: [ { $size: "$traces" }, 1 ] } })
[
{
_id: ObjectId("6175e7ecc62cff004462d4a7"),
traces: [
[
ObjectId("6175e7ecc62cff004462d4a4"),
ObjectId("6175e7ecc62cff004462d4a4")
],
[ ObjectId("6175e7ecc62cff004462d4a4") ]
],
caller_address: '0x4e204793bc4b8acee32edaf1fbba1f3ea45f7990'
}
]
Conclusion
When attempting to evaluate the length of the array within the query processor we must elect to use the $eval option as the syntax for MQL does not consider your use case. The $eval is somewhat of a catch-all option for things that do not fit nicely in the MQL framework.
UPDATE #1
OP introduced additional requirements. Rather than look at the count of the array, we must consider the count of the array within the array (nested inner array). Since the find() method with the $expr cannot evaluate nested arrays we must instead use the aggregation framework and unwind the outer array. This example stores the original form in a new field called original then replaces root after all the evaluation is complete. Since unwinding can result in duplicates in the pipeline we finalize with a $group to suppress duplicates.
Solution
db.CallerTraces.aggregate([
{
$addFields: {
"original._id": "$_id",
"original.traces": "$traces",
"original.caller_address": "$caller_address"
}
},
{
$unwind: "$traces"
},
{
$match: { $expr: { $gt: [ { $size: "$traces" }, 1 ] } }
},
{
$replaceRoot: { newRoot: "$original" }
},
{
$group:
{
_id: "$_id",
traces: { "$first": "$traces" },
caller_address: { "$first": "$caller_address" }
}
}
])

How to fix MongoDB array concatination error?

I have a collection in mongodb with a few million documents. there is an attribute(categories) that is an array that contains all the categories that a document belongs to. I am using following query to convert the array into a comma separated string to add it to SQL server through a spoon transformation.
for example
the document has ["a","b","c",...] and i need a,b,c,.... so i can pit it in a column
categories: {
$cond: [
{ $eq: [{ $type: "$categories" }, "array"] },
{
$trim: {
input: {
$reduce: {
input: "$categories",
initialValue: "",
in: { $concat: ["$$value", ",", "$$this"] }
}
}
}
},
"$categories"
]
}
when i run the query i get the following error and i cannot figure out what the problem is.
com.mongodb.MongoQueryException: Query failed with error code 16702 and error message '$concat only supports strings, not array' on server
a few documents had this attribute as string and not array so i added a type check. but still the issue is there. any help on how to narrow down the issue will be very appreciated.
A few other attributes were the same in the same collection and this query is working fine for the rest of them.
I don't see any problem in your aggregation. It shouldn't give this error. Can you try to update your mongodb version?
However, your aggregation is not working properly reduce wasn't working . I converted it to this:
db.collection.aggregate([
{
"$project": {
categories: {
$cond: [
{
$eq: [{ $type: "$categories" }, "array"]
},
{
'$reduce': {
'input': '$categories',
'initialValue': '',
'in': {
'$concat': [
'$$value',
{ '$cond': [{ '$eq': ['$$value', ''] }, '', ', '] },
'$$this'
]
}
}
},
"$categories"
]
}
}
}
])
Edit:
So, if you have nested arrays in the categories field. We can flat our arrays with unwind stage. So if you can add these 3 stages above the $project stage. Our aggregation will work.
{
"$unwind": "$categories"
},
{
"$unwind": "$categories"
},
{
"$group": {
_id: null,
categories: {
$push: "$categories"
}
}
},
Playground

MongoDB query to find document with duplicate value in array

tldr; I'm struggling to construct a query to
Make an aggregation to get a count of values on a certain key ("original_text_source"), which
Is in a sub-document that is in an array
Full description
I have embedded documents with arrays that are structured like this:
{
"_id" : ObjectId("0123456789"),
"type" : "some_object",
"relationships" : {
"x" : [ ObjectId("0123456789") ],
"y" : [ ObjectId("0123456789") ],
},
"properties" : [
{
"a" : "1"
},
{
"b" : "1"
},
{
"original_text_source" : "foo.txt"
},
]
}
The docs were created from exactly 10k text files, sorted in various folders. During inserting documents into the MongoDB (in batches) I messed up and moved a few files around, causing one file to be imported twice (my database has a count of exactly 10001 docs), but obviously I don't know which one it is. Since one of the "original_text_source" values has to have a count of 2, I was planning on just deleting one.
I read up on solutions with $elemMatch, but since my array element is a document, I'm not sure how to proceed. Maybe with mapReduce? But I can't transfer the logic to my doc structure.
I also could just create a new collection and reupload all, but in case I mess up again, I'd rather like to learn how to query for duplicates. It seems more elegant :-)
You can find duplicates with a simple aggregation like this:
db.collection.aggregate(
{ $group: { _id: "$properties.original_text_source", docIds: { $push: "$_id" }, docCount: { $sum: 1 } } },
{ $match: { "docCount": { $gt: 1 } } }
)
which gives you something like this:
{
"_id" : [
"foo.txt"
],
"docIds" : [
ObjectId("59d6323613940a78ba1d5ffa"),
ObjectId("59d6324213940a78ba1d5ffc")
],
"docCount" : 2.0
}
Run the following:
db.collection.aggregate([
{ $group: {
_id: { name: "$properties.original_text_source" },
idsForDuplicatedDocs: { $addToSet: "$_id" },
count: { $sum: 1 }
} },
{ $match: {
count: { $gte: 2 }
} },
{ $sort : { count : -1} }
]);
Given a collection which contains two copies of the document you showed in your question, the above command will return:
{
"_id" : {
"name" : [
"foo.txt"
]
},
"idsForDuplicatedDocs" : [
ObjectId("59d631d2c26584cd8b7b3337"),
ObjectId("59d631cbc26584cd8b7b3333")
],
"count" : 2
}
Where ...
The attribute _id.name is the value of the duplicated properties.original_text_source
The attribute idsForDuplicatedDocs contains the _id values for each of the documents which have a duplicated properties.original_text_source
"reviewAndRating": [
{
"review": "aksjdhfkashdfkashfdkjashjdkfhasdkjfhsafkjhasdkjfhasdjkfhsdakfj",
"productId": "5bd956f29fcaca161f6b7517",
"_id": "5bd9745e2d66162a6dd1f0ef",
"rating": "5"
},
{
"review": "aksjdhfkashdfkashfdkjashjdkfhasdkjfhsafkjhasdkjfhasdjkfhsdakfj",
"productId": "5bd956f29fcaca161f6b7518",
"_id": "5bd974612d66162a6dd1f0f0",
"rating": "5"
},
{
"review": "aksjdhfkashdfkashfdkjashjdkfhasdkjfhsafkjhasdkjfhasdjkfhsdakfj",
"productId": "5bd956f29fcaca161f6b7517",
"_id": "5bd974622d66162a6dd1f0f1",
"rating": "5"
}
]

Resources