Hierarchically flatten MongoDB collection of documents with arrays into documents + arrays

Block model (blocks chain parent to child: block 0 -> block 1 -> block 2 -> block 3 -> […]):
Example input document [700+ of these in the modulestore.structures collection]:
{
  _id: ObjectId('5932d50ff8f46c0a8098ab79'),
  blocks: [
    {
      definition: ObjectId('5923556ef8f46c0a787e9c0f'),
      block_type: 'chapter',
      block_id: '5b053a7f10ba41df85a3221c3ef3956e',
      fields: {
        format: 'Foo exam',
        children: [
          ['sequential', '9f1e58553ad448818ec8e7915d3d94d3'],
          ['sequential', 'f052c7aa44274769a4631e95405834e0']
        ]
      }
    },
    {
      definition: ObjectId('59235569f8f46c0a7be1debc'),
      block_type: 'sequential',
      block_id: '9f1e58553ad448818ec8e7915d3d94d3',
      fields: {
        display_name: 'FooBar'
      }
    },
    {
      definition: ObjectId('59317406f8f46c0a8098aaf5'),
      block_type: 'sequential',
      block_id: 'f052c7aa44274769a4631e95405834e0',
      fields: {
        display_name: 'CanHaz'
      }
    }
  ]
}
My goal here is to:
flatten out the blocks so all blocks are at the collection level;
iterate over each block's children array with a cursor;
walk and amend the 'tree' so that each child/grandchild/great-grandchild/*-child gets a new property top_ancestor_fields containing the fields property of its topmost ancestor.
Example output:
[
  {
    _id: ObjectId('5a00f611f995363c2b63c9a6'),
    block_type: 'chapter',
    block_id: '5b053a7f10ba41df85a3221c3ef3956e',
    fields: {
      format: 'Foo exam',
      children: [
        ['sequential', '9f1e58553ad448818ec8e7915d3d94d3'],
        ['sequential', 'f052c7aa44274769a4631e95405834e0']
      ]
    },
    top_ancestor_fields: {
      format: 'Foo exam'
    }
  },
  {
    _id: ObjectId('5a00f611f995363c2b63c9a7'),
    block_id: '9f1e58553ad448818ec8e7915d3d94d3',
    block_type: 'sequential',
    fields: {
      display_name: 'FooBar'
    },
    top_ancestor_fields: {
      format: 'Foo exam'
    }
  },
  {
    _id: ObjectId('5a00f611f995363c2b63c9a8'),
    block_id: 'f052c7aa44274769a4631e95405834e0',
    block_type: 'sequential',
    fields: {
      display_name: 'CanHaz'
    },
    top_ancestor_fields: {
      format: 'Foo exam'
    }
  }
]
I almost have it working, based on Neil Lunn's suggestion:
db.modulestore.structures.aggregate([
  { $unwind: '$blocks' },
  { $project: {
      _id: 0,
      block_id: '$blocks.block_id',
      children: '$blocks.fields.children',
      display_name: '$blocks.fields.display_name',
      block_type: '$blocks.block_type',
      exam: '$blocks.fields.format',
      fields: '$blocks.fields'
  }},
  { $out: 'modulestore.mapped0' }
])
db.modulestore.mapped0.aggregate([
  { $graphLookup: {
      from: 'modulestore.mapped0',
      startWith: '$block_id',
      connectToField: 'children',
      connectFromField: 'block_id',
      as: 'block_ids',
      maxDepth: 0
  } },
  { $unwind: '$block_ids' },
  { $project: {
      name: 1,
      _id: 0,
      ancestor: '$block_ids.block_id'
  } },
  { $out: 'modulestore.mapped1' }
]);
But this just hangs. I've tried configuring the $graphLookup maxDepth option. FYI: db.modulestore.mapped0.count() is 80772 for me.
Each document potentially contains a children array with up to 180 elements.
Not sure how to approach the larger pipeline needed to map the children hierarchies…
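One thing worth checking first (an assumption on my part, not something confirmed in this thread): $graphLookup performs an equality match on the connectToField for every node it visits, so with 80772 documents an unindexed connectToField can make the stage look like it hangs. A quick sketch:
db.modulestore.mapped0.createIndex({ children: 1 })  // index the $graphLookup join field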

The following should get you started:
db.modulestore.structures.aggregate([{
  $unwind: '$blocks' // flatten "blocks" array
}, {
  $replaceRoot: { // move "blocks" field to top level
    newRoot: "$blocks"
  }
}, {
  $unwind: { // flatten "fields.children" array
    path: "$fields.children",
    preserveNullAndEmptyArrays: true
  }
}, {
  // this step is technically not needed but it might speed up things - try running with and without that
  $addFields: { // we only keep the second (last, really) entry of all your arrays since this is the only valid join key for the graphLookup
    "fields.children": {
      $slice: [ "$fields.children", -1 ]
    }
  }
}, {
  $unwind: { // flatten "fields.children" array one more time because it was nested before
    path: "$fields.children",
    preserveNullAndEmptyArrays: true
  }
}, {
  $group: { // reduce the number of lookups required later by eliminating duplicate parent-child paths
    "_id": "$block_id",
    "block_type": { $first: "$block_type" },
    "definition": { $first: "$definition" },
    "fieldsFormat": { $first: "$fields.format" },
    "fieldsChildren": { $addToSet: "$fields.children" }
  }
}, {
  $project: { // restore original structure
    "block_id": "$_id",
    "block_type": "$block_type",
    "definition": "$definition",
    "fields": {
      "format": "$fieldsFormat",
      "children": "$fieldsChildren"
    }
  }
}, { // spit out the result into "modulestore.mapped0" collection, overwriting all existing content
  $out: 'modulestore.mapped0'
}])
and then
db.modulestore.mapped0.aggregate([{
  $graphLookup: {
    from: 'modulestore.mapped0',
    startWith: '$block_id',
    connectToField: 'fields.children',
    connectFromField: 'block_id',
    as: 'block_ids',
    maxDepth: 0
  }
}, {
  $lookup: {
    from: 'modulestore.mapped0',
    localField: 'block_ids.fields.children',
    foreignField: '_id',
    as: 'block_ids.fields.children'
  }
}])
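That gets the data flattened and joinable, but it stops one step short of attaching top_ancestor_fields. Here is a minimal sketch of the remaining step (my own continuation, not part of the answer above): drop maxDepth so $graphLookup can walk all the way up, record each hop in a depthField, and keep the fields of the deepest ancestor, falling back to the block's own fields for root blocks:
db.modulestore.mapped0.aggregate([
  { $graphLookup: {
      from: 'modulestore.mapped0',
      startWith: '$block_id',
      connectFromField: 'block_id',      // a parent's own id ...
      connectToField: 'fields.children', // ... is listed in its parent's children
      as: 'ancestors',
      depthField: 'depth'                // 0 = parent, 1 = grandparent, ...
  } },
  { $addFields: {
      top_ancestor_fields: {
        $cond: [
          { $eq: [ { $size: '$ancestors' }, 0 ] },
          '$fields',                     // a root block is its own topmost ancestor
          { $let: {
              vars: { top: { $arrayElemAt: [
                { $filter: {
                    input: '$ancestors',
                    as: 'a',
                    cond: { $eq: [ '$$a.depth', { $max: '$ancestors.depth' } ] }
                } },
                0
              ] } },
              in: '$$top.fields'
          } }
        ]
      }
  } },
  { $project: { ancestors: 0 } },
  { $out: 'modulestore.mapped1' }
])
The depthField makes "topmost" concrete: direct parents come back with depth 0, so the ancestor at the maximum depth is the root of that block's tree. As noted above, an index on fields.children is what keeps this traversal from crawling.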

Partial solution [gist]:
def update_descendants(modulestore, blocks, ancestor_fields):
    """
    :param modulestore: modulestore collection containing the blocks
    :type modulestore: ``Collection``
    :param blocks: iterator over the blocks (documents within the modulestore)
    :type blocks: ``Cursor`` | ``tuple``
    :param ancestor_fields: fields of the topmost ancestor
    :type ancestor_fields: ``dict``
    """
    for block in blocks:
        # update_d (defined in the linked gist) presumably returns a copy of
        # `block` with the `add` keys merged in and the `rm` keys removed
        modulestore.replace_one({'block_id': block['block_id'],
                                 'block_type': block['block_type']},
                                update_d(block, add={'ancestor_fields': ancestor_fields},
                                         rm=('_id',)))
        update_descendants.counter += 1
        print('Updated:', update_descendants.counter)
        if 'children' in block and block['children']:
            for block_type, block_id in block['children']:
                update_descendants(modulestore,
                                   modulestore.find({'block_id': block_id,
                                                     'block_type': block_type,
                                                     'ancestor_fields': {
                                                         '$exists': False
                                                     }}),
                                   ancestor_fields)
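For completeness, a hypothetical driver (my own sketch, not part of the gist) would initialise the counter attribute and seed the recursion from the top-level chapter blocks:
# hypothetical entry point: the counter attribute must exist before the first call
update_descendants.counter = 0
for top in modulestore.find({'block_type': 'chapter'}):
    update_descendants(modulestore, (top,), top['fields'])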
Would prefer a solution that's wholly in the database though, and without all these inefficient queries.

Related

Mongo aggregation framework match a given _id

My model:
const scheduleTaskSchema = new Schema({
  activity: { type: Object, required: true },
  date: { type: Date, required: true },
  crew: Object,
  vehicle: Object,
  pickups: Array,
  details: String,
});
const ScheduleTaskModel = mongoose.model("schedule_task", scheduleTaskSchema);
and this aggregation pipeline:
let aggregation = [
  {
    $sort: {
      "pickups.0.time": 1,
    },
  },
  {
    $group: {
      _id: "$date",
      tasks: { $push: "$$ROOT" },
    },
  },
  { $sort: { _id: -1 } },
];
if (hasDateQuery) {
  aggregation.unshift({
    $match: {
      date: { $gte: new Date(start_date), $lte: new Date(end_date) },
    },
  });
} else {
  aggregation.push({ $limit: 2 });
}
const scheduledTasksGroups = await ScheduleTaskModel.aggregate(aggregation);
The crew object can have an arbitrary number of keys with this structure:
crew: {
  drivers: [
    { _id: "656b1e9cf5b894a4f2v643bc", name: "john" },
    { _id: "567b1e9cf5b954a4f2c643bhh", name: "bill" }
  ],
  officers: [
    { _id: "655b1e9cf5b6632a4f2c643jk", name: "mark" },
    { _id: "876b1e9af5b664a4f2c234bb", name: "jane" }
  ],
  // ...any number of keys that contain an array of objects that all have an _id
}
I'm looking for a way to return all documents (before sorting/grouping) that contain a given _id anywhere within the crew object, without knowing which key to search; it can be many different keys, each containing an array of objects that all have an _id.
Any ideas?
You can use $objectToArray for this:
db.collection.aggregate([
  {$addFields: {crewFilter: {$objectToArray: "$crew"}}},
  {$set: {
    crewFilter: {$size: {
      $reduce: {
        input: "$crewFilter",
        initialValue: [],
        in: {$concatArrays: [
          "$$value",
          {$filter: {
            input: "$$this.v",
            as: "member",
            cond: {$eq: ["$$member._id", _id]}
          }}
        ]}
      }
    }}
  }},
  {$match: {crewFilter: {$gt: 0}}}
])
See how it works on the playground example
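For reference, the intermediate crewFilter that $objectToArray produces for the crew document above looks like this (illustration only, ids truncated); turning unknown keys into k/v pairs is what lets $reduce and $filter walk them:
[
  { k: "drivers",  v: [ { _id: "656b...", name: "john" }, { _id: "567b...", name: "bill" } ] },
  { k: "officers", v: [ { _id: "655b...", name: "mark" }, { _id: "876b...", name: "jane" } ] }
]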

aggregation in nodejs resulting in nested json, can I get it without nesting, taking only one data _id from all collections

Aggregation in Node.js is resulting in nested JSON; can I get it without nesting, taking only one _id from each collection?
I was trying aggregation in Node.js with the code below and got the output given in the output section, but I would like the expected output instead, since I can't use a loop inside a loop.
Student.aggregate([
  {
    $match: { name: 'abcd' }
  },
  {
    $lookup: {
      from: 'teachers',
      pipeline: [
        { $match: { name: 'pqrs' } },
        { $project: { "_id": 1 } }
      ],
      as: "teacherLookup"
    }
  },
  {
    $lookup: {
      from: 'subjects',
      pipeline: [
        { $match: { name: 'computer' } },
        { $project: { "_id": 1 } }
      ],
      as: "subjectLookup"
    }
  }
])
output
[
  {
    _id: '52301c7878965455d2a4',
    teacherLookup: [ '5ea737412589688930' ],
    subjectLookup: [ '5ea745821369999917' ]
  }
]
I am expecting the output as (without nested JSON):
[
  {
    studentId: '5ea1c7878965455d2a4',
    teacherId: '5ea737412589688930',
    subjectId: '5ea745821369999917'
  }
]
You can use $arrayElemAt to get the first element from the array.
Student.aggregate([
  {
    $match: { name: "abcd" },
  },
  {
    $lookup: {
      from: "teachers",
      pipeline: [
        { $match: { name: "pqrs" } },
        { $project: { _id: 1 } },
      ],
      as: "teacherId",
    },
  },
  {
    $lookup: {
      from: "subjects",
      pipeline: [
        { $match: { name: "computer" } },
        { $project: { _id: 1 } },
      ],
      as: "subjectId",
    },
  },
  {
    $project: {
      teacherId: { $arrayElemAt: ["$teacherId", 0] },
      subjectId: { $arrayElemAt: ["$subjectId", 0] },
    },
  },
]);
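If you also want studentId instead of _id, as in the expected output, the final $project can be extended like this (a sketch; it assumes each lookup array holds documents of the shape { _id: ... }, so the inner _id is extracted before taking the first element):
{
  $project: {
    _id: 0,
    studentId: "$_id",
    teacherId: { $arrayElemAt: ["$teacherId._id", 0] },
    subjectId: { $arrayElemAt: ["$subjectId._id", 0] },
  },
}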

Lookup VS Lookup with pipeline MongoDB (Performance & How it works internally)

I'm making a blog and have a question about which would give me better performance: a simple lookup or a lookup with a pipeline. Sometimes the simple lookup gives me the faster result and sometimes the pipeline lookup does, so I'm a bit confused about which one to use and where. Suppose I have 2 collections, a users and a comments collection.
// Users Collection
{
  _id: "MONGO_OBJECT_ID",
  userName: "Web Alchemist"
}
// Comments Collection
{
  _id: "MONGO_OBJECT_ID",
  userId: "USER_MONGO_OBJECT_ID",
  isActive: "YES", // YES or NO
  comment: "xyz"
}
Now I want to look up from the users collection into comments. Which one would be better for this? I made two queries which give me the same result.
[
  {
    $match: { _id: ObjectId("5d68c019c7d56410cc33b01a") }
  },
  {
    $lookup: {
      from: "comments",
      as: "comments",
      localField: "_id",
      foreignField: "userId"
    }
  },
  {
    $unwind: "$comments"
  },
  {
    $match: {
      "comments.isActive": "YES"
    }
  },
  { $limit: 5 },
  {
    $project: {
      _id: 1, userName: 1, comments: { _id: "$comments._id", comment: "$comments.comment" }
    }
  },
  {
    $group: {
      _id: "$_id",
      userName: { '$first': '$userName' },
      comments: { $addToSet: "$comments" }
    }
  }
]
OR
[
  {
    $match: { _id: ObjectId("5d68c019c7d56410cc33b01a") }
  },
  {
    $lookup: {
      from: "comments",
      as: "comments",
      let: { userId: "$_id" },
      pipeline: [
        {
          $match: {
            $expr: {
              $and: [
                { $eq: ['$userId', '$$userId'] },
                { $eq: ['$isActive', 'YES'] }
              ]
            }
          }
        },
        { $limit: 5 },
        { $project: { _id: 1, comment: 1 } }
      ]
    }
  }
]

Updating data type to an Object in mongoDB

I have changed one of the fields of my collection in MongoDB from an array of strings to an array of objects, each containing 2 strings. New documents get inserted without any problem, but when a GET method is called to query all the documents, I get this error:
Failed to decode 'Students'. Decoding 'photoAddresses' errored with: readStartDocument can only be called when CurrentBSONType is DOCUMENT, not when CurrentBSONType is STRING.
photoAddresses is the field that was changed in Students.
I was wondering, is there any way to update all the records so they all have the same data type, without losing any data?
The old version of photoAddresses:
"photoAddresses" : ["something","something else"]
This should be updated to the new version like this:
"photoAddresses" : [{photoAddresses:"something"},{photoAddresses:"something else"}]
The following aggregation queries update the string array to an object array, but only for documents where the array has string elements. The aggregation operator $map is used to map the string array elements to objects. You can use either of the two queries.
db.test.aggregate( [
  {
    $match: {
      $expr: { $and: [ { $isArray: "$photo" },
                       { $gt: [ { $size: "$photo" }, 0 ] } ] },
      "photo.0": { $type: "string" }
    }
  },
  {
    $project: {
      photo: {
        $map: {
          input: "$photo",
          as: "ph",
          in: { addr: "$$ph" }
        }
      }
    }
  }
] ).forEach( doc => db.test.updateOne( { _id: doc._id }, { $set: { photo: doc.photo } } ) )
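On a large collection the one-updateOne-per-document round trips add up; the same idea can be batched with bulkWrite (a sketch reusing the two stages above):
const ops = [];
db.test.aggregate([ /* same $match and $project stages as above */ ]).forEach(doc =>
  ops.push({ updateOne: { filter: { _id: doc._id }, update: { $set: { photo: doc.photo } } } })
);
if (ops.length) db.test.bulkWrite(ops);  // one batched write instead of N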
The following query works with MongoDB version 4.2+ only. Note that the update operation uses an aggregation pipeline instead of an update document. See updateMany.
db.test.updateMany(
  {
    $expr: { $and: [ { $isArray: "$photo" },
                     { $gt: [ { $size: "$photo" }, 0 ] } ] },
    "photo.0": { $type: "string" }
  },
  [
    {
      $set: {
        photo: {
          $map: {
            input: "$photo",
            as: "ph",
            in: { addr: "$$ph" }
          }
        }
      }
    }
  ]
)
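Either way, a quick sanity check after the migration (a sketch) is to count the documents whose first array element is still a string; it should return 0:
db.test.countDocuments({ "photo.0": { $type: "string" } })  // expect 0 after the update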
[EDIT ADD]: The following query works with MongoDB version 3.4:
db.test.aggregate( [
  {
    $addFields: {
      matches: {
        $cond: {
          if: { $and: [
            { $isArray: "$photoAddresses" },
            { $gt: [ { $size: "$photoAddresses" }, 0 ] },
            { $eq: [ { $type: { $arrayElemAt: [ "$photoAddresses", 0 ] } }, "string" ] }
          ] },
          then: true,
          else: false
        }
      }
    }
  },
  {
    $match: { matches: true }
  },
  {
    $project: {
      photoAddresses: {
        $map: {
          input: "$photoAddresses",
          as: "ph",
          in: { photoAddresses: "$$ph" }
        }
      }
    }
  }
] ).forEach( doc => db.test.updateOne( { _id: doc._id }, { $set: { photoAddresses: doc.photoAddresses } } ) )

mongodb - using join on a local variable

I'm using Node.js and MongoDB, and I have an array of objects which holds the display text for each id. Let's say below is my array:
let names = [
  { value: 1, text: 'One' },
  { value: 2, text: 'Two' },
  { value: 3, text: 'Three' },
  { value: 4, text: 'Gour' }
]
And this is my query result on a collection using $group, which gives me the distinct values:
[
  { _id: { code: '1', number: 5 } },
  { _id: { code: '2', number: 5 } },
  { _id: { code: '3', number: 2 } },
  { _id: { code: '4', number: 22 } }
]
$lookup lets us join data from a different collection, but in my case I have an array which holds the text value for each of the codes I got from the query.
Is there a way to map the text from the array to the results from MongoDB?
Any help will be much appreciated.
EDIT
The MongoDB query I was trying:
db.collection.aggregate([
  {
    $match: {
      _Id: id
    }
  },
  {
    $lookup: {
      localField: "code",
      from: names,
      foreignField: "value",
      as: "renderedNames"
    }
  },
  {
    "$group": {
      "_id": {
        code: "$code",
        number: "$number"
      }
    }
  }
]);
A local variable lives in the Node.js app, and MongoDB knows nothing about it.
It looks like it belongs to the presentation layer, where you want to show codes as meaningful names. The mapping should be done there. I believe find is the most suitable here (note that code in the $group result is a string while value is a number, hence the conversion):
names.find(name => String(name.value) === doc._id.code).text
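Wired into the Node.js side, the mapping could look like this (a sketch; the collection name and pipeline variable are assumptions):
const groups = await db.collection('stats').aggregate(pipeline).toArray();
const rendered = groups.map(doc => ({
  ...doc,
  text: names.find(n => String(n.value) === doc._id.code)?.text
}));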
If the names are not truly variable but quite constant, you can move them to their own collection, e.g. codeNames:
db.codeNames.insert([
  { _id: "1", text: 'One' },
  { _id: "2", text: 'Two' },
  { _id: "3", text: 'Three' },
  { _id: "4", text: 'Gour' }
]);
and use $lookup as follows:
db.collection.aggregate([
  {
    $match: {
      _Id: id
    }
  },
  {
    "$group": {
      "_id": {
        code: "$code",
        number: "$number"
      }
    }
  },
  {
    $lookup: {
      localField: "_id.code",
      from: "codeNames",
      foreignField: "_id",
      as: "renderedNames"
    }
  }
]);
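With the sample data above, each group then comes back with its resolved name attached, e.g. (illustration):
{ _id: { code: "1", number: 5 }, renderedNames: [ { _id: "1", text: "One" } ] }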
If none of the above suits your use case, you can pass the names to the database in each request to map names db-side, but you must be really, really sure you cannot use the two previous options:
db.collection.aggregate([
  {
    $match: {
      _Id: id
    }
  },
  {
    "$group": {
      "_id": {
        code: "$code",
        number: "$number"
      }
    }
  },
  {
    $project: {
      renderedNames: { $filter: {
        input: [
          { value: "1", text: 'One' },
          { value: "2", text: 'Two' },
          { value: "3", text: 'Three' },
          { value: "4", text: 'Gour' }
        ],
        as: "name",
        cond: { $eq: [ "$$name.value", "$_id.code" ] }
      } }
    }
  }
]);
As a side note, I find $match: {_Id: id} quite confusing, especially followed by $group. If _Id is _id, it is unique, so you can have no more than 1 document after this stage, and there is not much to group, really.
