MongoDB count array in array - arrays

How to realize count array in array using aggregate? I have this document structure:
{
"_id" : 1,
"link" : [
{
"linkHistory" : [
{
"_id" : 1,
},
{
"_id" : 2,
}
]
}
]
}
and MongoDB code:
db.emailGroup.aggregate([
{
"$lookup":
{
"from": "link",
"localField": "_id",
"foreignField": "emailGroupId",
"as": "link"
},
},
{
"$unwind": "$link"
},
{
"$match": {
'link.originalLink': ""
}
},
{
"$group" : {
_id: '$_id',
link: {
$push: '$link'
}
}
}
])
I want to get count in other field for linkHistory. Is this possible?

Related

Merge two objects inside an array, one with a key nested deeper than the other in the same document in Mongo DB

Schema of the document
id is the key to join
{
"foo" : [
{
"properties" : {
"id" : 1
},
},
{
"properties" : {
"id" : 2
},
}],
"bar" : [
{
"id" : 1,
"metadata" : abc
},
{
"id" : 2,
"metadata" : def
}
]
}
Goal
{
"foo" : [
{
"properties" : {
"id" : 1,
"metadata" : abc
},
},
{
"properties" : {
"id" : 2,
"metadata" : def
},
},
}
You can use the $lookup on the schema and with the help of $unwind and $project you will get the desired result.
Query:
db.foo.aggregate({
$lookup: {
from: "bar",
localField: "properties.id",
foreignField: "id",
as: "properties"
},
},
{
$unwind: {
path: "$properties",
}
},
{
$project: {
_id: 0,
properties: 1
}
})
Output:
[
{
"properties": {
"_id": ObjectId("5a934e000102030405000000"),
"id": 1,
"metadata": "abc"
}
},
{
"properties": {
"_id": ObjectId("5a934e000102030405000001"),
"id": 2,
"metadata": "def"
}
}
]

Counting words in the array - How can I query with mongo?

"sourceList": [
{
"source" : "hello world, how are you?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "hello world, I am fine",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "Is it raining?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
}
]
Total words in hello world, how are you? = 5, in hello world, I am fine = 5, and in Is it raining?= 3.
Thus the total number of words = 13
Is there a mongo query to do this calculation? I could do this using javascript, but is there a direct way to query via mongo?
EDIT
Is there a way I can do this query across the documents? For documents obeying specific criteria, I want to run a similar calculation with an added constraint, that words of duplicate sentences are not counted twice. For example,
Document - 1
"sourceList": [
{
"source" : "hello world, how are you?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "hello world, I am fine",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "Is it raining?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
}
]
Document - 2
"sourceList": [
{
"source" : "hello world, how are you?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "hello world, I am fine",
"_id" : ObjectId("5f0eb9946db57c0007841153")
},
{
"source" : "Is it raining?",
"_id" : ObjectId("5f0eb9946db57c0007841153")
}
]
Here the count still remains the same. The reason being, sentences are exactly same in both the documents. But if we combine Document 1 + Document 3 (given as follows)
"sourceList": [
{
"source" : "Look at the beautiful tiger!",
"_id" : ObjectId("5f0eb9946db57c0007841153")
}
]
The count would come as 13 + 5 (document 3) = 18.
Yes, You can do that with the help of powerful aggregate framework.
mongo play-ground
db.collection.aggregate([
{
"$unwind": "$sourceList" //For each array element
},
{
$project: {
"sp": {
$split: [
"$sourceList.source", //split by spaces
" "
]
}
}
},
{
"$project": {
"sizes": {
"$size": "$sp". //count the words in each array
}
}
},
{
"$group": {
"_id": "$_id",
"count": {
"$sum": "$sizes" //group by id to reverse unwind and add the sizes
}
}
}
])
Update:
play
db.collection.aggregate([
{
"$unwind": "$sourceList"
},
{
$project: {
"sp": {
$split: [
"$sourceList.source",
" "
]
}
}
},
{
"$project": {
"sizes": {
"$size": "$sp"
}
}
},
{
"$group": {
"_id": null,
"count": {
"$sum": "$sizes"
}
}
}
])
For huge collections, you may need to use allowDiskUse but it is very heavy operation for larger collections.
Update:
play
db.collection.aggregate([
{
"$unwind": "$sourceList"
},
{
$project: {
"sp": {
$split: [
"$sourceList.source",
" "
]
}
}
},
{
"$group": {
"_id": null,
"elements": {
$addToSet: "$sp"
}
}
},
{
"$unwind": "$elements"
},
{
"$project": {
"sizes": {
"$size": "$elements"
}
}
},
{
"$group": {
"_id": null,
"count": {
"$sum": "$sizes"
}
}
}
])

lookup on ObjectId's in an array

I have shops collection and user collection with list of shops ids inside of it as strings.
example of shop document:
{
"_id" : ObjectId("5a0c6797fd3eb67969316ce2"),
"picture" : "http://placehold.it/150x150",
"name" : "Genmom",
"email" : "leilaware#genmom.com",
"city" : "Rabat",
"location" : {
"type" : "Point",
"coordinates" : [
-6.79387,
33.83957
]
}
}
example of user collection:
{
"_id" : ObjectId("5c04b943ff491824b806686a"),
"email" : "ayoub.khial#gmail.com",
"password" : "$2a$10$4Wt5Rn6udxREdXCIt3hGb.sKhKUKOlyiYKmLTjYG3SqEPKFSw9phq",
"likedShops" : [
"5a0c6797fd3eb67969316ce2",
"5c07ada8ff49183284e509d1",
"5c07acc1ff49183284e509d0"
],
"dislikedShops" : [ ]
}
I want to return the detail of the likedShops.
You can use below $lookup aggregation
db.users.aggregate([
{ "$lookup": {
"from": "shops",
"let": { "likedShops": "$likedShops" },
"pipeline": [
{ "$match": { "$expr": { "$in": ["$_id", "$$likedShops"] }}}
],
"as": "likedShops"
}}
])
Or if your ids are string then use $toString aggregation with the ObjectIds
db.users.aggregate([
{ "$lookup": {
"from": "shops",
"let": { "likedShops": "$likedShops" },
"pipeline": [
{ "$match": { "$expr": { "$in": [{ "$toString": "$_id" }, "$$likedShops"] }}}
],
"as": "likedShops"
}}
])

Aggregate multiple collections based on student Id

I am trying to aggregate 2 collections in MongoDB based on a student's ID. One collection consists of student personal information, another one consists of the students logs. The issue is that the data is in array which is why I think my aggregation is not working. Any help will be appreciated.
student collection
{
"_id" : ObjectId("(Object ID here"),
"data" : [
{
"name" : "John",
"id" : 1
},
{
"name" : "Sandy",
"id" : 2
}
]
}
logs collection
{
"_id" : ObjectId("(Object ID here"),
"logs" : [
{
"studentId" : 1,
"activity" : "11112,334,123"
},
{
"studentId" : 2,
"activity" : "11112,334,123"
}
]
}
Here is what I have tried:
dbo.collection("student").aggregate([
{ "$lookup": {
"localField": "data.id",
"from": "logs",
"foreignField": "logs.studentId",
"as": "studentInfo"
}
}]).toArray(function(err, results) {
console.log(results);
});
Expected result:
studentinfo: {
id: 1,
name: "John",
activity" : "11112,334,123"
}
You can use below aggregation with mongodb 3.6
So basically your foreign field is an array you need to use $lookup with the pipeline to $unwind the foreign array inside the $lookup pipeline and to match the corresponding ids.
db.students.aggregate([
{ "$lookup": {
"from": "logs",
"let": { "dataId": "$data.id" },
"pipeline": [
{ "$unwind": "$logs" },
{ "$match": { "$expr": { "$in": ["$logs.studentId", "$$dataId"] }}},
{ "$replaceRoot": { "newRoot": "$logs" }}
],
"as": "students"
}}
])
or use this to merge both the arrays
db.students.aggregate([
{ "$lookup": {
"from": "logs",
"let": { "dataId": "$data.id" },
"pipeline": [
{ "$unwind": "$logs" },
{ "$match": { "$expr": { "$in": ["$logs.studentId", "$$dataId"] }}},
{ "$replaceRoot": { "newRoot": "$logs" }}
],
"as": "students"
}},
{ "$project": {
"students": {
"$map": {
"input": "$students",
"in": {
"studentId": "$$this.studentId",
"activity": "$$this.activity",
"name": { "$arrayElemAt": ["$data.name", { "$indexOfArray": ["$data.id", "$$this.studentId"] }]}
}
}
}
}}
])
Output
[
{
"students": [
{
"activity": "11112,334,123",
"name": "John",
"studentId": 1
},
{
"activity": "11112,334,123",
"name": "Sandy",
"studentId": 2
}
]
}
]

how to limit the count of every single array elements in find using $in function in mongodb? [duplicate]

For example, I have these documents:
{
"addr": "address1",
"book": "book1"
},
{
"addr": "address2",
"book": "book1"
},
{
"addr": "address1",
"book": "book5"
},
{
"addr": "address3",
"book": "book9"
},
{
"addr": "address2",
"book": "book5"
},
{
"addr": "address2",
"book": "book1"
},
{
"addr": "address1",
"book": "book1"
},
{
"addr": "address15",
"book": "book1"
},
{
"addr": "address9",
"book": "book99"
},
{
"addr": "address90",
"book": "book33"
},
{
"addr": "address4",
"book": "book3"
},
{
"addr": "address5",
"book": "book1"
},
{
"addr": "address77",
"book": "book11"
},
{
"addr": "address1",
"book": "book1"
}
and so on.How can I make a request, which will describe the top N addresses and the top M books per address?Example of expected result: address1 | book_1: 5 | book_2: 10 | book_3: 50 | total: 65 ______________________ address2 | book_1: 10 | book_2: 10 |... | book_M: 10 | total: M*10... ______________________ addressN | book_1: 20 | book_2: 20 |... | book_M: 20 | total: M*20
TLDR Summary
In modern MongoDB releases you can brute force this with $slice just off the basic aggregation result. For "large" results, run parallel queries instead for each grouping ( a demonstration listing is at the end of the answer ), or wait for SERVER-9377 to resolve, which would allow a "limit" to the number of items to $push to an array.
db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$project": {
"books": { "$slice": [ "$books", 2 ] },
"count": 1
}}
])
MongoDB 3.6 Preview
Still not resolving SERVER-9377, but in this release $lookup allows a new "non-correlated" option which takes an "pipeline" expression as an argument instead of the "localFields" and "foreignFields" options. This then allows a "self-join" with another pipeline expression, in which we can apply $limit in order to return the "top-n" results.
db.books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$lookup": {
"from": "books",
"let": {
"addr": "$_id"
},
"pipeline": [
{ "$match": {
"$expr": { "$eq": [ "$addr", "$$addr"] }
}},
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
],
"as": "books"
}}
])
The other addition here is of course the ability to interpolate the variable through $expr using $match to select the matching items in the "join", but the general premise is a "pipeline within a pipeline" where the inner content can be filtered by matches from the parent. Since they are both "pipelines" themselves we can $limit each result separately.
This would be the next best option to running parallel queries, and actually would be better if the $match were allowed and able to use an index in the "sub-pipeline" processing. So which is does not use the "limit to $push" as the referenced issue asks, it actually delivers something that should work better.
Original Content
You seem have stumbled upon the top "N" problem. In a way your problem is fairly easy to solve though not with the exact limiting that you ask for:
db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
])
Now that will give you a result like this:
{
"result" : [
{
"_id" : "address1",
"books" : [
{
"book" : "book4",
"count" : 1
},
{
"book" : "book5",
"count" : 1
},
{
"book" : "book1",
"count" : 3
}
],
"count" : 5
},
{
"_id" : "address2",
"books" : [
{
"book" : "book5",
"count" : 1
},
{
"book" : "book1",
"count" : 2
}
],
"count" : 3
}
],
"ok" : 1
}
So this differs from what you are asking in that, while we do get the top results for the address values the underlying "books" selection is not limited to only a required amount of results.
This turns out to be very difficult to do, but it can be done though the complexity just increases with the number of items you need to match. To keep it simple we can keep this at 2 matches at most:
db.books.aggregate([
{ "$group": {
"_id": {
"addr": "$addr",
"book": "$book"
},
"bookCount": { "$sum": 1 }
}},
{ "$group": {
"_id": "$_id.addr",
"books": {
"$push": {
"book": "$_id.book",
"count": "$bookCount"
},
},
"count": { "$sum": "$bookCount" }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$unwind": "$books" },
{ "$sort": { "count": 1, "books.count": -1 } },
{ "$group": {
"_id": "$_id",
"books": { "$push": "$books" },
"count": { "$first": "$count" }
}},
{ "$project": {
"_id": {
"_id": "$_id",
"books": "$books",
"count": "$count"
},
"newBooks": "$books"
}},
{ "$unwind": "$newBooks" },
{ "$group": {
"_id": "$_id",
"num1": { "$first": "$newBooks" }
}},
{ "$project": {
"_id": "$_id",
"newBooks": "$_id.books",
"num1": 1
}},
{ "$unwind": "$newBooks" },
{ "$project": {
"_id": "$_id",
"num1": 1,
"newBooks": 1,
"seen": { "$eq": [
"$num1",
"$newBooks"
]}
}},
{ "$match": { "seen": false } },
{ "$group":{
"_id": "$_id._id",
"num1": { "$first": "$num1" },
"num2": { "$first": "$newBooks" },
"count": { "$first": "$_id.count" }
}},
{ "$project": {
"num1": 1,
"num2": 1,
"count": 1,
"type": { "$cond": [ 1, [true,false],0 ] }
}},
{ "$unwind": "$type" },
{ "$project": {
"books": { "$cond": [
"$type",
"$num1",
"$num2"
]},
"count": 1
}},
{ "$group": {
"_id": "$_id",
"count": { "$first": "$count" },
"books": { "$push": "$books" }
}},
{ "$sort": { "count": -1 } }
])
So that will actually give you the top 2 "books" from the top two "address" entries.
But for my money, stay with the first form and then simply "slice" the elements of the array that are returned to take the first "N" elements.
Demonstration Code
The demonstration code is appropriate for usage with current LTS versions of NodeJS from v8.x and v10.x releases. That's mostly for the async/await syntax, but there is nothing really within the general flow that has any such restriction, and adapts with little alteration to plain promises or even back to plain callback implementation.
index.js
const { MongoClient } = require('mongodb');
const fs = require('mz/fs');
const uri = 'mongodb://localhost:27017';
const log = data => console.log(JSON.stringify(data, undefined, 2));
(async function() {
try {
const client = await MongoClient.connect(uri);
const db = client.db('bookDemo');
const books = db.collection('books');
let { version } = await db.command({ buildInfo: 1 });
version = parseFloat(version.match(new RegExp(/(?:(?!-).)*/))[0]);
// Clear and load books
await books.deleteMany({});
await books.insertMany(
(await fs.readFile('books.json'))
.toString()
.replace(/\n$/,"")
.split("\n")
.map(JSON.parse)
);
if ( version >= 3.6 ) {
// Non-correlated pipeline with limits
let result = await books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 },
{ "$lookup": {
"from": "books",
"as": "books",
"let": { "addr": "$_id" },
"pipeline": [
{ "$match": {
"$expr": { "$eq": [ "$addr", "$$addr" ] }
}},
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 },
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]
}}
]).toArray();
log({ result });
}
// Serial result procesing with parallel fetch
// First get top addr items
let topaddr = await books.aggregate([
{ "$group": {
"_id": "$addr",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]).toArray();
// Run parallel top books for each addr
let topbooks = await Promise.all(
topaddr.map(({ _id: addr }) =>
books.aggregate([
{ "$match": { addr } },
{ "$group": {
"_id": "$book",
"count": { "$sum": 1 }
}},
{ "$sort": { "count": -1 } },
{ "$limit": 2 }
]).toArray()
)
);
// Merge output
topaddr = topaddr.map((d,i) => ({ ...d, books: topbooks[i] }));
log({ topaddr });
client.close();
} catch(e) {
console.error(e)
} finally {
process.exit()
}
})()
books.json
{ "addr": "address1", "book": "book1" }
{ "addr": "address2", "book": "book1" }
{ "addr": "address1", "book": "book5" }
{ "addr": "address3", "book": "book9" }
{ "addr": "address2", "book": "book5" }
{ "addr": "address2", "book": "book1" }
{ "addr": "address1", "book": "book1" }
{ "addr": "address15", "book": "book1" }
{ "addr": "address9", "book": "book99" }
{ "addr": "address90", "book": "book33" }
{ "addr": "address4", "book": "book3" }
{ "addr": "address5", "book": "book1" }
{ "addr": "address77", "book": "book11" }
{ "addr": "address1", "book": "book1" }
Using aggregate function like below :
[
{$group: {_id : {book : '$book',address:'$addr'}, total:{$sum :1}}},
{$project : {book : '$_id.book', address : '$_id.address', total : '$total', _id : 0}}
]
it will give you result like following :
{
"total" : 1,
"book" : "book33",
"address" : "address90"
},
{
"total" : 1,
"book" : "book5",
"address" : "address1"
},
{
"total" : 1,
"book" : "book99",
"address" : "address9"
},
{
"total" : 1,
"book" : "book1",
"address" : "address5"
},
{
"total" : 1,
"book" : "book5",
"address" : "address2"
},
{
"total" : 1,
"book" : "book3",
"address" : "address4"
},
{
"total" : 1,
"book" : "book11",
"address" : "address77"
},
{
"total" : 1,
"book" : "book9",
"address" : "address3"
},
{
"total" : 1,
"book" : "book1",
"address" : "address15"
},
{
"total" : 2,
"book" : "book1",
"address" : "address2"
},
{
"total" : 3,
"book" : "book1",
"address" : "address1"
}
I didn't quite get your expected result format, so feel free to modify this to one you need.
Below query will provide exactly the same result as given in the desired response:
db.books.aggregate([
{
$group: {
_id: { addresses: "$addr", books: "$book" },
num: { $sum :1 }
}
},
{
$group: {
_id: "$_id.addresses",
bookCounts: { $push: { bookName: "$_id.books",count: "$num" } }
}
},
{
$project: {
_id: 1,
bookCounts:1,
"totalBookAtAddress": {
"$sum": "$bookCounts.count"
}
}
}
])
The response will be looking like below:
/* 1 */
{
"_id" : "address4",
"bookCounts" : [
{
"bookName" : "book3",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 2 */
{
"_id" : "address90",
"bookCounts" : [
{
"bookName" : "book33",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 3 */
{
"_id" : "address15",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 4 */
{
"_id" : "address3",
"bookCounts" : [
{
"bookName" : "book9",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 5 */
{
"_id" : "address5",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 6 */
{
"_id" : "address1",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 3
},
{
"bookName" : "book5",
"count" : 1
}
],
"totalBookAtAddress" : 4
},
/* 7 */
{
"_id" : "address2",
"bookCounts" : [
{
"bookName" : "book1",
"count" : 2
},
{
"bookName" : "book5",
"count" : 1
}
],
"totalBookAtAddress" : 3
},
/* 8 */
{
"_id" : "address77",
"bookCounts" : [
{
"bookName" : "book11",
"count" : 1
}
],
"totalBookAtAddress" : 1
},
/* 9 */
{
"_id" : "address9",
"bookCounts" : [
{
"bookName" : "book99",
"count" : 1
}
],
"totalBookAtAddress" : 1
}
Since mongoDB version 3.6 this is easy to do, using $group, $slice, $limit, and $sort:
$group the books to count them
$sort so they will be later pushed according to count
$group by address, $push relevant books, and $sum the total per address.
$sort by address total
$limit the address results to topN
Limit the books in the array to topM using $slice
db.collection.aggregate([
{$group: {_id: {book: "$book", addr: "$addr"}, count: {$sum: 1}}},
{$sort: {"_id.addr": 1, count: -1}},
{$group: {
_id: "$_id.addr", totalCount: {$sum: "$count"},
books: {$push: {book: "$_id.book", count: "$count"}}
}
},
{$sort: {totalCount: -1}},
{$limit: topN},
{$set: {addr: "$_id", _id: "$$REMOVE", books: {$slice: ["$books", 0, topM]}}}
])
See how it works on the playground example-v3.4
On mongoDB version 5.2 there is a topN accumulator that can simplify even more:
db.collection.aggregate([
{$group: {_id: {book: "$book", addr: "$addr"}, count: {$sum: 1}}},
{$group: {
_id: "$_id.addr",
totalCount: {$sum: "$count"},
books: {$topN: {output: {book: "$_id.book", count: "$count"},
sortBy: {count: -1},
n: topM
}}
}},
{$sort: {totalCount: -1}},
{$limit: topN},
{$project: {addr: "$_id", _id: 0, books: 1, totalCount: 1}}
])
See how it works on the playground example-v5.2

Resources