Find duplicate urls in mongodb [duplicate] - database

This question already has answers here:
Find duplicate records in MongoDB
(10 answers)
Closed 2 years ago.
I have a DB with news articles, and I am trying to do a little DB cleaning. I want to find all duplicate documents, and the best way i think to accomplish this by using the url field. My documents are structured as follows:
{
_id:
author:
title:
description:
url:
urlToImage:
publishedAt:
content:
summarization:
source_id:
}
Any help is greatly appreciated

Assuming a collection documents with name (using name instead of url) field consisting duplicate values. I have two aggregations which return some output which can be used to do further processing. I hope you will find this useful.
{ _id: 1, name: "jack" },
{ _id: 2, name: "john" },
{ _id: 3, name: "jim" },
{ _id: 4, name: "john" }
{ _id: 5, name: "john" },
{ _id: 6, name: "jim" }
Note that "john" has 3 occurrances and "jim" has 2.
(1) This aggregation returns the names which have duplicates (more than one occurance):
db.collection.aggregate( [
{
$group: {
_id: "$name",
count: { $sum: 1 }
}
},
{
$group: {
_id: "duplicate_names",
names: { $push: { $cond: [ { $gt: [ "$count", 1 ] }, "$_id", "$DUMMY" ] } }
}
}
] )
The output:
{ "_id" : "duplicate_names", "names" : [ "john", "jim" ] }
(2) The following aggregation just returns the _id field values for the duplicate documents. For example, the name "jim" has _idvalues 3 and 6. The output has only the id's for the duplicate documents, i.e., 6.
db.colection.aggregate( [
{
$group: {
_id: "$name",
count: { $sum: 1 },
ids: { $push: "$_id" }
}
},
{
$group: {
_id: "duplicate_ids",
ids: { $push: { $slice: [ "$ids", 1, 9999 ] } }
}
},
{
$project: {
ids: {
$reduce: {
input: "$ids",
initialValue: [ ],
in: { $concatArrays: [ "$$this", "$$value" ] }
}
}
}
}
] )
The output:
{ "_id" : duplicate_ids", "ids" : [ 6, 4, 5 ] }

Related

Mongodb: check that all the fields of the elements of an array of objects respect a condition

I have a database of a the employees of a company that looks like this:
{
_id: 7698,
name: 'Blake',
job: 'manager',
manager: 7839,
hired: ISODate("1981-05-01T00:00:00.000Z"),
salary: 2850,
department: {name: 'Sales', location: 'Chicago'},
missions: [
{company: 'Mac Donald', location: 'Chicago'},
{company: 'IBM', location: 'Chicago'}
]
}
I have an exercise in which I need to write the MongoDb command that returns all them employees who did all their missions in Chicago. I struggle with the all because I cannot find a way to check that all the locations of the missions array are equal to 'Chicago'.
I was thinking about doing it in two time: first find the total number of missions the employee has and then compare it to the number of mission he has in Chicago (that how I would do in SQL I guess). But I cannot found the number of mission the employee did in Chicago. Here is what I tried:
db.employees.aggregate([
{
$match: { "missions": { $exists: true } }
},
{
$project: {
name: 1,
nbMissionsChicago: {
$sum: {
$cond: [
{
$eq: [{
$getField: {
field: { $literal: "$location" },
input: "$missions"
}
}, "Chicago"]
}, 1, 0
]
}
}
}
}
])
Here is the result :
{ _id: 7698, name: 'Blake', nbMissionsChicago: 0 }
{ _id: 7782, name: 'Clark', nbMissionsChicago: 0 }
{ _id: 8000, name: 'Smith', nbMissionsChicago: 0 }
{ _id: 7902, name: 'Ford', nbMissionsChicago: 0 }
{ _id: 7499, name: 'Allen', nbMissionsChicago: 0 }
{ _id: 7654, name: 'Martin', nbMissionsChicago: 0 }
{ _id: 7900, name: 'James', nbMissionsChicago: 0 }
{ _id: 7369, name: 'Smith', nbMissionsChicago: 0 }
First of all, is there a better method to check that all the locations of the missions array respect the condition? And why does this commands returns only 0 ?
Thanks!
If all you need is the agents who had all their missions in "Chicago" then you don't need an aggregation pipeline for it, specifically the approach of filtering the array as part of the aggregation can't utilize an index and will make performance even worse.
A simple query should suffice here:
db.collection.find({
$and: [
{
"missions": {
$exists: true
}
},
{
"missions.location": {
$not: {
$gt: "Chicago"
}
}
},
{
"missions.location": {
$not: {
$lt: "Chicago"
}
}
}
]
})
Mongo Playground
This way we can build an index on the missions field and utilize it properly, any documents with a different value other then "Chigaco" will not match as they will fail the $gt or $lt comparion.
Note that an empty array also matches the condition, you can change the generic "missions" exists condition key into "missions.0": {$exists: true}, this will also require at least one mission.
You are unable to get the correct result as it is not the correct way to iterate the element in an array field.
Instead, you need to work with $size operator to get the size of an array and the $filter operator to filter the document.
Updated: You can directly compare the filtered array with the original array.
db.employees.aggregate([
{
$match: {
"missions": {
$exists: true
}
}
},
{
$project: {
name: 1,
nbMissionsChicago: {
$eq: [
{
$filter: {
input: "$missions",
cond: {
$eq: [
"$$this.location",
"Chicago"
]
}
}
},
"$missions"
]
}
}
}
])
Demo # Mongo Playground

MongoDB using skip and distinct in a query based on values inside an array

So I have document that is structure like this
_id: ObjectId('62bbe17d8fececa06b91873d')
clubName: 'test'
staff:[
'62bbe47f8fececa06b9187d8'
'624f4b56ab4f5170570cdba3' //IDS of staff members
]
A single staff can be assigned to multiple clubs so what I'm trying to achieve is to get all staff that has been assigned to at least one club and display them on a table on the front end, I followed this solution since distinct and skip can't be used on a single query but it just returned this:
[
{ _id: [ '624f5054ab4f5170570cdd16', '624f5054ab4f5170570cdd16' ] } //staff from club 1,
{ _id: [ '624f5054ab4f5170570cdd16', '624f9194ab4f5170570cded1' ] } //staff from club 2,
{ _id: [ '624f4b56ab4f5170570cdba3' ]} //staff from club 3
]
my desired outcome would be like this:
[ _id : ['624f5054ab4f5170570cdd16', '624f9194ab4f5170570cded1', '624f4b56ab4f5170570cdba3'] ]
here's my query:
const query = this.clubModel.aggregate(
[{ $group: { _id: '$staff' } }, { $skip: 0}, { $limit: 10}],
(err, results) => {
console.log(results);
},
);
the values returned are not distinct at all, is there an operation that can evaluate the value inside an array and make them distinct?
Here's my new query after adding the 'createdAt' field in my document structure:
const query = this.clubModel.aggregate([
{ $sort: { createdAt: -1 } },
{
$unwind: '$drivers',
},
{
$project: {
isActive: true,
},
},
{
$group: {
_id: 'null',
ids: {
$addToSet: '$drivers',
},
},
},
{
$project: {
_id: 0,
},
},
{
$skip: skip,
},
{
$limit: limit,
},
]);
Does this works for you, first UNWIND the staff array, and then group on "_id" as null and add staff values using $addToSet:
db.collection.aggregate([
{
"$unwind": "$staff"
},
{
"$group": {
"_id": "null",
"ids": {
"$addToSet": "$staff"
}
}
},
{
"$project": {
"_id": 0,
}
},
{
$skip: 0
},
{
$limit: 10
}
])
Here's the working link.

How to get specific fields on document MangoDB&Mongoose and aggregate some of the fields?

My data looks like this:
[
{
"_id":"61717cafd351f3ae8b6d205a",
"restaurant":"Hogwarts",
"purchasedAt":"2021-10-20T17:47:40.166Z",
"products":[
{
"name":"Meat Samosa",
"price":3.95,
"quantity":1,
"_id":"61717cafd351f3ae8b6d205b"
},
{
"name":"Pilau Rice",
"price":2.95,
"quantity":1,
"_id":"61717cafd351f3ae8b6d205f"
}
]
},
{
"_id":"61717cb2d351f3ae8b6dd05b",
"restaurant":"Hogwarts",
"purchasedAt":"2021-10-20T03:14:11.111Z",
"products":[
{
"name":"Pilau Rice",
"price":2.95,
"quantity":1,
"_id":"61717cb2d351f3ae8b6dd05d"
}
]
},
]
I am trying to find a query that will get me all the products (no duplicates) and their quantities added up. Notice that the products id are different even when they are the same(same name) Ideally my response would look like this
[
{
name: "Meat Samosa",
price: 3.95,
quantity: 1
},
{
name: "Pilau Rice",
price: 2.95,
quantity: 2
}
]
$project to show required fields
$unwind deconstruct the products array
$group by name and get the first price and count the quantity sum
$project to show required fields
db.collection.aggregate([
{
$project: {
_id: 0,
products: 1
}
},
{ $unwind: "$products" },
{
$group: {
_id: "$products.name",
price: { $first: "$products.price" },
quantity: { $sum: "$products.quantity" }
}
},
{
$project: {
_id: 0,
name: "$_id",
price: 1,
quantity: 1
}
}
])
Playground

Mongodb user table with friends

I have a USER table with documents:
{
_id: 1,
name: 'funny-guy43',
image: '../../../img1.jpg',
friends: [2, 3]
},
{
_id: 2,
name: 'SurfinGirl3',
image: '../../../img2.jpg',
friends: []
},
{
_id: 3,
name: 'FooBarMan',
image: '../../../img3.jpg',
friends: [2]
}
friends is an array of USER _ids. (1) I want to get user by _id, (2) look at his friends and (3) query the USER table with the friend ids to return all friends.
for example, find user 1, query the table based on his friends 2 and 3, and return 2 and 3.
Can I do that in one transaction? Or do I query the table to get user array of friends, then query the table again with array of friends ids.
I'm using .Net Core if that matters.
I am very open to alternative approaches as well.
It is, in fact, possible to do this in one transaction. Or, to be more exact, in one aggregation.
I would first split the users into 2 different subsets, one called searched_user and the other other_users, where searched_user will have only the user we are searching for and other_users will have everyone else. We can do that using $facet. Here is the idea:
{
"$facet": {
"searched_user": [
{
$match: {
_id: 1
}
}
],
"other_users": [
{
$match: {
_id: {
$ne: 1
}
}
}
]
}
}
Once they are separated like this, we can search the other_users subset using the friend ids from the searched_user. So here is the full aggregation:
db.collection.aggregate([
{
"$facet": {
"searched_user": [
{
$match: {
_id: 1
}
}
],
"other_users": [
{
$match: {
_id: {
$ne: 1
}
}
}
]
}
},
{
"$unwind": "$searched_user"
},
{
$project: {
user_friends: {
$filter: {
input: "$other_users",
as: "other_users",
cond: {
$in: [
"$$other_users._id",
"$searched_user.friends"
]
}
}
}
}
}
])
Here we are looking for user 1 and the result will be user 1's friends.
[
{
"user_friends": [
{
"_id": 2,
"friends": [],
"image": "../../../img2.jpg",
"name": "SurfinGirl3"
},
{
"_id": 3,
"friends": [
2
],
"image": "../../../img3.jpg",
"name": "FooBarMan"
}
]
}
]
Playground: https://mongoplayground.net/p/-8pNnQXg8r6
You can achieve this by using lookup in aggregation, Tried it with MongoDB version v4.2.11.
db.users.aggregate([
{
'$match': {
'_id': 1,
}
},
{
'$lookup': {
'from' : 'users',
'let' : {
'friendIds': '$friends',
},
'pipeline': [
{
'$match':{
'$expr': {'$in': [ '$_id', '$$friendIds']}
}
}
],
'as': 'friendsArr'
}
}
])
Result:
[
{
"_id" : 1,
"name" : "funny-guy43",
"image" : "../../../img1.jpg",
"friends" : [
2,
3
],
"friendsArr" : [
{
"_id" : 2,
"name" : "SurfinGirl3",
"image" : "../../../img2.jpg",
"friends" : [ ]
},
{
"_id" : 3,
"name" : "FooBarMan",
"image" : "../../../img3.jpg",
"friends" : [
2
]
}
]
}
]

Is there a way to check if two values in separate "columns" are equal?

For example, with this data:
{id: 1, fname: "Barry", lname: "Sullivan"}
{id: 2, fname: "Sarah", lname: "Bailey"}
{id: 3, fname: "Drake", lname: "Barry"}
Is there a way, with a single query, that I could check to see if anyone had the same lname as id: 1 fname?
You can use $facet to run two separate queries and get the result as one document. This will give you two separate arrays: 1-element with id:1 and the other documents. Then you can simply run $filter to get matching lnames:
db.collection.aggregate([
{
$facet: {
first: [ { $match: { id: 1 } } ],
others: [ { $match: { $expr: { $ne: [ "$id", 1 ] } } } ]
}
},
{
$unwind: "$first"
},
{
$project: {
matches: {
$filter: {
input: "$others",
cond: { $eq: [ "$$this.lname", "$first.lname" ] }
}
}
}
}
])
Mongo Playground

Resources