I'm trying to analyze a funnel using event data in Elasticsearch and have difficulties finding an efficient query to extract that data.
For example, in Elasticsearch I have:
timestamp action user id
--------- ------ -------
2015-05-05 12:00 homepage 1
2015-05-05 12:01 product page 1
2015-05-05 12:02 homepage 2
2015-05-05 12:03 checkout 1
I would like to extract the funnel statistics. For example:
homepage_count product_page_count checkout_count
-------------- ------------------ --------------
2 1 1
Where homepage_count represent the distinct number of users who visited the homepage, product_page_count represents the distinct numbers of users who visited the homepage after visiting the homepage, and checkout_count represents the number of users who checked out after visiting the homepage and the product page.
What would be the best query to achieve that with Elasticsearch?
This can be achieved with a combination of a terms aggregation for the actions and then a cardinality sub-aggregation for the unique user count per action, like below. note that I've also added a range query in case you want to restrict the period to observe:
{
"size": 0,
"query": {
"range": {
"timestamp": {
"gte": "2021-06-01",
"lte": "2021-06-07"
}
}
},
"aggs": {
"actions": {
"terms": {
"field": "action"
},
"aggs": {
"users": {
"cardinality": {
"field": "user_id"
}
}
}
}
}
}
UPDATE
This is a typical case where the scripted_metric aggregation comes in handy. The implementation is a bit naive, but it shows you the basics of implementing a funnel.
POST test/_search
{
"size": 0,
"aggs": {
"funnel": {
"scripted_metric": {
"init_script": """
state.users = new HashMap()
""",
"map_script": """
def user = doc['user'].value.toString();
def action = doc['action.keyword'].value;
if (!state.users.containsKey(user)) {
state.users[user] = [
'homepage': false,
'product': false,
'checkout': false
];
}
state.users[user][action] = true;
""",
"combine_script": """
return state.users;
""",
"reduce_script": """
def global = [
'homepage': 0,
'product': 0,
'checkout': 0
];
def res = [];
for (state in states) {
for (user in state.keySet()) {
if (state[user].homepage) global.homepage++;
if (state[user].product) global.product++;
if (state[user].checkout) global.checkout++;
}
}
return global;
"""
}
}
}
}
The above aggregation will return exactly the numbers you expect, i.e.:
"aggregations" : {
"funnel" : {
"value" : {
"product" : 1,
"checkout" : 1,
"homepage" : 2
}
}
}
Related
Despite looking at many other posts I can't figure out what I'm doing wrong. I want to do a simple "count and group by", so I found out I need to use collection.aggregate....
Here is my [pubkey].ts where I execute the aggregation, basically I want to count each entry grouped by address sp I'd get something like :
{address1: 6},
{address2: 1},
...
import { connect } from '../../../utils/db'
import Raffles from '../../../utils/db/raffle'
export default async function handler(req, res) {
const { method } = req
const { pubkey } = req.query
await connect()
switch (method) {
case 'POST':
...
break
case 'GET':
try{
const ticket = await Raffles.aggregate([{
"$match": {
"address": pubkey
}
},{
"$count": { $sum: 1}
}])
res.status(201).json({ success: true, data: ticket })
} catch (error){
res.status(400).json({ success: false, data: error })
}
break
default:
res.status(400).json({ success: false })
break
}
}
I can't figure out how to do the summing part, if I replace "$count": "xxyz" I get a success but right now with "$count" : {$sum: 1} I get an error
Any ideas ?
If you use the $count stage, it only receives a string parameter for the name of the field, so your pipeline could be like this:
[
{
"$match": {
"address": pubkey
}
},
{
"$count": "total"
}
]
It filters the documents that match the address and then gets the count. If you want the count of documents grouped by address, you can use this $group stage in the pipeline (in this case $count is an aggregation accumulator, so it doesn't take any parameter):
[
{
$group: {
"_id": "$address",
"total": {
$count: {}
}
}
}
]
That should give you some results like this:
{_id: address1, total: 6}
{_id: address2, total: 1}
can I get the sum of amount if I have an array like the one below?
[{"_id":"5e154cf38c52231ee19f8",
"refunds":[
{"_id":"5e38f10a754fcf3d48015",
"reason":"refund 1",
"amount":50000,
]},
{"_id":"5e1578b48c52231ee19f8",
"refunds":[
{"_id":"5e37e09ef9ea5e3784043",
"reason":"refund 1",
"amount":100000,
{"_id":"5e37e12a02c27c14580a1",
"reason":"refund 2",
"amount":100000,
{"_id":"5e38f02b754fcf3d48015",
"reason":"refund 3",
"amount":50000,
]},
{"_id":"5e1578b48c52231ee19f8",
"refunds":[]
}]
I hope to get res = 300000
This should be relatively straightforward, we create a flat array of all refund objects using flatMap, then we'll use reduce to sum the total amount.
I've added another function, sumRefundAmountsII to workaround JavaScript environments that do not support.flatMap (pre ES2019)
const data = [ { "_id": "5e154cf38c52231ee19f8", "refunds": [ { "_id": "5e38f10a754fcf3d48015", "reason": "refund 1", "amount": 50000 } ] }, { "_id": "5e1578b48c52231ee19f8", "refunds": [ { "_id": "5e37e09ef9ea5e3784043", "reason": "refund 1", "amount": 100000 }, { "_id": "5e37e12a02c27c14580a1", "reason": "refund 2", "amount": 100000 }, { "_id": "5e38f02b754fcf3d48015", "reason": "refund 3", "amount": 50000 } ] }, { "_id": "5e1578b48c52231ee19f8", "refunds": [] }];
function sumRefundAmounts(array) {
// Get an array of all refunds.
const refunds = array.flatMap(obj => obj.refunds);
// Sum amount for each refund.
return refunds.reduce((sum, refund) => {
return sum + refund.amount;
}, 0);
}
console.log("Total amount:", sumRefundAmounts(data));
// Use this function if your JavaScript environment complains about .flatMap
function sumRefundAmountsII(array) {
// Use a work around if we don't have flatMap...
const refunds = [].concat.apply([], array.map(obj => obj.refunds));
// Sum amount for each refund.
return refunds.reduce((sum, refund) => {
return sum + refund.amount;
}, 0);
}
console.log("Total amount (no flat map):", sumRefundAmountsII(data));
Before answering ... I assume 2 things -
1st - you are looking answer in the Java. 2nd - And you can do things much easier way in java List :
Let we make 2 classes Refund & Composite (Containing List)-
Refund Class -
package com.sabre.ticketing.dhs.service.create.domain;
public class Refund {
String _id;
String reason;
int amount;
public Refund(String _id, String reason, int amount) {
this._id = _id;
this.reason = reason;
this.amount = amount;
}
public int getAmount() {
return amount;
}
}
And here is Composite Class as -
package com.sabre.ticketing.dhs.service.create.domain;
import java.util.List;
public class Composite {
String _id;
List<Refund> refunds;
public Composite(String _id, List<Refund> refunds) {
this._id = _id;
this.refunds = refunds;
}
public List<Refund> getRefunds() {
return refunds;
}
}
The calculation technique is in SumCalc class -
package com.sabre.ticketing.dhs.service.create.domain;
import java.util.ArrayList;
import java.util.List;
public class SumCalc {
public static void main(String[] args){
List<Composite> composites = List.of(new Composite("5e154cf38c52231ee19f8", List.of(new Refund("5e38f10a754fcf3d48015", "refund 1", 50000))),
new Composite("5e154cf38c52231ee19f8",
List.of(new Refund("5e37e09ef9ea5e3784043", "refund 1", 100000),
new Refund("5e37e12a02c27c14580a1", "refund 2", 100000),
new Refund("5e38f02b754fcf3d48015", "refund 3", 50000))),
new Composite("5e154cf38c52231ee19f8", new ArrayList<>()));
// Basically get you Json converted into composites list.. for simplicity and dont want to jackson thing i have initialized list as new ..
Integer finalSum = composites.stream()
.map(Composite::getRefunds)
.flatMap(List::stream)
.map(Refund::getAmount)
.reduce((sum, val) -> sum + val)
.orElse(0);
System.out.println("final Sum is " + finalSum);
}
}
Run the SumClac ...
final Sum is 300000
Process finished with exit code 0
I have a mongo document that contains an array called history:
{
"_id" : ObjectId("575fe85bfe98c1fba0a6e535"),
"email" : "email#address",
"__v" : 0,
"history" : [
{
"name" : "Test123",
"organisation" : "Rat",
"field" : 4,
"another": 3
}
]
}
I want to add fields to each history object or update fields IF the name AND organisation match, however if they don't, I want to add a new object to the array with the queried name and organisation and add/update the other fields to the object when necessary.
So:
This query, finds one that matches:
db.users.find({
email:"email#address",
$and: [
{ "history.name": "Test123", "history.organisation": "Rat"}
]
})
However, I'm struggling to get the update/upsert to work IF that combination of history.name and history.organisation dont exist in the array.
What I think I need to do is a :
"If this history name does not equal 'Test123' AND the history organisation does not equal 'Rat' then add an object to the array with those fields and any other field provided in the update query."
I tried this:
db.users.update({
email:"email#address",
$and: [
{ "history.name": "Test123", "history.organisation": "Rat"}
]
}, {
history: { name: "Test123"},
history: { organisation: "Rat"}
}, {upsert:true})
But that gave me E11000 duplicate key error index: db.users.$email_1 dup key: { : null }
Any help greatly appreciated.
Thanks community!
Not possible with a single atomic update I'm afraid, you would have to do a couple of update operations that satisfy both conditions.
Break down the update logic into two distinct update operations, the first one would require using the positional $ operator to identify the element in the history array you want and the $set to update the existing fields. This operation follows the logic update fields IF the name AND organisation match
Now, you'd want to use the findAndModify() method for this operation since it can return the updated document. By default, the returned document does not include the modifications made on the update.
So, armed with this arsenal, you can then probe your second logic in the next operation i.e. update IF that combination of "history.name" and "history.organisation" don't exist in the array. With this second
update operation, you'd need to then use the $push operator to add the elements.
The following example demonstrates the above concept. It initially assumes you have the query part and the document to be updated as separate objects.
Take for instance when we have documents that match the existing history array, it will just do a single update operation, but if the documents do not match, then the findAndModify() method will return null, use this logic in your second update operation to push the document to the array:
var doc = {
"name": "Test123",
"organisation": "Rat"
}, // document to update. Note: the doc here matches the existing array
query = { "email": "email#address" }; // query document
query["history.name"] = doc.name; // create the update query
query["history.organisation"] = doc.organisation;
var update = db.users.findAndModify({
"query": query,
"update": {
"$set": {
"history.$.name": doc.name,
"history.$.organisation": doc.organisation
}
}
}); // return the document modified, if there's no matched document update = null
if (!update) {
db.users.update(
{ "email": query.email },
{ "$push": { "history": doc } }
);
}
After this operation for documents that match, querying the collection will yield the same
db.users.find({ "email": "email#address" });
Output:
{
"_id" : ObjectId("575fe85bfe98c1fba0a6e535"),
"email" : "email#address",
"__v" : 0,
"history" : [
{
"name" : "Test123",
"organisation" : "Rat",
"field" : 4,
"another" : 3
}
]
}
Now consider documents that won't match:
var doc = {
"name": "foo",
"organisation": "bar"
}, // document to update. Note: the doc here does not matches the current array
query = { "email": "email#address" }; // query document
query["history.name"] = doc.name; // create the update query
query["history.organisation"] = doc.organisation;
var update = db.users.findAndModify({
"query": query,
"update": {
"$set": {
"history.$.name": doc.name,
"history.$.organisation": doc.organisation
}
}
}); // return the document modified, if there's no matched document update = null
if (!update) {
db.users.update(
{ "email": query.email },
{ "$push": { "history": doc } }
);
}
Querying this collection for this document
db.users.find({ "email": "email#address" });
would yield
Output:
{
"_id" : ObjectId("575fe85bfe98c1fba0a6e535"),
"email" : "email#address",
"__v" : 0,
"history" : [
{
"name" : "Test123",
"organisation" : "Rat",
"field" : 4,
"another" : 3
},
{
"name" : "foo",
"organisation" : "bar"
}
]
}
I seem to have an issue in retrieving a selected few documents from a PouchDB using startkey and endkey. I need to get the documents back with key starting with "profile" (in this example profile41 & profile48).
When using the chrome PouchDB extension, this works fine when doing a query using startkey:"profile" and endkey:"profile\0ffff", but for some reason, this doesn't work when running my code (angular/ionic).
My code returns an empty doc list. When I set the startkey:"profile41" and endkey:"profile41" I do get the doc back, so I know it connects and can retrieve documents from the DB.
PS: First use of PouchDB, so I might have overlooked something simple here.
Some documents in my db
{
"_id": "animaltypes",
"_rev": "7-e413c314272a62a6a14ed293f5f934cf",
"value": {
"rev": "7-e413c314272a62a6a14ed293f5f934cf"
},
"key": "animaltypes"
}
{
"_id": "profile41",
"_rev": "3-f4065b825d304d79479e3576409ce744",
"value": {
"rev": "3-f4065b825d304d79479e3576409ce744"
},
"key": "profile41"
}
{
"_id": "profile48",
"_rev": "3-5e62a6e33f022a8ac30d46b80126dedd",
"value": {
"rev": "3-5e62a6e33f022a8ac30d46b80126dedd"
},
"key": "profile48"
}
My javascript that retrieves docs
this.getData = function(keystart,keyend){
var deferred = $q.defer();
localDB.allDocs({include_docs: true,
descending: true,
startkey:keystart,
endkey:keyend}, function(err, doc) {
if (err) {
deferred.reject(err);
} else {
var rows = [];
for (var x in doc.rows) {
rows.push(doc.rows[x].doc.data);
}
deferred.resolve(rows);
}
});
return deferred.promise;
};
This returns an empty array with the following params
startkey = "profile"
endkey = "profile\0ffff"
This returns the correct single doc when parameters are set to
startkey = "profile41"
endkey = "profile41"
If you want to use "descending:true" you have to change the order of startkey and endkey.
Indeed, if you don't use "descending:true" you B-Tree looks like that:
1- animaltypes
2 - profile41
3 - profile48
startkey="profile" will be between id 1 and id 2
endkey="profile\0ffff" will be after id 3
In result you will have 2 records
If you use "descending=true" you B-Tree looks like that:
1- profile48
2 - profile41
3 - animaltypes
startkey="profile" will be between id 2 and id 3
endkey="profile\0ffff" will be after id 2 and id 3
in result you will have 0 record.
I have a big collection of songs and want to get most played songs per week, in a array. as example:
{
"_id" : {
"title" : "demons savaites hitas",
"name" : "imagine dragons"
},
"value" : {
"weeks" : [
{
"played" : 56,
"week" : 9,
"year" : 2014
}
]
}
}
It sometimes becomes:
{
"_id" : {
"title" : "",
"name" : "top 15"
},
"value" : {
"played" : 1,
"week" : 8,
"year" : 2014
}
}
The collection which i get the data from is named songs and new fields get added all the time when a songs get added. No unique artistnames or songtitles and every document in the collection looks like this:
{
"_id" : ObjectId("530536e3d4ca1a783342f1c8"),
"week" : 8,
"artistname" : "City Shakerz",
"songtitle" : "Love Somebody (Summer 2012 Mix Edit)",
"year" : 2014,
"date" : ISODate("2014-02-19T22:57:39.926Z")
}
I now want to do a mapreduce which add the new week to the array. It now overwrites it.
I also noted when trying to change to a array, not all the played get counted, with the new mapreduce.
The new mapreduce not working, with weeks:
map = function () {
if (this.week == 9 && this.year == 2014) emit({title:this.songtitle.toLowerCase(), name:this.artistname.toLowerCase()}, {played:1, week:this.week, year:this.year});
}
reduce = function(k, values) {
var result = {};
result.weeks = new Array();
var object = {played:0, week: 0, year: 0};
values.forEach(function(value) {
object.played += value.played;
object.week = value.week;
object.year = value.year;
});
result.weeks.push(object);
return result;
}
db.songs.mapReduce(map,reduce,{out: {reduce:"played2"}})
This is the old one i'm using with is a new field in the collection per week and song:
map = function () {
if (this.week == 10 && this.year == 2014) emit({title:this.songtitle.toLowerCase(), name:this.artistname.toLowerCase(), week:this.week, year:this.year}, {count:1});
}
reduce = function(k, values) {
var result = {count: 0,};
values.forEach(function(value) {
result.count += value.count;
});
return result;
}
db.songs.mapReduce(map,reduce,{out: {merge:"played"}})
I get the information fro the toplist right now from played2 like this:
db.played2.find({'_id.week': 9,'_id.year': 2014}).sort(array("value.count" => -1)).limit(50)
Above line can include any typo because i use mongoclient for php and needed to change it to javascript syntax for you.
What am I doing wrong?
I found out that I could do mapreduce as the code snippet above and then just get this week in a query and another one for previous week and do simple double for with a if to update this week with previous week place.
I made the script in python, which i run also for my mapreduce as a cronjob. As example:
if len(sys.argv) > 1 and sys.argv[1] is not None:
week = int(sys.argv[1])
else:
week = (datetime.date.today().isocalendar()[1]) - 1
year = datetime.date.today().year
previous_week = week - 1
client = MongoClient()
db = client.db
played = db.played
print "Updating it for week: " + str(week)
previous = played.find({"_id.week": previous_week, "_id.year": year}).sort("value.count", -1).limit(50)
thisweek = played.find({"_id.week": week, "_id.year": year}).sort("value.count", -1).limit(50)
thisplace = 1
for f in thisweek:
previous.rewind() # Reset second_collection_records's iterator
place = 1
if previous.count() > 0:
checker = bool(1)
for s in previous:
if s["_id"]["name"] == f["_id"]["name"] and s["_id"]["title"] == f["_id"]["title"]:
result = played.update({"_id.week": f["_id"]["week"], "_id.year": f["_id"]["year"], "_id.title": f["_id"]["title"], "_id.name": f["_id"]["name"]}, {"$set": {"place.previous_week":place, "place.this_week":thisplace}})
checker = bool(0)
print result
place = place + 1
if checker is True:
result = played.update({"_id.week": f["_id"]["week"], "_id.year": f["_id"]["year"], "_id.title": f["_id"]["title"], "_id.name": f["_id"]["name"]}, {"$set": {"place.previous_week":0, "place.this_week":thisplace}})
print result
else:
result = played.update({"_id.week": f["_id"]["week"], "_id.year": f["_id"]["year"], "_id.title": f["_id"]["title"], "_id.name": f["_id"]["name"]}, {"$set": {"place.previous_week":0, "place.this_week":thisplace}})
print result
thisplace = thisplace + 1
print "done."
This seems to work very good. Hopefully mongodb adds support to just update a field or anything in mapreduce to add information to a document without overwrite it.
I'm taking a stab at the structure of your collection based on your input fields, but I don't think mapReduce is the tool you want. Your apparent desired output can be achieved using aggregate :
db.collection.aggregate([
// Match a specific week and year if you want - remove if you want all
{ "$match": { "year": inputYear, "week": inputWeek } },
// Group to get the total number of times played
{ "$group": {
"_id": {
"title": { "$toLower": "$songtitle" },
"name": { "$toLower": "$artistname" },
"week": "$week",
"year": "$year"
},
played: { "$sum": 1 }
}},
// Sort the results by the most played in the range
{ "$sort": { "year": -1, "week": -1, "played": -1 } },
// Optionally limit to the top 15 results
{ "$limit": 15 }
])
That basically is what you appear to be trying to do. So this sums up the "number of appearances" as the number of times played. Then we take the additional steps of sorting the results, and optionally (if you can live with looking for one week at a time) limits the results to a set number. Those last two steps you won't get with mapReduce.
If you are ultimately looking for the "top ten" for each week, as a single query result, then you can look at this for a discussion (and methods to achieve) what we call the "topN" results problem.