30. conventions are fun to play with
(list of tags per day)
> db.user_scores.find(
{"_id": /^4d873ce631f238241d00000d-day-20091106-/}, {"_id": 1}
).map(function(document) {
return document._id.replace(
"4d873ce631f238241d00000d-day-20091106-", ""
)
})
[
"advertising",
"art",
"artist",
"blogging",
"culture",
"html",
"illustration",
"information",
...
]
31. conventions are fun to play with
(anchored regexp uses indexes)
> db.user_scores.find(
{"_id": /^4d873ce631f238241d00000d-day-20091106-/}, {"_id": 1}
).explain()
{
"cursor" : "BtreeCursor _id_ multi",
"nscanned" : 15,
"nscannedObjects" : 15,
"n" : 15,
"millis" : 0,
"indexBounds" : {
"_id" : [
[
"4d873ce631f238241d00000d-day-20091106-",
"4d873ce631f238241d00000d-day-20091106."
],
[
/^4d873ce631f238241d00000d-day-20091106-/,
/^4d873ce631f238241d00000d-day-20091106-/
]
]
32. conventions are fun to play with
(a non-anchored regexp scans the whole index)
> db.user_scores.find(
{"_id": /4d873ce631f238241d00000d-day-20091106-/}, {"_id": 1}
).explain()
{
"cursor" : "BtreeCursor _id_ multi",
"nscanned" : 109349,
"nscannedObjects" : 15,
"n" : 15,
"millis" : 217,
"indexBounds" : {
"_id" : [
...
]
}
}
33. query & design
use the “group” method to do small computations without fetching related documents
34. group to compute data in mongo
(inject client side)
# Client-side aggregation: fetch every score document for the selected days
# and add up their "pomodori" fields in Ruby.
days = [ 20091110, 20091111, 20091112 ]
# One alternative per day, anchored at both ends of the _id.
scores_id = %r{^4d87d00931f2380c7700000d-day-(#{days.join("|")})$}
scores = db["user_scores"].find(:_id => scores_id)
# Block parameters are named differently from the outer locals so nothing
# is shadowed while folding.
pomodori = scores.inject(0) { |sum, score| sum + score["pomodori"] }
puts "Pomodori in days #{days.join(",")}: #{pomodori}"
35. group to compute data in mongo
(inject client side)
days = [ 20091110, 20091111, 20091112 ]
scores_id = %r{^4d87d00931f2380c7700000d-day-(#{days.join("|")})$}
scores = db["user_scores"].find(:_id => scores_id)
pomodori = scores.inject(0) do |pomodori, scores|
pomodori + scores["pomodori"]
end
puts "Pomodori in days #{days.join(",")}: #{pomodori}"

$ ruby src/inject_for_reduce.rb
Pomodori in days 20091110,20091111,20091112: 36
36. group to compute data in mongo
(group server side)
# Server-side aggregation: the same sum as the client-side version, but the
# reduce function is sent to MongoDB through Collection#group, so only the
# aggregated result comes back.
days = [ 20091110, 20091111, 20091112 ]
scores_id = %r{^4d87d00931f2380c7700000d-day-(#{days.join("|")})$}
# JavaScript reducer, kept byte-for-byte as it is sent to the server.
sum_pomodori = <<-EOF
function(document, result) {
result.pomodori += document.pomodori
}
EOF
result = db["user_scores"].group(
  :cond => { :_id => scores_id },
  :initial => { :pomodori => 0 },
  :reduce => sum_pomodori
)
total = result.first["pomodori"]
puts "Pomodori in days #{days.join(",")}: #{total}"
37. group to compute data in mongo
(group server side)
days = [ 20091110, 20091111, 20091112 ]
scores_id = %r{^4d87d00931f2380c7700000d-day-(#{days.join("|")})$}
result = db["user_scores"].group(
:cond => { :_id => scores_id },
:initial => { :pomodori => 0 },
:reduce => <<-EOF
function(document, result) {
result.pomodori += document.pomodori
}
EOF
)
puts "Pomodori in days #{days.join(",")}: #{result.first["pomodori"]}"

$ ruby src/group_for_reduce.rb
Pomodori in days 20091110,20091111,20091112: 36
38. group to compute data in mongo
(ex. sum pomodori by tag “ruby”)
# Sum the pomodori, and count the days, recorded under the tag "ruby".
# The aggregation runs server side through Collection#group; `.first` takes
# the single aggregated document.
result = db["user_scores"].group(
  :cond => {
    # \d{8} matches the eight-digit day stamp in the _id. Without the
    # backslash the pattern would look for eight literal "d" characters.
    :_id => /^4d87d00931f2380c7700000d-day-\d{8}-ruby$/
  },
  :initial => { :pomodori => 0, :days => 0 },
  :reduce => <<-EOF
function(document, result) {
result.days += 1
result.pomodori += document.pomodori
}
EOF
).first
puts "In #{result["days"]} days, #{result["pomodori"]} done for ruby"
39. group to compute data in mongo
(ex. sum pomodori by tag “ruby”)
result = db["user_scores"].group(
:cond => {
:_id => /^4d87d00931f2380c7700000d-day-\d{8}-ruby$/
},
:initial => { :pomodori => 0, :days => 0 },
:reduce => <<-EOF
function(document, result) {
result.days += 1
result.pomodori += document.pomodori
}
EOF
).first
puts "In #{result["days"]} days, #{result["pomodori"]} pomodori"

$ ruby src/group_for_ruby_tag.rb
In 43 days, 45 pomodori
40. group to compute data in mongo
(ex. sum pomodori by tag “ruby”)
> db.user_scores.find({
"_id": /^4d87d00931f2380c7700000d-day-\d{8}-ruby$/
}).explain()
{
"cursor" : "BtreeCursor _id_ multi",
"nscanned" : 43,
"nscannedObjects" : 43,
"n" : 43,
"millis" : 3,
"indexBounds" : {
"_id" : [...]
}
}
41. query & design
create indexes on arrays to create local reverse indexes in documents
42. reverse index in place
(an array could be indexed)
> db.tasks.find({ "tags": { $in: [ "nosqlday" ] } })
{ "_id" : ObjectId("4d7de446175ca8243d000004"),
"tags" : [ "nosqlday" ],
"description" : "#nosqlday keynote",
"is_recurrent" : false,
"estimated" : 0,
"worked_in" : [
"Mon Mar 14 2011 00:00:00 GMT+0100 (CET)",
"Tue Mar 15 2011 00:00:00 GMT+0100 (CET)"
],
"done_at" : "Tue Mar 15 2011 13:05:03 GMT+0100 (CET)",
"todo_at" : null,
"created_at" : "Mon Mar 14 2011 10:47:50 GMT+0100 (CET)",
"updated_at" : "Tue Mar 15 2011 13:05:03 GMT+0100 (CET)",
"keywords": [ "nosqldai", "keynot" ],
"user_id": ObjectId("4d53996c137ce423ff000001"),
"annotations" : [ ]
}
43. reverse index in place
(an array could be indexed)
> db.tasks.getIndexes()
[
{
"name" : "_id_",
"ns" : "app435386.tasks",
"key" : {
"_id" : 1
}
},
{
"name" : "tags_1",
"ns" : "app435386.tasks",
"key" : {
"tags" : 1
},
"unique" : false
},
...
]
44. reverse index in place
(container for deduced data, array)
# Store an order together with a "placed_at" array holding the same instant
# at several granularities, so one indexed array field can answer
# per-year, per-month, per-week and per-day lookups.
placed_at = [
  now.strftime("%Y"),     # year: "2011"
  now.strftime("%Y%m"),   # month: "201103"
  now.strftime("%Yw%U"),  # week: "2011w11"
  now.strftime("%Y%m%d")  # day: "20110316"
]
item_ids = items_in_order.map { |item| item[:id] }
order_total = items_in_order.inject(0) { |total, item| total + item[:price] }
db["orders"].insert({
  :placed_at => placed_at,
  :user_id => user,
  :items => item_ids,
  :total => order_total
})
# ...
# Index on the array field, in descending order.
db["orders"].ensure_index([["placed_at", Mongo::DESCENDING]])
45. reverse index in place
(container for deduced data, array)
> db.orders.findOne()
{ "_id" : ObjectId("4d88bf1f31f23812de0003fd"),
"placed_at" : [ "2011", "201103", "2011w11", "20110316" ],
"user_id" : ObjectId("4d88bf1f31f23812de0003e9"),
"items" : [
ObjectId("4d88bf1f31f23812de0003da"),
ObjectId("4d88bf1f31f23812de000047"),
ObjectId("4d88bf1f31f23812de000078"),
ObjectId("4d88bf1f31f23812de000068"),
ObjectId("4d88bf1f31f23812de000288")
],
"total" : 3502
}
54. plain dates are good too
> db.orders.find({
"placed_at": {
$gte: new Date(2011,2,10),
$lt: new Date(2011,2,11)
}
}).explain()
{
"cursor" : "BtreeCursor placed_at_-1",
"nscanned" : 53,
"nscannedObjects" : 53,
"n" : 53,
"millis" : 0,
"indexBounds" : {
"placed_at" : [
[
"Fri Mar 11 2011 00:00:00 GMT+0100 (CET)",
"Thu Mar 10 2011 00:00:00 GMT+0100 (CET)"
]
]
}
55. plain dates are good too, but...
(total sold on this year’s mondays)
# Build a MongoDB query selecting the orders placed on any Monday of the
# current year, using one date range per Monday.
# NOTE(review): beginning_of_year, beginning_of_day, end_of_day and
# Integer#day/#days are not core Ruby; presumably ActiveSupport is loaded.

# find all mondays of the year
this_year = Time.now.year
now = Time.now.beginning_of_year
now += 1.day until now.monday?
mondays = []
# Collect the current Monday before stepping to the next one, so the loop
# stops without adding the first Monday of the following year.
while now.year == this_year
  mondays << now
  now += 7.days
end
# find all orders placed on mondays
query = {
  :$or => mondays.map do |day|
    { :placed_at => {
        :$gte => day.beginning_of_day,
        :$lte => day.end_of_day
      }
    }
  end
}
puts query
56. plain dates are good too, but...
(total sold on this year’s mondays)
# find all mondays of the year
now = Time.now.beginning_of_year
now += 1.day until now.monday?
mondays = [ now ]
mondays << now += 7.days while now.year == Time.now.year
# find all orders placed on mondays
query = {
:$or => mondays.map do |day|
{ :placed_at => {
:$gte => day.beginning_of_day,
:$lte => day.end_of_day
}
}
end
}
puts query

$ ruby src/orders_on_mondays.rb
{:$or=>[
{:placed_at=>{
:$gte=>2011-01-03 00:00:00 +0100,
:$lte=>2011-01-03 23:59:59 +0100
}},
{:placed_at=>{
:$gte=>2011-01-10 00:00:00 +0100,
:$lte=>2011-01-10 23:59:59 +0100
}},
{:placed_at=>{
:$gte=>2011-01-17 00:00:00 +0100,
:$lte=>2011-01-17 23:59:59 +0100
}},
...
]}
57. plain dates are good too, but...
(it works but it’s too slooow)
# Query the orders with one $or clause per Monday, each clause being a
# date range covering that whole day.
monday_ranges = mondays.map do |day|
  { :placed_at => { :$gte => day.beginning_of_day, :$lte => day.end_of_day } }
end
db["orders"].find({ :$or => monday_ranges })
58. plain dates are good too, but...
(why it’s too slow)
> db.orders.find({
$or: [
{ "placed_at":{ $gte: new Date(2011,2,3), $lt: new Date(2011,2,4) } },
{ "placed_at":{ $gte: new Date(2011,2,10), $lt: new Date(2011,2,11) } }
]
}).explain()
{
"clauses" : [{
"cursor" : "BtreeCursor placed_at_-1",
"indexBounds" : {
"placed_at" : [[
"Tue Mar 3 2011 00:00:00 GMT+0100 (CET)",
"Wed Mar 4 2011 00:00:00 GMT+0100 (CET)"
]]}
}, {
"cursor" : "BtreeCursor placed_at_-1",
"indexBounds" : {
"placed_at" : [[
"Tue Mar 10 2011 00:00:00 GMT+0100 (CET)",
"Wed Mar 11 2011 00:00:00 GMT+0100 (CET)"
59. with destructured dates
(total sold on mondays this year)
> db.orders.findOne()
{ "_id" : ObjectId("4d88bf1f31f23812de0003fd"),
"placed_at" : [ "2011", "201103", "2011w11", "20110316" ],
"user_id" : ObjectId("4d88bf1f31f23812de0003e9"),
"items" : [
ObjectId("4d88bf1f31f23812de0003da"),
ObjectId("4d88bf1f31f23812de000047"),
ObjectId("4d88bf1f31f23812de000078"),
ObjectId("4d88bf1f31f23812de000068"),
ObjectId("4d88bf1f31f23812de000288")
],
"total" : 3502
}
60. with destructured dates
(total sold on mondays this year)
# Look up the orders placed on Mondays by matching the "YYYYMMDD" day key
# stored in the placed_at array, with a single $in instead of one date
# range per Monday.
now = Time.now.beginning_of_year
now += 1.day until now.monday?
mondays = [ now ]
# The year is checked before each step, so the last Monday appended is the
# first Monday of the following year.
while now.year == Time.now.year
  now += 7.days
  mondays << now
end
monday_keys = mondays.map { |day| day.strftime("%Y%m%d") }
orders = db["orders"].find({
  :placed_at => { :$in => monday_keys }
})
puts orders.explain
61. with destructured dates
(total sold on mondays this year)
now = Time.now.beginning_of_year
now += 1.day until now.monday?
mondays = [ now ]
mondays << now += 7.days while now.year == Time.now.year
orders = db["orders"].find({
:placed_at => {
:$in => mondays.map {|day| day.strftime("%Y%m%d")}
}
})
puts orders.explain

$ ruby src/orders_on_mondays.rb
{ "cursor"=>"BtreeCursor placed_at_-1 multi",
"nscanned"=>744,
"nscannedObjects"=>744,
"n"=>744,
"millis"=>1,
"indexBounds"=>{
"placed_at"=>[
["20120102", "20120102"], ["20111226", "20111226"],
["20111219", "20111219"], ["20111212", "20111212"],
["20111205", "20111205"], ["20111128", "20111128"],
["20111121", "20111121"], ...
]
}
}
83. map/reduce hits per day
(we have raw events)
> db.visit_events.findOne()
{
"_id" : ObjectId("4d89fc6531f2381d2c00000b"),
"url" : "8aa8b68e0b849f70df6dbb3031c6182b",
"user_id" : ObjectId("4d89fc6531f2381d2c000005"),
"at" : "Thu Jan 13 2011 08:00:06 GMT+0100 (CET)"
}
84. map/reduce hits per day
(generate data WITH something like)
# Insert +visits+ synthetic visit documents into db["visit_events"].
#
# Before each insert, +now+ is advanced by an interval sampled from
# BETWEEN_VISITS, so the generated "at" values are monotonically spaced.
# Each document stores the MD5 hex digest of a URL sampled from URLS, the
# :id of a user sampled from USERS, and the advanced timestamp.
# BETWEEN_VISITS, URLS and USERS are constants defined outside this method.
# NOTE(review): Integer#seconds is not core Ruby; presumably ActiveSupport.
#
# Returns +visits+ (the value of Integer#times).
def generate_events(visits, db, now)
  visits.times do
    now += BETWEEN_VISITS.sample.seconds
    event = {
      :url => Digest::MD5.hexdigest(URLS.sample),
      :user_id => USERS.sample[:id],
      :at => now
    }
    db["visit_events"].insert(event)
  end
end
generate_events(10_000, db, now)
87. map/reduce hits per day
(implement format in place)
# JavaScript source of the map step. On every call it (re)assigns
# Date.prototype.format (body elided on the slide), then emits one
# { "hits": 1 } value keyed by the document's url and formatted "at" date
# joined with "-".
MAP = <<-EOF
function() {
Date.prototype.format = function(format) {
...
}
emit([ this.url, this.at.format("Ymd") ].join("-"), { "hits": 1 })
}
EOF
# JavaScript source of the reduce step: adds up the "hits" of all values
# emitted for a key and returns the total in the same { "hits": n } shape.
REDUCE = <<-EOF
function(key, values) {
var hits = 0
for(var index in values) hits += values[index]["hits"]
return { "hits": hits }
}
EOF
88. map/reduce hits per day
(implement format only if needed)
# JavaScript source of the map step. Date.prototype.format (body elided on
# the slide) is assigned only when it is not already defined, then one
# { "hits": 1 } value is emitted, keyed by the document's url and formatted
# "at" date joined with "-".
MAP = <<-EOF
function() {
if (!Date.prototype.format) {
Date.prototype.format = function(format) {
...
}
}
emit([ this.url, this.at.format("Ymd") ].join("-"), { "hits": 1 })
}
EOF
# JavaScript source of the reduce step: adds up the "hits" of all values
# emitted for a key and returns the total in the same { "hits": n } shape.
REDUCE = <<-EOF
function(key, values) {
var hits = 0
for(var index in values) hits += values[index]["hits"]
return { "hits": hits }
}
EOF
89. map/reduce hits per day
(implement format once and for all)
# Save a JavaScript function under the _id "formatDate" in the collection
# named by Mongo::DB::SYSTEM_JS_COLLECTION. The function defines
# Date.prototype.format when missing (body elided on the slide) and returns
# date.format(format).
db[Mongo::DB::SYSTEM_JS_COLLECTION].save(
:_id => "formatDate",
:value => BSON::Code.new(
<<-EOF
function(date, format) {
if (!Date.prototype.format) {
Date.prototype.format = function(format) { ... }
}
return date.format(format)
}
EOF
)
)
# JavaScript source of the map step: calls formatDate instead of defining
# the formatter inline, and emits { "hits": 1 } keyed by url and formatted
# "at" date joined with "-".
MAP = <<-EOF
function() {
emit([ this.url, formatDate(this.at, "Ymd") ].join("-"), {"hits":1})
}
EOF
90. map/reduce hits per day
(implement format once and for all)
# Save a JavaScript function under the _id "load" in the collection named by
# Mongo::DB::SYSTEM_JS_COLLECTION. When called with "date" it defines
# Date.prototype.format if missing (body elided on the slide); it always
# returns true.
db[Mongo::DB::SYSTEM_JS_COLLECTION].save(
:_id => "load",
:value => BSON::Code.new(
<<-EOF
function(module) {
if ((module === "date") && !Date.prototype.format) {
Date.prototype.format = function(format) { ... }
}
return true
}
EOF
)
)
# JavaScript source of the map step: calls load("date") first and, because
# that returns true, goes on to emit { "hits": 1 } keyed by url and
# formatted "at" date joined with "-".
MAP = <<-EOF
function() {
load("date") && emit(
[ this.url, this.at.format("Ymd") ].join("-"),
{ "hits": 1 }
)
}
EOF
91. map/reduce hits per day
(ok, but could be taking too long)
MAP = <<-EOF
function() {
emit([ this.url, this.at.format("Ymd") ].join("-"), { "hits": 1 })
}
EOF
REDUCE = <<-EOF
function(key, values) {
var hits = 0
for(var index in values) hits += values[index]["hits"]
return { "hits": hits }
}
EOF
result = db["visit_events"].map_reduce(
MAP, REDUCE, :out => "visits", :raw => true, :verbose => true
)
puts result.inspect

$ ruby src/incremental_mr.rb
{ "result"=>"visits",
"timeMillis"=>4197,
"timing"=> {
"mapTime"=>3932,
"emitLoop"=>4170,
"total"=>4197
},
"counts"=> {
"input"=>10000,
"emit"=>10000,
"output"=>200
},
"ok"=>1.0
}
92. map/reduce hits per day
(ok, every time we need to start over)
> db.visits.find()
{ "_id" : "019640ff7952425b1b8695605459d223-20110316",
"value" : { "hits" : 47 }
}
{ "_id" : "019640ff7952425b1b8695605459d223-20110317",
"value" : { "hits" : 49 }
}
{ "_id" : "019640ff7952425b1b8695605459d223-20110318",
"value" : { "hits" : 59 }
}
{ "_id" : "019640ff7952425b1b8695605459d223-20110319",
"value" : { "hits" : 37 }
}
93. map/reduce hits per day
(incremental with savepoints)
visit-elements collection
-> map/reduce on last changed documents
-> temporary collection
-> upsert
-> visit collection
94. map/reduce hits per day
(incremental with savepoints)
db.create_collection("visit_events",
:capped => true,
:max => 50_000,
:size => 5_000_000
)

visit-elements collection
-> map/reduce on last changed documents
-> temporary collection
99. map/reduce hits per day
(incremental with savepoints)
# Return the document with _id "savepoint" from db["visits"]. When there is
# none, return a default hash whose "at" is an ObjectId generated from a
# time ten years in the past.
# NOTE(review): 10.years.ago is not core Ruby; presumably ActiveSupport.
def savepoint(db)
  stored = db["visits"].find_one(:_id => "savepoint")
  return stored if stored

  { "at" => BSON::ObjectId.from_time(10.years.ago) }
end
# Return the "at" value of the current savepoint, i.e. the lower bound from
# which documents still have to be processed.
#
# +db+ is forwarded to savepoint, which requires it; calling savepoint
# without an argument raises ArgumentError.
def from_last_updated(db)
  savepoint(db)["at"]
end
# Return the "_id" of the first document of db["visit_events"] when sorted
# by _id in descending order.
# Raises NoMethodError when the sorted result has no first document.
def to_last_inserted(db)
  newest_first = db["visit_events"].find.sort([:_id, Mongo::DESCENDING])
  newest_first.first["_id"]
end