Skip to content

Commit

Permalink
add metrics for rdb lru cache (#2586)
Browse files Browse the repository at this point in the history
This is a first step towards measuring the efficiency of the LRU caches
over time - metrics can be collected during import or when running
regularly.

Since `nim-metrics` carries some overhead for its default way of
reporting metrics, this PR implements a custom collector over atomic
counters, given that this is one of the hottest spots in the block
processing pipeline.

Using a compile-time flag, the same metrics can be printed on exit which
is useful when comparing different strategies for caching - here's a
recent run over blocks 16000001-1616384 - this is a good candidate to
expose in a better way in the future, maybe:

```
   state    vtype       miss        hit      total hitrate
 Account     Leaf    4909417    4466215    9375632  47.64%
 Account   Branch   20742574   72015123   92757697  77.64%
   World     Leaf     940483    1140946    2081429  54.82%
   World   Branch    8224151  131496580  139720731  94.11%
     all      all   34816625  209118864  243935489  85.73%
```
  • Loading branch information
arnetheduck committed Sep 2, 2024
1 parent ef1bab0 commit 35cc78c
Showing 1 changed file with 125 additions and 3 deletions.
128 changes: 125 additions & 3 deletions nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ import
stew/keyed_queue,
../../[aristo_blobify, aristo_desc],
../init_common,
./rdb_desc
./rdb_desc,
metrics,
std/concurrency/atomics

const
extraTraceMessages = false
Expand All @@ -33,6 +35,73 @@ when extraTraceMessages:
logScope:
topics = "aristo-rocksdb"

type
  # Collector types deriving from the `Counter` metric type; each one has its
  # own `collect` method in this module that reports the LRU statistics.
  RdbVtxLruCounter = ref object of Counter
  RdbKeyLruCounter = ref object of Counter

  # Pair of atomic counters indexed by the outcome of a cache lookup:
  # `[false]` counts misses, `[true]` counts hits.
  LruCounter = array[bool, Atomic[uint64]]

  # Classification of a lookup by the root of the trie it targets - see the
  # `to` template for how a `RootedVertexID` is mapped onto these values.
  StateType = enum
    Account
    World

var
  # Hit/miss counters for LRU cache - global so as to integrate easily with
  # nim-metrics and `uint64` to ensure that increasing them is fast - collection
  # happens from a separate thread.
  # TODO maybe turn this into more general framework for LRU reporting since
  #      we have lots of caches of this sort
  #
  # Vertex lookups are indexed as `[state][vertex type][hit]`, key lookups as
  # `[state][hit]`.
  rdbVtxLruStats: array[StateType, array[VertexType, LruCounter]]
  rdbKeyLruStats: array[StateType, LruCounter]

var
  # Collector instances for the vertex and key LRU statistics. Nothing in this
  # module reads them after construction, hence `{.used.}` to silence the
  # unused-symbol hint.
  # NOTE(review): `newCollector` presumably registers the collector with the
  # nim-metrics registry so that `collect` gets called - confirm against the
  # nim-metrics documentation.
  rdbVtxLruStatsMetric {.used.} = RdbVtxLruCounter.newCollector(
    "aristo_rdb_vtx_lru_total",
    "Vertex LRU lookup (hit/miss, world/account, branch/leaf)",
    labels = ["state", "vtype", "hit"],
  )
  rdbKeyLruStatsMetric {.used.} = RdbKeyLruCounter.newCollector(
    "aristo_rdb_key_lru_total", "HashKey LRU lookup", labels = ["state", "hit"]
  )

template to(v: RootedVertexID, T: type StateType): StateType =
  ## Classify a rooted vertex id for the statistics tables: a root of
  ## `VertexID(1)` is counted as `World`, any other root as `Account`.
  if v.root == VertexID(1):
    StateType.World
  else:
    StateType.Account

template inc(v: var LruCounter, hit: bool) =
  ## Bump the hit (`hit == true`) or miss (`hit == false`) counter by one.
  ## Relaxed memory ordering is used and the previous value is dropped.
  discard fetchAdd(v[hit], 1, moRelaxed)

template get(v: LruCounter, hit: bool): uint64 =
  ## Read the current hit (`hit == true`) or miss (`hit == false`) count
  ## using relaxed memory ordering.
  load(v[hit], moRelaxed)

method collect*(collector: RdbVtxLruCounter, output: MetricHandler) =
  ## Report the vertex LRU counters: one `aristo_rdb_vtx_lru_total` sample is
  ## passed to `output` for every (state, vertex type, hit) combination. The
  ## `hit` label value is `"0"` for misses and `"1"` for hits.
  const labelNames = ["state", "vtype", "hit"]
  let timestamp = collector.now()

  # We don't care about synchronization between each type of metric or between
  # the metrics thread and others since small differences like this don't matter
  for state in StateType:
    for vtype in VertexType:
      for hit in [false, true]:
        let lookups = rdbVtxLruStats[state][vtype].get(hit)
        output(
          name = "aristo_rdb_vtx_lru_total",
          value = float64(lookups),
          labels = labelNames,
          labelValues = [$state, $vtype, $ord(hit)],
          timestamp = timestamp,
        )

method collect*(collector: RdbKeyLruCounter, output: MetricHandler) =
  ## Report the key LRU counters: one `aristo_rdb_key_lru_total` sample is
  ## passed to `output` for every (state, hit) combination. The `hit` label
  ## value is `"0"` for misses and `"1"` for hits.
  const labelNames = ["state", "hit"]
  let timestamp = collector.now()

  for state in StateType:
    for hit in [false, true]:
      let lookups = rdbKeyLruStats[state].get(hit)
      output(
        name = "aristo_rdb_key_lru_total",
        value = float64(lookups),
        labels = labelNames,
        labelValues = [$state, $ord(hit)],
        timestamp = timestamp,
      )

# ------------------------------------------------------------------------------
# Public functions
# ------------------------------------------------------------------------------
Expand Down Expand Up @@ -60,8 +129,11 @@ proc getKey*(
# Try LRU cache first
var rc = rdb.rdKeyLru.lruFetch(rvid.vid)
if rc.isOK:
rdbKeyLruStats[rvid.to(StateType)].inc(true)
return ok(move(rc.value))

rdbKeyLruStats[rvid.to(StateType)].inc(false)

# Otherwise fetch from backend database
# A threadvar is used to avoid allocating an environment for onData
var res{.threadvar.}: Opt[HashKey]
Expand Down Expand Up @@ -90,6 +162,7 @@ proc getVtx*(
# Try LRU cache first
var rc = rdb.rdVtxLru.lruFetch(rvid.vid)
if rc.isOK:
rdbVtxLruStats[rvid.to(StateType)][rc.value().vType].inc(true)
return ok(move(rc.value))

# Otherwise fetch from backend database
Expand All @@ -105,13 +178,62 @@ proc getVtx*(
return err((errSym,error))

if not gotData:
res.ok(VertexRef(nil))
elif res.isErr():
# As a hack, we count missing data as leaf nodes
rdbVtxLruStats[rvid.to(StateType)][VertexType.Leaf].inc(false)
return ok(VertexRef(nil))

if res.isErr():
return err((res.error(), "Parsing failed")) # Parsing failed

rdbVtxLruStats[rvid.to(StateType)][res.value().vType].inc(false)

# Update cache and return
ok rdb.rdVtxLru.lruAppend(rvid.vid, res.value(), RdVtxLruMaxSize)

# ------------------------------------------------------------------------------
# End
# ------------------------------------------------------------------------------

when defined(printStatsAtExit):
  # Useful hack for printing exact metrics to compare runs with different
  # settings - compiled in only when the `printStatsAtExit` symbol is defined.
  # On exit, one table is printed for the vertex LRU and one for the key LRU,
  # each with a row per counter bucket followed by an `all` summary row.
  import std/[exitprocs, strformat]

  func hitRatePct(hit, miss: uint64): float64 =
    # Share of lookups that were hits, in percent. A bucket without any
    # lookups reports 0 instead of the NaN that 0/0 would produce.
    let total = hit + miss
    if total == 0:
      0.0
    else:
      # Convert before scaling so that the multiplication cannot wrap around
      # in `uint64`
      float64(hit) * 100.0 / float64(total)

  addExitProc(
    proc() =
      block vtx:
        var misses, hits: uint64
        echo "vtxLru(", RdVtxLruMaxSize, ")"
        # Header columns are padded to the widths used by the rows below
        echo "   state    vtype       miss        hit      total hitrate"
        for state in StateType:
          for vtype in VertexType:
            let
              miss = rdbVtxLruStats[state][vtype].get(false)
              hit = rdbVtxLruStats[state][vtype].get(true)
              hitRate = hitRatePct(hit, miss)
            misses += miss
            hits += hit
            echo &"{state:>8} {vtype:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%"
        let hitRate = hitRatePct(hits, misses)
        echo &"     all      all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%"

      block key:
        var misses, hits: uint64
        echo "keyLru(", RdKeyLruMaxSize, ") "

        echo "   state       miss        hit      total hitrate"

        for state in StateType:
          let
            miss = rdbKeyLruStats[state].get(false)
            hit = rdbKeyLruStats[state].get(true)
            hitRate = hitRatePct(hit, miss)
          misses += miss
          hits += hit

          # Same width as the vertex table so the column fills the 7-character
          # `hitrate` header
          echo &"{state:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%"

        let hitRate = hitRatePct(hits, misses)
        echo &"     all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%"
  )

0 comments on commit 35cc78c

Please sign in to comment.