add metrics for rdb lru cache (#2586)

This is a first step towards measuring the efficiency of the LRU caches over time - metrics can be collected during import or when running regulary. Since `nim-metrics` carries some overhead for its default way of reporting metrics, this PR implements a custom collector over atomic counters, given that this is one of the hottest spots in the block processing pipeline. Using a compile-time flag, the same metrics can be printed on exit which is useful when comparing different strategies for caching - here's a recent run over blocks 16000001-1616384 - this is a good candidate to expose in a better way in the future, maybe: ``` state vtype miss hit total hitrate Account Leaf 4909417 4466215 9375632 47.64% Account Branch 20742574 72015123 92757697 77.64% World Leaf 940483 1140946 2081429 54.82% World Branch 8224151 131496580 139720731 94.11% all all 34816625 209118864 243935489 85.73% ```
status-im · Sep 2, 2024 · 35cc78c · 35cc78c
1 parent ef1bab0
commit 35cc78c
Showing 1 changed file with 125 additions and 3 deletions.
diff --git a/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim b/nimbus/db/aristo/aristo_init/rocks_db/rdb_get.nim
@@ -20,7 +20,9 @@ import
   stew/keyed_queue,
   ../../[aristo_blobify, aristo_desc],
   ../init_common,
-  ./rdb_desc
+  ./rdb_desc,
+  metrics,
+  std/concurrency/atomics
 
 const
   extraTraceMessages = false
@@ -33,6 +35,73 @@ when extraTraceMessages:
   logScope:
     topics = "aristo-rocksdb"
 
+type
+  RdbVtxLruCounter = ref object of Counter
+  RdbKeyLruCounter = ref object of Counter
+
+  LruCounter = array[bool, Atomic[uint64]]
+
+  StateType = enum
+    Account
+    World
+
+var
+  # Hit/miss counters for LRU cache - global so as to integrate easily with
+  # nim-metrics and `uint64` to ensure that increasing them is fast - collection
+  # happens from a separate thread.
+  # TODO maybe turn this into more general framework for LRU reporting since
+  #      we have lots of caches of this sort
+  rdbVtxLruStats: array[StateType, array[VertexType, LruCounter]]
+  rdbKeyLruStats: array[StateType, LruCounter]
+
+var
+  rdbVtxLruStatsMetric {.used.} = RdbVtxLruCounter.newCollector(
+    "aristo_rdb_vtx_lru_total",
+    "Vertex LRU lookup (hit/miss, world/account, branch/leaf)",
+    labels = ["state", "vtype", "hit"],
+  )
+  rdbKeyLruStatsMetric {.used.} = RdbKeyLruCounter.newCollector(
+    "aristo_rdb_key_lru_total", "HashKey LRU lookup", labels = ["state", "hit"]
+  )
+
+template to(v: RootedVertexID, T: type StateType): StateType =
+  if v.root == VertexID(1): StateType.World else: StateType.Account
+
+template inc(v: var LruCounter, hit: bool) =
+  discard v[hit].fetchAdd(1, moRelaxed)
+
+template get(v: LruCounter, hit: bool): uint64 =
+  v[hit].load(moRelaxed)
+
+method collect*(collector: RdbVtxLruCounter, output: MetricHandler) =
+  let timestamp = collector.now()
+
+  # We don't care about synchronization between each type of metric or between
+  # the metrics thread and others since small differences like this don't matter
+  for state in StateType:
+    for vtype in VertexType:
+      for hit in [false, true]:
+        output(
+          name = "aristo_rdb_vtx_lru_total",
+          value = float64(rdbVtxLruStats[state][vtype].get(hit)),
+          labels = ["state", "vtype", "hit"],
+          labelValues = [$state, $vtype, $ord(hit)],
+          timestamp = timestamp,
+        )
+
+method collect*(collector: RdbKeyLruCounter, output: MetricHandler) =
+  let timestamp = collector.now()
+
+  for state in StateType:
+    for hit in [false, true]:
+      output(
+        name = "aristo_rdb_key_lru_total",
+        value = float64(rdbKeyLruStats[state].get(hit)),
+        labels = ["state", "hit"],
+        labelValues = [$state, $ord(hit)],
+        timestamp = timestamp,
+      )
+
 # ------------------------------------------------------------------------------
 # Public functions
 # ------------------------------------------------------------------------------
@@ -60,8 +129,11 @@ proc getKey*(
   # Try LRU cache first
   var rc = rdb.rdKeyLru.lruFetch(rvid.vid)
   if rc.isOK:
+    rdbKeyLruStats[rvid.to(StateType)].inc(true)
     return ok(move(rc.value))
 
+  rdbKeyLruStats[rvid.to(StateType)].inc(false)
+
   # Otherwise fetch from backend database
   # A threadvar is used to avoid allocating an environment for onData
   var res{.threadvar.}: Opt[HashKey]
@@ -90,6 +162,7 @@ proc getVtx*(
   # Try LRU cache first
   var rc = rdb.rdVtxLru.lruFetch(rvid.vid)
   if rc.isOK:
+    rdbVtxLruStats[rvid.to(StateType)][rc.value().vType].inc(true)
     return ok(move(rc.value))
 
   # Otherwise fetch from backend database
@@ -105,13 +178,62 @@ proc getVtx*(
     return err((errSym,error))
 
   if not gotData:
-    res.ok(VertexRef(nil))
-  elif res.isErr():
+    # As a hack, we count missing data as leaf nodes
+    rdbVtxLruStats[rvid.to(StateType)][VertexType.Leaf].inc(false)
+    return ok(VertexRef(nil))
+
+  if res.isErr():
     return err((res.error(), "Parsing failed")) # Parsing failed
 
+  rdbVtxLruStats[rvid.to(StateType)][res.value().vType].inc(false)
+
   # Update cache and return
   ok rdb.rdVtxLru.lruAppend(rvid.vid, res.value(), RdVtxLruMaxSize)
 
 # ------------------------------------------------------------------------------
 # End
 # ------------------------------------------------------------------------------
+
+when defined(printStatsAtExit):
+  # Useful hack for printing exact metrics to compare runs with different
+  # settings
+  import std/[exitprocs, strformat]
+  addExitProc(
+    proc() =
+      block vtx:
+        var misses, hits: uint64
+        echo "vtxLru(", RdVtxLruMaxSize, ")"
+        echo "   state    vtype       miss        hit      total hitrate"
+        for state in StateType:
+          for vtype in VertexType:
+            let
+              (miss, hit) = (
+                rdbVtxLruStats[state][vtype].get(false),
+                rdbVtxLruStats[state][vtype].get(true),
+              )
+              hitRate = float64(hit * 100) / (float64(hit + miss))
+            misses += miss
+            hits += hit
+            echo &"{state:>8} {vtype:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>6.2f}%"
+        let hitRate = float64(hits * 100) / (float64(hits + misses))
+        echo &"     all      all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>6.2f}%"
+
+      block key:
+        var misses, hits: uint64
+        echo "keyLru(", RdKeyLruMaxSize, ") "
+
+        echo "   state       miss        hit      total hitrate"
+
+        for state in StateType:
+          let
+            (miss, hit) =
+              (rdbKeyLruStats[state].get(false), rdbKeyLruStats[state].get(true))
+            hitRate = float64(hit * 100) / (float64(hit + miss))
+          misses += miss
+          hits += hit
+
+          echo &"{state:>8} {miss:>10} {hit:>10} {miss+hit:>10} {hitRate:>5.2f}%"
+
+        let hitRate = float64(hits * 100) / (float64(hits + misses))
+        echo &"     all {misses:>10} {hits:>10} {misses+hits:>10} {hitRate:>5.2f}%"
+  )