Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added tooling for trie stats #8

Merged
merged 11 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ keccak-hash = "0.10.0"
parking_lot = { version = "0.12.1", features = ["serde"] }
thiserror = "1.0.40"
log = "0.4.17"
num = { version = "0.4.1", optional = true }
num-traits = "0.2.15"
uint = "0.9.5"
rlp = "0.5.2"
Expand All @@ -39,7 +40,7 @@ serde_json = "1.0.96"

[features]
default = ["trie_debug"]
trie_debug = []
trie_debug = ["num"]

[lib]
doc-scrape-examples = true
Expand Down
1 change: 1 addition & 0 deletions src/debug_tools/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@
pub mod common;
pub mod diff;
pub mod query;
pub mod stats;
376 changes: 376 additions & 0 deletions src/debug_tools/stats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
//! Simple tooling to extract stats from tries.
//!
//! This is particularly useful when comparing a "base" trie against a sub-trie
//! (hashed out trie) created from it.

use std::fmt::{self, Display};

use num_traits::ToPrimitive;

use crate::partial_trie::{Node, PartialTrie};

/// Aggregate statistics for a single trie.
///
/// Bundles an optional display name together with the per-node-type counts
/// and the depth figures.
#[derive(Debug, Default)]
pub struct TrieStats {
    /// Optional label. When present, the `Display` output shows it in
    /// parentheses right after the `Trie Stats:` header.
    pub name: Option<String>,
    /// How many nodes of each type the trie contains.
    pub counts: NodeCounts,
    /// Depth figures for the trie.
    pub depth_stats: DepthStats,
}

impl Display for TrieStats {
    /// Renders a header line (with the name in parentheses when one is set),
    /// followed by the node counts and then the depth stats.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("Trie Stats:")?;

        // The name is optional; the header line is terminated either way.
        if let Some(label) = self.name.as_ref() {
            write!(f, " ({})", label)?;
        }
        f.write_str("\n")?;

        write!(f, "Counts:\n{}\n", self.counts)?;
        write!(f, "Depth stats:\n{}\n", self.depth_stats)
    }
}

impl TrieStats {
pub fn compare(&self, other: &Self) -> TrieComparison {
BGluth marked this conversation as resolved.
Show resolved Hide resolved
TrieComparison {
node_comp: self.counts.compare(&other.counts),
depth_comp: self.depth_stats.compare(&other.depth_stats),
}
}
}

/// Total node counts for a trie.
///
/// One counter per node type. All counters start at zero via `Default`.
#[derive(Debug, Default)]
pub struct NodeCounts {
    /// Number of empty nodes.
    pub empty: usize,
    /// Number of hash nodes.
    pub hash: usize,
    /// Number of branch nodes.
    pub branch: usize,
    /// Number of extension nodes.
    pub extension: usize,
    /// Number of leaf nodes.
    pub leaf: usize,
}

impl Display for NodeCounts {
    /// Prints one line per node type, each with its count and its share of
    /// the total node count.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let total = self.total_nodes();

        // Rows are emitted in this fixed order.
        let rows = [
            ("Empty", self.empty),
            ("Hash", self.hash),
            ("Branch", self.branch),
            ("Extension", self.extension),
            ("Leaf", self.leaf),
        ];

        for &(label, count) in rows.iter() {
            Self::write_node_count_stats(f, label, count, total)?;
        }

        Ok(())
    }
}

impl NodeCounts {
fn write_node_count_stats(
f: &mut fmt::Formatter<'_>,
node_t_name: &str,
count: usize,
tot_count: usize,
) -> fmt::Result {
let perc = (count as f32 / tot_count as f32) * 100.0;
writeln!(f, "{}: {} ({:.3}%)", node_t_name, count, perc)
BGluth marked this conversation as resolved.
Show resolved Hide resolved
}
}

impl NodeCounts {
pub fn total_nodes(&self) -> usize {
self.empty + self.total_node_non_empty()
}

pub fn total_node_non_empty(&self) -> usize {
self.branch + self.extension + self.hash_and_leaf_node_count()
}

pub fn hash_and_leaf_node_count(&self) -> usize {
self.hash + self.leaf
}

pub fn compare(&self, other: &Self) -> NodeComparison {
BGluth marked this conversation as resolved.
Show resolved Hide resolved
NodeComparison {
tot_node_rat: RatioStat::new(self.total_nodes(), other.total_nodes()),
non_empty_rat: RatioStat::new(
self.total_node_non_empty(),
other.total_node_non_empty(),
),
empty_rat: RatioStat::new(self.empty, other.empty),
hash_rat: RatioStat::new(self.hash, other.hash),
branch_rat: RatioStat::new(self.branch, other.branch),
extension_rat: RatioStat::new(self.extension, other.extension),
leaf_rat: RatioStat::new(self.leaf, other.leaf),
}
}
}

/// Information on the comparison between two tries.
///
/// Splits the comparison into a node-count part and a depth part.
#[derive(Debug)]
pub struct TrieComparison {
    /// Comparison of the per-node-type counts.
    pub node_comp: NodeComparison,
    /// Comparison of the depth figures.
    pub depth_comp: DepthComparison,
}
BGluth marked this conversation as resolved.
Show resolved Hide resolved

impl Display for TrieComparison {
    /// Prints the node comparison followed by the depth comparison, each
    /// introduced by its own label.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            node_comp,
            depth_comp,
        } = self;

        write!(
            f,
            "Node comparison: {}\nDepth comparison: {}\n",
            node_comp, depth_comp
        )
    }
}

// TODO: Consider computing these values lazily?
/// Side-by-side node counts of two tries, one [`RatioStat`] per figure.
#[derive(Debug)]
pub struct NodeComparison {
    /// Total node counts (empty nodes included).
    pub tot_node_rat: RatioStat<usize>,
    /// Counts of all nodes that are not empty nodes.
    pub non_empty_rat: RatioStat<usize>,

    /// Empty node counts.
    pub empty_rat: RatioStat<usize>,
    /// Hash node counts.
    pub hash_rat: RatioStat<usize>,
    /// Branch node counts.
    pub branch_rat: RatioStat<usize>,
    /// Extension node counts.
    pub extension_rat: RatioStat<usize>,
    /// Leaf node counts.
    pub leaf_rat: RatioStat<usize>,
}
BGluth marked this conversation as resolved.
Show resolved Hide resolved

impl Display for NodeComparison {
    /// Prints one labelled line per ratio: the two totals first, then the
    /// individual node types.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let rows = [
            ("Total nodes", &self.tot_node_rat),
            ("Non-empty", &self.non_empty_rat),
            ("Total empty", &self.empty_rat),
            ("Total hash", &self.hash_rat),
            ("Total branch", &self.branch_rat),
            ("Total extension", &self.extension_rat),
            ("Total leaf", &self.leaf_rat),
        ];

        for &(label, ratio) in rows.iter() {
            writeln!(f, "{}: {}", label, ratio)?;
        }

        Ok(())
    }
}

/// Side-by-side depth figures of two tries, one [`RatioStat`] per figure.
#[derive(Debug)]
pub struct DepthComparison {
    /// The `lowest_depth` values of the two tries.
    pub lowest_depth_rat: RatioStat<usize>,
    /// The average leaf depths of the two tries.
    pub avg_leaf_depth_rat: RatioStat<f32>,
    /// The average hash-node depths of the two tries.
    pub avg_hash_depth_rat: RatioStat<f32>,
}

impl Display for DepthComparison {
    /// Prints the three depth ratios, one labelled line each.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Each step runs only if the previous write succeeded, so the first
        // error is the one returned.
        writeln!(f, "Lowest depth: {}", self.lowest_depth_rat)
            .and_then(|()| writeln!(f, "Avg leaf depth: {}", self.avg_leaf_depth_rat))
            .and_then(|()| writeln!(f, "Avg hash depth: {}", self.avg_hash_depth_rat))
    }
}

/// Type to hold (and compare) a given variable from two different tries.
///
/// The `Display` output shows both values followed by `a` as a percentage of
/// `b`.
#[derive(Debug)]
pub struct RatioStat<T> {
    /// Value taken from the first trie.
    pub a: T,
    /// Value taken from the second trie.
    pub b: T,
}

impl<T: Display + ToPrimitive> Display for RatioStat<T> {
    /// Formats as `"<a> / <b> (<a over b as a percentage>%)"`, requesting
    /// three decimals for every value.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (lhs, rhs) = (&self.a, &self.b);
        let perc = self.get_a_over_b_perc();

        write!(f, "{:.3} / {:.3} ({:.3}%)", lhs, rhs, perc)
    }
}

impl<T: ToPrimitive> RatioStat<T> {
    /// `new` doesn't have any logic, but this will reduce a lot of line lengths
    /// since this is called so many times.
    fn new(a: T, b: T) -> Self {
        Self { a, b }
    }

    /// Returns `a` as a percentage of `b`.
    ///
    /// The division is a plain `f32` division, so a zero `b` yields an
    /// infinite or NaN result rather than an error. If either value cannot be
    /// converted to `f32`, NaN is returned instead of panicking, because this
    /// is called from `Display::fmt`.
    fn get_a_over_b_perc(&self) -> f32 {
        match (self.a.to_f32(), self.b.to_f32()) {
            (Some(a), Some(b)) => (a / b) * 100.0,
            // A failed conversion has no meaningful ratio to report.
            _ => f32::NAN,
        }
    }
}

/// "Raw" state that is mutated as we traverse down the trie. Is processed into
/// a more useful format later on.
#[derive(Debug, Default)]
struct CurrTrackingState {
    /// Running per-type node counts.
    counts: NodeCounts,

    // The "*_sum" variables are just accumulators that we process later to get average depths.
    /// Sum of the depths of every leaf node visited so far.
    leaf_depth_sum: u64,
    /// Sum of the depths of every hash node visited so far.
    hash_depth_sum: u64,
    /// Largest depth at which a hash or leaf node has been seen so far.
    lowest_depth: usize,
}

impl CurrTrackingState {
    /// Raises `lowest_depth` to `curr_depth` when `curr_depth` is the larger
    /// of the two; otherwise leaves it untouched.
    fn update_lowest_depth_if_larger(&mut self, curr_depth: usize) {
        self.lowest_depth = self.lowest_depth.max(curr_depth);
    }
}

/// Depth in terms of node depth (not key length).
#[derive(Clone, Debug, Default)]
pub struct DepthStats {
    /// Largest depth at which a hash or leaf node was found.
    pub lowest_depth: usize,
    /// Mean depth of the leaf nodes.
    pub avg_leaf_depth: f32,
    /// Mean depth of the hash nodes.
    pub avg_hash_depth: f32,
}

impl Display for DepthStats {
    /// Prints the lowest depth and both averages (three decimals each), one
    /// line per figure.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            lowest_depth,
            avg_leaf_depth,
            avg_hash_depth,
        } = self;

        writeln!(f, "Lowest depth: {}", lowest_depth)?;
        writeln!(f, "Average leaf depth: {:.3}", avg_leaf_depth)?;
        writeln!(f, "Average hash depth: {:.3}", avg_hash_depth)?;

        Ok(())
    }
}

impl DepthStats {
    /// Pairs every depth figure of `self` with the matching figure of `other`.
    ///
    /// In each resulting [`RatioStat`], the first value comes from `self` and
    /// the second from `other`.
    ///
    /// Public so that it lines up with the `compare` methods on the other
    /// public stats types in this module.
    pub fn compare(&self, other: &Self) -> DepthComparison {
        DepthComparison {
            lowest_depth_rat: RatioStat::new(self.lowest_depth, other.lowest_depth),
            avg_leaf_depth_rat: RatioStat::new(self.avg_leaf_depth, other.avg_leaf_depth),
            avg_hash_depth_rat: RatioStat::new(self.avg_hash_depth, other.avg_hash_depth),
        }
    }
}

/// Collects stats for `trie`; the returned [`TrieStats`] carries no name.
pub fn get_trie_stats<T>(trie: &T) -> TrieStats
where
    T: PartialTrie,
{
    let unnamed = None;
    get_trie_stats_common(trie, unnamed)
}

/// Collects stats for `trie` and labels the returned [`TrieStats`] with
/// `name`.
pub fn get_trie_stats_with_name<T>(trie: &T, name: String) -> TrieStats
where
    T: PartialTrie,
{
    let label = Some(name);
    get_trie_stats_common(trie, label)
}

/// Shared implementation behind the public `get_trie_stats*` entry points.
///
/// Walks the whole trie starting at depth 0, then turns the raw accumulators
/// into averages. `name` is stored in the result unchanged.
fn get_trie_stats_common<T: PartialTrie>(trie: &T, name: Option<String>) -> TrieStats {
    /// Mean depth over `n_nodes` nodes, or 0.0 when there are no such nodes
    /// (a plain division would yield NaN for e.g. a trie without hash nodes).
    fn avg_depth(depth_sum: u64, n_nodes: usize) -> f32 {
        if n_nodes == 0 {
            0.0
        } else {
            // Divide in f64 so that large sums are not rounded before the
            // division; only the final quotient is narrowed to f32.
            (depth_sum as f64 / n_nodes as f64) as f32
        }
    }

    let mut state = CurrTrackingState::default();

    get_trie_stats_rec(trie, &mut state, 0);

    let depth_stats = DepthStats {
        lowest_depth: state.lowest_depth,
        avg_leaf_depth: avg_depth(state.leaf_depth_sum, state.counts.leaf),
        avg_hash_depth: avg_depth(state.hash_depth_sum, state.counts.hash),
    };

    TrieStats {
        name,
        counts: state.counts,
        depth_stats,
    }
}

fn get_trie_stats_rec<T: PartialTrie>(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

returning the state rather than using a mut state as an arg would be more rusty

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The only downside is then we would need to do more computation when merging together multiple CurrTrackingState together, which I guess doesn't really matter too much.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will revisit this afterwards!

node: &Node<T>,
state: &mut CurrTrackingState,
curr_depth: usize,
) {
match node {
Node::Empty => {
state.counts.empty += 1;
}
Node::Hash(_) => {
state.counts.hash += 1;
state.hash_depth_sum += curr_depth as u64;
state.update_lowest_depth_if_larger(curr_depth);
}
Node::Branch { children, value: _ } => {
state.counts.branch += 1;

for c in children {
get_trie_stats_rec(c, state, curr_depth + 1);
}
}
Node::Extension { nibbles: _, child } => {
state.counts.extension += 1;
get_trie_stats_rec(child, state, curr_depth + 1);
}
Node::Leaf {
nibbles: _,
value: _,
} => {
state.counts.leaf += 1;
state.leaf_depth_sum += curr_depth as u64;
state.update_lowest_depth_if_larger(curr_depth);
}
}
}

/// Tests for the trie stats tooling: node counts on a hand-made trie and on
/// large randomly generated tries.
#[cfg(test)]
mod tests {
    use super::get_trie_stats;
    use crate::{
        partial_trie::{HashedPartialTrie, PartialTrie},
        testing_utils::{
            generate_n_random_fixed_trie_hash_entries, generate_n_random_fixed_trie_value_entries,
            handmade_trie_1,
        },
    };

    /// Number of entries inserted by the "massive" trie tests.
    const MASSIVE_TRIE_SIZE: usize = 100_000;

    /// Checks every node counter against the known shape of `handmade_trie_1`.
    #[test]
    fn hand_made_trie_has_correct_node_stats() {
        let (trie, _) = handmade_trie_1();
        let stats = get_trie_stats(&trie);

        assert_eq!(stats.counts.leaf, 4);
        assert_eq!(stats.counts.hash, 0);
        assert_eq!(stats.counts.branch, 4);
        assert_eq!(stats.counts.extension, 2);

        // empty = (n_branch * 16) - n_leaf - (n_branch - 1)
        //       = (4 * 16) - 4 - 3 = 57
        // (Assumes 16 child slots per branch, which is what makes the
        // arithmetic match the expected value below.)
        assert_eq!(stats.counts.empty, 57);
    }

    // TODO: Low-priority. Finish later.
    #[test]
    #[ignore]
    fn perfectly_balanced_trie_has_correct_node_stats() {
        todo!()
    }

    /// A trie built only from value entries reports that many leaf nodes.
    #[test]
    fn massive_leaf_trie_has_correct_leaf_node_stats() {
        create_trie_and_stats_from_entries_and_assert(MASSIVE_TRIE_SIZE, 0, 9522);
    }

    /// A trie built only from hash entries reports that many hash nodes.
    #[test]
    fn massive_hash_trie_has_correct_hash_node_stats() {
        create_trie_and_stats_from_entries_and_assert(0, MASSIVE_TRIE_SIZE, 9855);
    }

    /// A half-and-half trie reports the right number of both node types.
    #[test]
    fn massive_mixed_trie_has_correct_hash_node_stats() {
        create_trie_and_stats_from_entries_and_assert(
            MASSIVE_TRIE_SIZE / 2,
            MASSIVE_TRIE_SIZE / 2,
            1992,
        );
    }

    /// Builds a trie from `n_leaf_nodes` generated value entries and
    /// `n_hash_nodes` generated hash entries, then asserts that the leaf and
    /// hash counters equal those numbers.
    ///
    /// `seed` is passed to the value-entry generator and `seed + 1` to the
    /// hash-entry generator.
    fn create_trie_and_stats_from_entries_and_assert(
        n_leaf_nodes: usize,
        n_hash_nodes: usize,
        seed: u64,
    ) {
        let val_entries = generate_n_random_fixed_trie_value_entries(n_leaf_nodes, seed);
        let hash_entries = generate_n_random_fixed_trie_hash_entries(n_hash_nodes, seed + 1);

        let mut trie = HashedPartialTrie::default();
        trie.extend(val_entries);
        trie.extend(hash_entries);

        let stats = get_trie_stats(&trie);

        assert_eq!(stats.counts.leaf, n_leaf_nodes);
        assert_eq!(stats.counts.hash, n_hash_nodes);
    }

    // TODO: Low-priority. Finish later.
    #[test]
    #[ignore]
    fn depth_stats_work() {
        todo!()
    }
}
Loading