diff --git a/node/actors/bft/src/chonky_bft/commit.rs b/node/actors/bft/src/chonky_bft/commit.rs index afb6deb4..48ec1a94 100644 --- a/node/actors/bft/src/chonky_bft/commit.rs +++ b/node/actors/bft/src/chonky_bft/commit.rs @@ -146,12 +146,11 @@ impl StateMachine { .await .wrap("process_commit_qc()")?; - // Metrics. - let now = ctx.now(); + // Metrics. We observe the latency of committing to a block measured + // from the start of this view. metrics::METRICS - .leader_commit_phase_latency - .observe_latency(now - self.phase_start); - self.phase_start = now; + .commit_latency + .observe_latency(ctx.now() - self.view_start); // Start a new view. self.start_new_view(ctx, message.view.number.next()).await?; diff --git a/node/actors/bft/src/chonky_bft/mod.rs b/node/actors/bft/src/chonky_bft/mod.rs index 548c1421..6eadd5d1 100644 --- a/node/actors/bft/src/chonky_bft/mod.rs +++ b/node/actors/bft/src/chonky_bft/mod.rs @@ -65,8 +65,8 @@ pub(crate) struct StateMachine { /// The deadline to receive an input message before timing out. pub(crate) timeout_deadline: time::Deadline, - /// Time when the current phase has started. Used for metrics. - pub(crate) phase_start: time::Instant, + /// Time when the current view phase has started. Used for metrics. + pub(crate) view_start: time::Instant, } impl StateMachine { @@ -116,7 +116,7 @@ impl StateMachine { timeout_views_cache: BTreeMap::new(), timeout_qcs_cache: BTreeMap::new(), timeout_deadline: time::Deadline::Finite(ctx.now() + Self::TIMEOUT_DURATION), - phase_start: ctx.now(), + view_start: ctx.now(), }; Ok((this, send)) @@ -126,6 +126,8 @@ impl StateMachine { /// This is the main entry point for the state machine, /// potentially triggering state modifications and message sending to the executor. pub(crate) async fn run(mut self, ctx: &ctx::Ctx) -> ctx::Result<()> { + self.view_start = ctx.now(); + // If this is the first view, we immediately timeout. 
This will force the replicas // to synchronize right at the beginning and will provide a justification for the // next view. This is necessary because the first view is not justified by any @@ -179,7 +181,7 @@ impl StateMachine { Err(()) } }; - metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res) + metrics::ConsensusMsgLabel::LeaderProposal.with_result(&res) } ConsensusMsg::ReplicaCommit(_) => { let res = match self @@ -206,7 +208,7 @@ impl StateMachine { Err(()) } }; - metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res) + metrics::ConsensusMsgLabel::ReplicaCommit.with_result(&res) } ConsensusMsg::ReplicaTimeout(_) => { let res = match self @@ -233,7 +235,7 @@ impl StateMachine { Err(()) } }; - metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res) + metrics::ConsensusMsgLabel::ReplicaTimeout.with_result(&res) } ConsensusMsg::ReplicaNewView(_) => { let res = match self @@ -260,10 +262,10 @@ impl StateMachine { Err(()) } }; - metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res) + metrics::ConsensusMsgLabel::ReplicaNewView.with_result(&res) } }; - metrics::METRICS.replica_processing_latency[&label].observe_latency(ctx.now() - now); + metrics::METRICS.message_processing_latency[&label].observe_latency(ctx.now() - now); // Notify network actor that the message has been processed. // Ignore sending error. 
diff --git a/node/actors/bft/src/chonky_bft/new_view.rs b/node/actors/bft/src/chonky_bft/new_view.rs index 5d85af46..a4804f4b 100644 --- a/node/actors/bft/src/chonky_bft/new_view.rs +++ b/node/actors/bft/src/chonky_bft/new_view.rs @@ -2,7 +2,7 @@ use std::cmp::max; use super::StateMachine; use crate::metrics; -use zksync_concurrency::{ctx, error::Wrap, time}; +use zksync_concurrency::{ctx, error::Wrap, metrics::LatencyHistogramExt as _, time}; use zksync_consensus_network::io::ConsensusInputMessage; use zksync_consensus_roles::validator; @@ -141,9 +141,14 @@ impl StateMachine { }; self.outbound_pipe.send(output_message.into()); - // Log the event. + // Log the event and update the metrics. tracing::info!("Starting view {}", self.view_number); metrics::METRICS.replica_view_number.set(self.view_number.0); + let now = ctx.now(); + metrics::METRICS + .view_latency + .observe_latency(now - self.view_start); + self.view_start = now; // Reset the timeout. self.timeout_deadline = time::Deadline::Finite(ctx.now() + Self::TIMEOUT_DURATION); diff --git a/node/actors/bft/src/chonky_bft/proposal.rs b/node/actors/bft/src/chonky_bft/proposal.rs index f757f3e7..2b9e0933 100644 --- a/node/actors/bft/src/chonky_bft/proposal.rs +++ b/node/actors/bft/src/chonky_bft/proposal.rs @@ -1,6 +1,8 @@ +use crate::metrics; + use super::StateMachine; use std::cmp::max; -use zksync_concurrency::{ctx, error::Wrap}; +use zksync_concurrency::{ctx, error::Wrap, metrics::LatencyHistogramExt as _}; use zksync_consensus_network::io::ConsensusInputMessage; use zksync_consensus_roles::validator::{self, BlockHeader, BlockNumber}; @@ -188,6 +190,12 @@ impl StateMachine { // ----------- All checks finished. Now we process the message. -------------- + // Metrics. We observe the latency of receiving a proposal measured + // from the start of this view. + metrics::METRICS + .proposal_latency + .observe_latency(ctx.now() - self.view_start); + // Create our commit vote. 
let commit_vote = validator::ReplicaCommit { view: message.view(), diff --git a/node/actors/bft/src/chonky_bft/proposer.rs b/node/actors/bft/src/chonky_bft/proposer.rs index e460ba6e..3f6eb823 100644 --- a/node/actors/bft/src/chonky_bft/proposer.rs +++ b/node/actors/bft/src/chonky_bft/proposer.rs @@ -89,7 +89,7 @@ pub(crate) async fn create_proposal( } metrics::METRICS - .leader_proposal_payload_size + .proposal_payload_size .observe(payload.0.len()); Some(payload) diff --git a/node/actors/bft/src/chonky_bft/timeout.rs b/node/actors/bft/src/chonky_bft/timeout.rs index d01a9d2b..0bcf34a9 100644 --- a/node/actors/bft/src/chonky_bft/timeout.rs +++ b/node/actors/bft/src/chonky_bft/timeout.rs @@ -146,9 +146,3 @@ impl StateMachine { - // Metrics. - let now = ctx.now(); - metrics::METRICS - .leader_commit_phase_latency - .observe_latency(now - self.phase_start); - self.phase_start = now; // Start a new view. self.start_new_view(ctx, message.view.number.next()).await?; diff --git a/node/actors/bft/src/metrics.rs b/node/actors/bft/src/metrics.rs index ea23c711..248b57c9 100644 --- a/node/actors/bft/src/metrics.rs +++ b/node/actors/bft/src/metrics.rs @@ -12,14 +12,14 @@ const PAYLOAD_SIZE_BUCKETS: Buckets = Buckets::exponential( #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EncodeLabelValue)] #[metrics(rename_all = "snake_case")] pub(crate) enum ConsensusMsgLabel { - /// Label for a `LeaderPrepare` message. - LeaderPrepare, - /// Label for a `LeaderCommit` message. - LeaderCommit, - /// Label for a `ReplicaPrepare` message. - ReplicaPrepare, + /// Label for a `LeaderProposal` message. + LeaderProposal, /// Label for a `ReplicaCommit` message. ReplicaCommit, + /// Label for a `ReplicaTimeout` message. + ReplicaTimeout, + /// Label for a `ReplicaNewView` message. 
+ ReplicaNewView, } impl ConsensusMsgLabel { @@ -53,26 +53,28 @@ pub(crate) struct ProcessingLatencyLabels { #[derive(Debug, Metrics)] #[metrics(prefix = "consensus")] pub(crate) struct ConsensusMetrics { + /// Number of the current view of the replica. + pub(crate) replica_view_number: Gauge, + /// Number of the last finalized block observed by the node. + pub(crate) finalized_block_number: Gauge, /// Size of the proposed payload in bytes. #[metrics(buckets = PAYLOAD_SIZE_BUCKETS, unit = Unit::Bytes)] - pub(crate) leader_proposal_payload_size: Histogram, - /// Latency of the commit phase observed by the leader. + pub(crate) proposal_payload_size: Histogram, + /// Latency of receiving a proposal as observed by the replica. Measures from + /// the start of the view until we have a verified proposal. #[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)] - pub(crate) leader_commit_phase_latency: Histogram, - /// Currently set timeout after which replica will proceed to the next view. - #[metrics(unit = Unit::Seconds)] - pub(crate) replica_view_timeout: Gauge, + pub(crate) proposal_latency: Histogram, + /// Latency of committing to a block as observed by the replica. Measures from + /// the start of the view until we send a commit vote. + #[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)] + pub(crate) commit_latency: Histogram, + /// Latency of a single view as observed by the replica. Measures from + /// the start of the view until the start of the next. + #[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)] + pub(crate) view_latency: Histogram, /// Latency of processing messages by the replicas. #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] - pub(crate) replica_processing_latency: Family>, - /// Latency of processing messages by the leader. 
- #[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)] - pub(crate) leader_processing_latency: Family>, - /// Number of the last finalized block observed by the node. - pub(crate) finalized_block_number: Gauge, - /// Number of the current view of the replica. - #[metrics(unit = Unit::Seconds)] - pub(crate) replica_view_number: Gauge, + pub(crate) message_processing_latency: Family>, } /// Global instance of [`ConsensusMetrics`].