Skip to content

Commit

Permalink
metrics
Browse files Browse the repository at this point in the history
  • Loading branch information
brunoffranca committed Oct 30, 2024
1 parent 68527cb commit 51d2e48
Show file tree
Hide file tree
Showing 7 changed files with 58 additions and 41 deletions.
9 changes: 4 additions & 5 deletions node/actors/bft/src/chonky_bft/commit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,12 +146,11 @@ impl StateMachine {
.await
.wrap("process_commit_qc()")?;

// Metrics.
let now = ctx.now();
// Metrics. We observe the latency of committing to a block measured

Check warning on line 149 in node/actors/bft/src/chonky_bft/commit.rs

View workflow job for this annotation

GitHub Actions / typos

"commiting" should be "committing".
// from the start of this view.
metrics::METRICS
.leader_commit_phase_latency
.observe_latency(now - self.phase_start);
self.phase_start = now;
.commit_latency
.observe_latency(ctx.now() - self.view_start);

// Start a new view.
self.start_new_view(ctx, message.view.number.next()).await?;
Expand Down
18 changes: 10 additions & 8 deletions node/actors/bft/src/chonky_bft/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,8 @@ pub(crate) struct StateMachine {

/// The deadline to receive an input message before timing out.
pub(crate) timeout_deadline: time::Deadline,
/// Time when the current phase has started. Used for metrics.
pub(crate) phase_start: time::Instant,
/// Time when the current view phase has started. Used for metrics.
pub(crate) view_start: time::Instant,
}

impl StateMachine {
Expand Down Expand Up @@ -116,7 +116,7 @@ impl StateMachine {
timeout_views_cache: BTreeMap::new(),
timeout_qcs_cache: BTreeMap::new(),
timeout_deadline: time::Deadline::Finite(ctx.now() + Self::TIMEOUT_DURATION),
phase_start: ctx.now(),
view_start: ctx.now(),
};

Ok((this, send))
Expand All @@ -126,6 +126,8 @@ impl StateMachine {
/// This is the main entry point for the state machine,
/// potentially triggering state modifications and message sending to the executor.
pub(crate) async fn run(mut self, ctx: &ctx::Ctx) -> ctx::Result<()> {
self.view_start = ctx.now();

// If this is the first view, we immediately timeout. This will force the replicas
// to synchronize right at the beginning and will provide a justification for the
// next view. This is necessary because the first view is not justified by any
Expand Down Expand Up @@ -179,7 +181,7 @@ impl StateMachine {
Err(())
}
};
metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res)
metrics::ConsensusMsgLabel::LeaderProposal.with_result(&res)
}
ConsensusMsg::ReplicaCommit(_) => {
let res = match self
Expand All @@ -206,7 +208,7 @@ impl StateMachine {
Err(())
}
};
metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res)
metrics::ConsensusMsgLabel::ReplicaCommit.with_result(&res)
}
ConsensusMsg::ReplicaTimeout(_) => {
let res = match self
Expand All @@ -233,7 +235,7 @@ impl StateMachine {
Err(())
}
};
metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res)
metrics::ConsensusMsgLabel::ReplicaTimeout.with_result(&res)
}
ConsensusMsg::ReplicaNewView(_) => {
let res = match self
Expand All @@ -260,10 +262,10 @@ impl StateMachine {
Err(())
}
};
metrics::ConsensusMsgLabel::ReplicaPrepare.with_result(&res)
metrics::ConsensusMsgLabel::ReplicaNewView.with_result(&res)
}
};
metrics::METRICS.replica_processing_latency[&label].observe_latency(ctx.now() - now);
metrics::METRICS.message_processing_latency[&label].observe_latency(ctx.now() - now);

// Notify network actor that the message has been processed.
// Ignore sending error.
Expand Down
9 changes: 7 additions & 2 deletions node/actors/bft/src/chonky_bft/new_view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use std::cmp::max;

use super::StateMachine;
use crate::metrics;
use zksync_concurrency::{ctx, error::Wrap, time};
use zksync_concurrency::{ctx, error::Wrap, metrics::LatencyHistogramExt as _, time};
use zksync_consensus_network::io::ConsensusInputMessage;
use zksync_consensus_roles::validator;

Expand Down Expand Up @@ -141,9 +141,14 @@ impl StateMachine {
};
self.outbound_pipe.send(output_message.into());

// Log the event.
// Log the event and update the metrics.
tracing::info!("Starting view {}", self.view_number);
metrics::METRICS.replica_view_number.set(self.view_number.0);
let now = ctx.now();
metrics::METRICS
.view_latency
.observe_latency(now - self.view_start);
self.view_start = now;

// Reset the timeout.
self.timeout_deadline = time::Deadline::Finite(ctx.now() + Self::TIMEOUT_DURATION);
Expand Down
10 changes: 9 additions & 1 deletion node/actors/bft/src/chonky_bft/proposal.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use crate::metrics;

use super::StateMachine;
use std::cmp::max;
use zksync_concurrency::{ctx, error::Wrap};
use zksync_concurrency::{ctx, error::Wrap, metrics::LatencyHistogramExt as _};
use zksync_consensus_network::io::ConsensusInputMessage;
use zksync_consensus_roles::validator::{self, BlockHeader, BlockNumber};

Expand Down Expand Up @@ -188,6 +190,12 @@ impl StateMachine {

// ----------- All checks finished. Now we process the message. --------------

// Metrics. We observe the latency of receiving a proposal measured
// from the start of this view.
metrics::METRICS
.proposal_latency
.observe_latency(ctx.now() - self.view_start);

// Create our commit vote.
let commit_vote = validator::ReplicaCommit {
view: message.view(),
Expand Down
2 changes: 1 addition & 1 deletion node/actors/bft/src/chonky_bft/proposer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ pub(crate) async fn create_proposal(
}

metrics::METRICS
.leader_proposal_payload_size
.proposal_payload_size
.observe(payload.0.len());

Some(payload)
Expand Down
6 changes: 3 additions & 3 deletions node/actors/bft/src/chonky_bft/timeout.rs
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,9 @@ impl StateMachine {
// Metrics.
let now = ctx.now();
metrics::METRICS
.leader_commit_phase_latency
.observe_latency(now - self.phase_start);
self.phase_start = now;
.commit_phase_latency
.observe_latency(now - self.view_start);
self.view_start = now;

// Start a new view.
self.start_new_view(ctx, message.view.number.next()).await?;
Expand Down
45 changes: 24 additions & 21 deletions node/actors/bft/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ const PAYLOAD_SIZE_BUCKETS: Buckets = Buckets::exponential(
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EncodeLabelValue)]
#[metrics(rename_all = "snake_case")]
pub(crate) enum ConsensusMsgLabel {
/// Label for a `LeaderPrepare` message.
LeaderPrepare,
/// Label for a `LeaderCommit` message.
LeaderCommit,
/// Label for a `ReplicaPrepare` message.
ReplicaPrepare,
/// Label for a `LeaderProposal` message.
LeaderProposal,
/// Label for a `ReplicaCommit` message.
ReplicaCommit,
/// Label for a `ReplicaTimeout` message.
ReplicaTimeout,
/// Label for a `ReplicaNewView` message.
ReplicaNewView,
}

impl ConsensusMsgLabel {
Expand Down Expand Up @@ -53,26 +53,29 @@ pub(crate) struct ProcessingLatencyLabels {
#[derive(Debug, Metrics)]
#[metrics(prefix = "consensus")]
pub(crate) struct ConsensusMetrics {
/// Number of the current view of the replica.
#[metrics(unit = Unit::Seconds)]
pub(crate) replica_view_number: Gauge<u64>,
/// Number of the last finalized block observed by the node.
pub(crate) finalized_block_number: Gauge<u64>,
/// Size of the proposed payload in bytes.
#[metrics(buckets = PAYLOAD_SIZE_BUCKETS, unit = Unit::Bytes)]
pub(crate) leader_proposal_payload_size: Histogram<usize>,
/// Latency of the commit phase observed by the leader.
pub(crate) proposal_payload_size: Histogram<usize>,
/// Latency of receiving a proposal as observed by the replica. Measures from
/// the start of the view until we have a verified proposal.
#[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)]
pub(crate) leader_commit_phase_latency: Histogram<Duration>,
/// Currently set timeout after which replica will proceed to the next view.
#[metrics(unit = Unit::Seconds)]
pub(crate) replica_view_timeout: Gauge<Duration>,
pub(crate) proposal_latency: Histogram<Duration>,
/// Latency of committing to a block as observed by the replica. Measures from
/// the start of the view until we send a commit vote.
#[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)]
pub(crate) commit_latency: Histogram<Duration>,
/// Latency of a single view as observed by the replica. Measures from
/// the start of the view until the start of the next.
#[metrics(buckets = Buckets::exponential(0.01..=20.0, 1.5), unit = Unit::Seconds)]
pub(crate) view_latency: Histogram<Duration>,
/// Latency of processing messages by the replicas.
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub(crate) replica_processing_latency: Family<ProcessingLatencyLabels, Histogram<Duration>>,
/// Latency of processing messages by the leader.
#[metrics(buckets = Buckets::LATENCIES, unit = Unit::Seconds)]
pub(crate) leader_processing_latency: Family<ProcessingLatencyLabels, Histogram<Duration>>,
/// Number of the last finalized block observed by the node.
pub(crate) finalized_block_number: Gauge<u64>,
/// Number of the current view of the replica.
#[metrics(unit = Unit::Seconds)]
pub(crate) replica_view_number: Gauge<u64>,
pub(crate) message_processing_latency: Family<ProcessingLatencyLabels, Histogram<Duration>>,
}

/// Global instance of [`ConsensusMetrics`].
Expand Down

0 comments on commit 51d2e48

Please sign in to comment.