diff --git a/.github/dependabot.yml b/.github/dependabot.yml deleted file mode 100644 index c2fc36a3e6a61f..00000000000000 --- a/.github/dependabot.yml +++ /dev/null @@ -1,41 +0,0 @@ -# To get started with Dependabot version updates, you'll need to specify which -# package ecosystems to update and where the package manifests are located. -# Please see the documentation for all configuration options: -# https://help.github.com/github/administering-a-repository/configuration-options-for-dependency-updates - -version: 2 -updates: -- package-ecosystem: cargo - directory: "/" - schedule: - interval: daily - time: "01:00" - timezone: America/Los_Angeles - #labels: - # - "automerge" - open-pull-requests-limit: 3 - -- package-ecosystem: npm - directory: "/web3.js" - schedule: - interval: daily - time: "01:00" - timezone: America/Los_Angeles - labels: - - "automerge" - commit-message: - prefix: "chore:" - open-pull-requests-limit: 3 - -- package-ecosystem: npm - directory: "/explorer" - schedule: - interval: daily - time: "01:00" - timezone: America/Los_Angeles - labels: - - "automerge" - commit-message: - prefix: "chore:" - include: "scope" - open-pull-requests-limit: 3 diff --git a/Cargo.lock b/Cargo.lock index f5806f14654309..999c43ed8d9bce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5358,6 +5358,8 @@ dependencies = [ "solana-metrics", "solana-net-utils", "solana-perf", + "solana-prometheus-macro", + "solana-prometheus-utils", "solana-rayon-threadlimit", "solana-runtime", "solana-sdk 1.10.32", @@ -5851,6 +5853,34 @@ dependencies = [ "tokio", ] +[[package]] +name = "solana-prometheus" +version = "1.10.28" +dependencies = [ + "jsonrpc-http-server", + "solana-gossip", + "solana-prometheus-utils", + "solana-runtime", + "solana-sdk 1.10.32", + "solana-vote-program", +] + +[[package]] +name = "solana-prometheus-macro" +version = "1.0.0" +dependencies = [ + "bs58", + "proc-macro2 1.0.32", + "quote 1.0.10", + "rustversion", + "solana-prometheus-utils", + "syn 1.0.91", +] + +[[package]] +name = "solana-prometheus-utils" +version = "1.0.0" + [[package]] name = "solana-rayon-threadlimit" version = "1.10.32" @@ -5960,6 +5990,7 @@ dependencies = [ "solana-net-utils", "solana-perf", "solana-poh", + "solana-prometheus", "solana-rayon-threadlimit", "solana-runtime", "solana-sdk 1.10.32", diff --git a/core/src/validator.rs b/core/src/validator.rs index a3382a6145a343..ab5fbf1ddc9b74 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -173,6 +173,7 @@ pub struct ValidatorConfig { pub wait_to_vote_slot: Option, pub ledger_column_options: LedgerColumnOptions, pub enable_quic_servers: bool, + pub vote_accounts_to_monitor: Arc>, } impl Default for ValidatorConfig { @@ -237,6 +238,7 @@ impl Default for ValidatorConfig { wait_to_vote_slot: None, ledger_column_options: LedgerColumnOptions::default(), enable_quic_servers: true, + vote_accounts_to_monitor: Arc::new(HashSet::new()), } } } @@ -736,6 +738,7 @@ impl Validator { leader_schedule_cache.clone(), connection_cache.clone(), max_complete_transaction_status_slot, + config.vote_accounts_to_monitor.clone(), )), if !config.rpc_config.full_api { None diff --git a/gossip/Cargo.toml b/gossip/Cargo.toml index ad9ac0773a0a87..cf078140728aee 100644 --- a/gossip/Cargo.toml +++ b/gossip/Cargo.toml @@ -46,6 +46,8 @@ solana-streamer = { path = "../streamer", version = "=1.10.32" } solana-version = { path = "../version", version = "=1.10.32" } solana-vote-program = { path = "../programs/vote", version = "=1.10.32" } thiserror = "1.0" +solana-prometheus-macro = { path 
= "../prometheus/macros", version = "=1.0.0" } +solana-prometheus-utils = { path = "../prometheus/utils", version = "=1.0.0" } [dev-dependencies] num_cpus = "1.13.1" diff --git a/gossip/src/cluster_info.rs b/gossip/src/cluster_info.rs index 27652597d180fe..23bc98154bc4c2 100644 --- a/gossip/src/cluster_info.rs +++ b/gossip/src/cluster_info.rs @@ -160,7 +160,7 @@ pub struct ClusterInfo { outbound_budget: DataBudget, my_contact_info: RwLock, ping_cache: Mutex, - stats: GossipStats, + pub stats: GossipStats, socket: UdpSocket, local_message_pending_push_queue: Mutex>, contact_debug_interval: u64, // milliseconds, 0 = disabled @@ -2527,8 +2527,8 @@ impl ClusterInfo { should_check_duplicate_instance, )?; if last_print.elapsed() > SUBMIT_GOSSIP_STATS_INTERVAL { - submit_gossip_stats(&self.stats, &self.gossip, &stakes); - *last_print = Instant::now(); + // submit_gossip_stats(&self.stats, &self.gossip, &stakes); + // *last_print = Instant::now(); } Ok(()) } diff --git a/gossip/src/cluster_info_metrics.rs b/gossip/src/cluster_info_metrics.rs index 27bd8b98b22743..a81326d67ea020 100644 --- a/gossip/src/cluster_info_metrics.rs +++ b/gossip/src/cluster_info_metrics.rs @@ -1,3 +1,5 @@ +use solana_prometheus_macro::ExportPrometheus; + use { crate::crds_gossip::CrdsGossip, itertools::Itertools, @@ -87,7 +89,7 @@ impl<'a, T> Drop for TimedGuard<'a, T> { } } -#[derive(Default)] +#[derive(Default, ExportPrometheus)] pub struct GossipStats { pub(crate) all_tvu_peers: Counter, pub(crate) bad_prune_destination: Counter, diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index f8aae532d825b4..437c0bbfaea755 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -66,6 +66,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { wait_to_vote_slot: config.wait_to_vote_slot, ledger_column_options: config.ledger_column_options.clone(), enable_quic_servers: config.enable_quic_servers, + vote_accounts_to_monitor: config.vote_accounts_to_monitor.clone(), } } diff --git a/prometheus/Cargo.toml b/prometheus/Cargo.toml new file mode 100644 index 00000000000000..b0d5384c853e67 --- /dev/null +++ b/prometheus/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "solana-prometheus" +version = "1.10.28" +description = "Solana Prometheus" +authors = ["ChorusOne "] +repository = "https://github.com/ChorusOne/solana" +license = "Apache-2.0" +edition = "2021" + +[dependencies] +jsonrpc-http-server = "18.0.0" +solana-gossip = { path = "../gossip" } +solana-runtime = { path = "../runtime" } +solana-sdk = { path = "../sdk" } +solana-vote-program = { path = "../programs/vote" } +solana-prometheus-utils = { path = "utils" } + +[lib] +crate-type = ["lib"] +name = "solana_prometheus" + +[package.metadata.docs.rs] +targets = ["x86_64-unknown-linux-gnu"] diff --git a/prometheus/macros/Cargo.toml b/prometheus/macros/Cargo.toml new file mode 100644 index 00000000000000..ae4a85212b6a8a --- /dev/null +++ b/prometheus/macros/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "solana-prometheus-macro" +version = "1.0.0" +description = "Solana Prometheus" +authors = ["ChorusOne "] +repository = "https://github.com/ChorusOne/solana" +license = "Apache-2.0" +edition = "2021" + + +[lib] +proc-macro = true + +[dependencies] +bs58 = "0.4.0" +proc-macro2 = "1.0.19" +quote = "1.0" +syn = { version = "1.0", features = ["full", "extra-traits"] } +rustversion = "1.0.3" +solana-prometheus-utils = { path = "../utils" } + +[package.metadata.docs.rs] +targets = 
["x86_64-unknown-linux-gnu"] diff --git a/prometheus/macros/src/lib.rs b/prometheus/macros/src/lib.rs new file mode 100644 index 00000000000000..823073667b5ea2 --- /dev/null +++ b/prometheus/macros/src/lib.rs @@ -0,0 +1,45 @@ +extern crate proc_macro2; + +use proc_macro::TokenStream; +#[macro_use] +extern crate quote; + +#[proc_macro_derive(ExportPrometheus)] +pub fn derive_field_count(input: TokenStream) -> TokenStream { + // Parse the input tokens into a syntax tree + let ast = syn::parse(input).unwrap(); + parse(&ast) +} + +fn parse(ast: &syn::DeriveInput) -> TokenStream { + let name = &ast.ident; + let data = &ast.data; + + let idents: Vec<_> = match data { + syn::Data::Struct(struct_data) => struct_data + .fields + .iter() + .filter_map(|field| field.ident.as_ref().map(|ident| ident)) + .collect(), + _ => panic!("Should be derived from struct"), + }; + + let expanded = quote! { + impl #name { + pub fn write_prometheus(&self, out: &mut W) -> std::io::Result<()> { + use core::sync::atomic::Ordering; + #(solana_prometheus_utils::write_metric( + out, + &solana_prometheus_utils::MetricFamily { + name: &format!("solana_gossip_{}", stringify!(#idents)), + help: "Auto generated with Prometheus macro", + type_: "counter", + metrics: vec![solana_prometheus_utils::Metric::new(self.#idents.0.load(Ordering::Relaxed))], + }, + )?;)* + Ok(()) + } + } + }; + expanded.into() +} diff --git a/prometheus/src/bank_metrics.rs b/prometheus/src/bank_metrics.rs new file mode 100644 index 00000000000000..7ec2380babc49b --- /dev/null +++ b/prometheus/src/bank_metrics.rs @@ -0,0 +1,83 @@ +use crate::banks_with_commitments::BanksWithCommitments; +use solana_prometheus_utils::{write_metric, Metric, MetricFamily}; +use solana_sdk::sysvar; +use solana_sdk::sysvar::epoch_schedule::EpochSchedule; +use std::io; + +pub fn write_bank_metrics( + banks_with_commitments: &BanksWithCommitments, + out: &mut W, +) -> io::Result<()> { + write_metric( + out, + &MetricFamily { + name: "solana_block_slot", + help: "Block Slot", + type_: "gauge", + metrics: banks_with_commitments + .for_each_commitment(|bank| Some(Metric::new(bank.clock().slot))), + }, + )?; + write_metric( + out, + &MetricFamily { + name: "solana_block_epoch", + help: "Block Epoch", + type_: "gauge", + metrics: banks_with_commitments + .for_each_commitment(|bank| Some(Metric::new(bank.clock().epoch))), + }, + )?; + write_metric( + out, + &MetricFamily { + name: "solana_block_epoch_start_slot", + help: "The first slot in the current epoch", + type_: "gauge", + metrics: banks_with_commitments + .for_each_commitment(|bank| { + // Note, the bank actually has a field that holds the EpochSchedule, + // but it is not public, so we can't easily access it here. We could + // make it public, but to make our patches less invasive, load the + // epoch schedule from the sysvar instead. It should always exist. + let epoch_schedule: EpochSchedule = bank + .get_account(&sysvar::epoch_schedule::id())? + .deserialize_data().ok()?; + let clock = bank.clock(); + Some(Metric::new(epoch_schedule.get_first_slot_in_epoch(clock.epoch))) + }), + }, + )?; + write_metric( + out, + &MetricFamily { + name: "solana_block_epoch_slots_total", + help: "The duration of the current epoch, in slots.", + type_: "gauge", + metrics: banks_with_commitments + .for_each_commitment(|bank| { + // Note, the bank actually has a field that holds the EpochSchedule, + // but it is not public, so we can't easily access it here. 
We could + // make it public, but to make our patches less invasive, load the + // epoch schedule from the sysvar instead. It should always exist. + let epoch_schedule: EpochSchedule = bank + .get_account(&sysvar::epoch_schedule::id())? + .deserialize_data().ok()?; + let clock = bank.clock(); + Some(Metric::new(epoch_schedule.get_slots_in_epoch(clock.epoch))) + }), + }, + )?; + write_metric( + out, + &MetricFamily { + name: "solana_block_timestamp_seconds", + help: "The block's UNIX timestamp, in seconds since epoch, UTC", + type_: "gauge", + metrics: banks_with_commitments + .for_each_commitment(|bank| Some(Metric::new(bank.clock().unix_timestamp as u64))), + }, + )?; + + Ok(()) +} diff --git a/prometheus/src/banks_with_commitments.rs b/prometheus/src/banks_with_commitments.rs new file mode 100644 index 00000000000000..4ad41620e1e0a6 --- /dev/null +++ b/prometheus/src/banks_with_commitments.rs @@ -0,0 +1,77 @@ +use std::sync::{Arc, RwLock}; + +use solana_prometheus_utils::Metric; +use solana_runtime::{bank::Bank, bank_forks::BankForks, commitment::BlockCommitmentCache}; + +pub struct BanksWithCommitments { + pub finalized_bank: Arc, + pub confirmed_bank: Arc, + pub processed_bank: Arc, +} + +impl BanksWithCommitments { + pub fn new( + bank_forks: &Arc>, + block_commitment_cache: &Arc>, + ) -> Self { + let (finalized_slot, confirmed_slot, processed_slot) = { + let block_commitment_cache = block_commitment_cache.read().unwrap(); + ( + block_commitment_cache.slot_with_commitment( + solana_sdk::commitment_config::CommitmentLevel::Finalized, + ), + block_commitment_cache.slot_with_commitment( + solana_sdk::commitment_config::CommitmentLevel::Confirmed, + ), + block_commitment_cache.slot_with_commitment( + solana_sdk::commitment_config::CommitmentLevel::Processed, + ), + ) + }; + let r_bank_forks = bank_forks.read().unwrap(); + + let default_closure = || { + // From rpc/src/rpc.rs + // + // It may occur after an old bank has been purged from BankForks and a new + // BlockCommitmentCache has not yet arrived. To make this case impossible, + // BlockCommitmentCache should hold an `Arc` everywhere it currently holds + // a slot. + // + // For more information, see https://github.com/solana-labs/solana/issues/11078 + r_bank_forks.root_bank() + }; + let finalized_bank = r_bank_forks + .get(finalized_slot) + .unwrap_or_else(default_closure); + let confirmed_bank = r_bank_forks + .get(confirmed_slot) + .unwrap_or_else(default_closure); + let processed_bank = r_bank_forks + .get(processed_slot) + .unwrap_or_else(default_closure); + BanksWithCommitments { + finalized_bank, + confirmed_bank, + processed_bank, + } + } + + /// Call function callback for each commitment level, and returns a vector of metrics. 
+    pub fn for_each_commitment<F: Fn(&Bank) -> Option<Metric>>(&self, get: F) -> Vec<Metric> {
+        let mut result = Vec::with_capacity(3);
+        result.extend(
+            get(&self.finalized_bank)
+                .map(|m| m.with_label("commitment_level", "finalized".to_owned())),
+        );
+        result.extend(
+            get(&self.confirmed_bank)
+                .map(|m| m.with_label("commitment_level", "confirmed".to_owned())),
+        );
+        result.extend(
+            get(&self.processed_bank)
+                .map(|m| m.with_label("commitment_level", "processed".to_owned())),
+        );
+        result
+    }
+}
diff --git a/prometheus/src/cluster_metrics.rs b/prometheus/src/cluster_metrics.rs
new file mode 100644
index 00000000000000..6c98e2970ff518
--- /dev/null
+++ b/prometheus/src/cluster_metrics.rs
@@ -0,0 +1,158 @@
+use solana_gossip::cluster_info::ClusterInfo;
+use solana_prometheus_utils::{write_metric, Lamports, Metric, MetricFamily};
+use solana_runtime::bank::Bank;
+use solana_sdk::{clock::Slot, pubkey::Pubkey};
+use solana_vote_program::vote_state::VoteState;
+
+use crate::banks_with_commitments::BanksWithCommitments;
+use std::{collections::HashSet, io, sync::Arc};
+
+struct ValidatorVoteInfo {
+    balance: Lamports,
+    last_vote: Slot,
+    vote_credits: u64,
+    identity: Pubkey,
+    activated_stake: Lamports,
+}
+
+fn get_vote_state(bank: &Bank, vote_pubkey: &Pubkey) -> Option<ValidatorVoteInfo> {
+    let default_vote_state = VoteState::default();
+    let vote_accounts = bank.vote_accounts();
+    let (activated_stake, vote_account) = vote_accounts.get(vote_pubkey)?;
+    let vote_state = vote_account.vote_state();
+    let vote_state = vote_state.as_ref().unwrap_or(&default_vote_state);
+
+    let last_vote = vote_state.votes.back()?.slot;
+    let balance = Lamports(bank.get_balance(&vote_pubkey));
+    let vote_credits = vote_state.credits();
+    Some(ValidatorVoteInfo {
+        balance,
+        last_vote,
+        vote_credits,
+        identity: vote_state.node_pubkey,
+        activated_stake: Lamports(*activated_stake),
+    })
+}
+
+pub fn write_cluster_metrics<W: io::Write>(
+    banks_with_commitments: &BanksWithCommitments,
+    cluster_info: &Arc<ClusterInfo>,
+    vote_accounts: &Arc<HashSet<Pubkey>>,
+    out: &mut W,
+) -> io::Result<()> {
+    let identity_pubkey = cluster_info.id();
+    let version = cluster_info
+        .get_node_version(&identity_pubkey)
+        .unwrap_or_default();
+
+    write_metric(
+        out,
+        &MetricFamily {
+            name: "solana_node_identity_public_key_info",
+            help: "The node's current identity",
+            type_: "counter",
+            metrics: vec![
+                Metric::new(1).with_label("identity_account", identity_pubkey.to_string())
+            ],
+        },
+    )?;
+
+    write_metric(
+        out,
+        &MetricFamily {
+            name: "solana_node_identity_balance_sol",
+            help: "The balance of the node's identity account",
+            type_: "gauge",
+            metrics: banks_with_commitments.for_each_commitment(|bank| {
+                Some(
+                    Metric::new_sol(Lamports(bank.get_balance(&identity_pubkey)))
+                        .with_label("identity_account", identity_pubkey.to_string()),
+                )
+            }),
+        },
+    )?;
+
+    write_metric(
+        out,
+        &MetricFamily {
+            name: "solana_node_version_info",
+            help: "The current Solana node's version",
+            type_: "counter",
+            metrics: vec![Metric::new(1).with_label("version", version.to_string())],
+        },
+    )?;
+
+    // Vote accounts information
+    for vote_account in vote_accounts.iter() {
+        write_metric(
+            out,
+            &MetricFamily {
+                name: "solana_validator_last_vote_slot",
+                help:
+                    "The voted-on slot of the validator's last vote that got included in the chain",
+                type_: "gauge",
+                metrics: banks_with_commitments.for_each_commitment(|bank| {
+                    let vote_info = get_vote_state(bank, vote_account)?;
+                    Some(
+                        Metric::new(vote_info.last_vote)
+                            .with_label("identity_account", vote_info.identity.to_string())
.with_label("vote_account", vote_account.to_string()), + ) + }), + }, + )?; + + write_metric( + out, + &MetricFamily { + name: "solana_validator_vote_account_balance_sol", + help: "The balance of the vote account at the given address", + type_: "gauge", + metrics: banks_with_commitments.for_each_commitment(|bank| { + let vote_info = get_vote_state(bank, vote_account)?; + Some( + Metric::new_sol(vote_info.balance) + .with_label("identity_account", vote_info.identity.to_string()) + .with_label("vote_account", vote_account.to_string()), + ) + }), + }, + )?; + + write_metric( + out, + &MetricFamily { + name: "solana_validator_vote_credits", + help: "The total number of vote credits credited to this vote account", + type_: "gauge", + metrics: banks_with_commitments.for_each_commitment(|bank| { + let vote_info = get_vote_state(bank, vote_account)?; + Some( + Metric::new(vote_info.vote_credits) + .with_label("identity_account", vote_info.identity.to_string()) + .with_label("vote_account", vote_account.to_string()), + ) + }), + }, + )?; + + write_metric( + out, + &MetricFamily { + name: "solana_validator_active_stake_sol", + help: "The total amount of Sol actively staked to this validator", + type_: "gauge", + metrics: banks_with_commitments.for_each_commitment(|bank| { + let vote_info = get_vote_state(bank, vote_account)?; + Some( + Metric::new_sol(vote_info.activated_stake) + .with_label("identity_account", vote_info.identity.to_string()) + .with_label("vote_account", vote_account.to_string()), + ) + }), + }, + )?; + } + + Ok(()) +} diff --git a/prometheus/src/lib.rs b/prometheus/src/lib.rs new file mode 100644 index 00000000000000..f6c07e1be2331a --- /dev/null +++ b/prometheus/src/lib.rs @@ -0,0 +1,35 @@ +mod bank_metrics; +pub mod banks_with_commitments; +mod cluster_metrics; + +use banks_with_commitments::BanksWithCommitments; +use solana_gossip::cluster_info::ClusterInfo; +use solana_sdk::pubkey::Pubkey; +use std::{collections::HashSet, sync::Arc}; + +pub fn render_prometheus( + banks_with_commitments: BanksWithCommitments, + cluster_info: &Arc, + vote_accounts: &Arc>, +) -> Vec { + // There are 3 levels of commitment for a bank: + // - finalized: most recent block *confirmed* by supermajority of the + // cluster. + // - confirmed: most recent block that has been *voted* on by supermajority + // of the cluster. + // - processed: most recent block. 
+    let mut out: Vec<u8> = Vec::new();
+    bank_metrics::write_bank_metrics(&banks_with_commitments, &mut out).expect("IO error");
+    cluster_metrics::write_cluster_metrics(
+        &banks_with_commitments,
+        &cluster_info,
+        vote_accounts,
+        &mut out,
+    )
+    .expect("IO error");
+    cluster_info
+        .stats
+        .write_prometheus(&mut out)
+        .expect("IO error");
+    out
+}
diff --git a/prometheus/utils/Cargo.toml b/prometheus/utils/Cargo.toml
new file mode 100644
index 00000000000000..2d13fc9a3209be
--- /dev/null
+++ b/prometheus/utils/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "solana-prometheus-utils"
+version = "1.0.0"
+description = "Solana Utils Prometheus"
+authors = ["ChorusOne "]
+repository = "https://github.com/ChorusOne/solana"
+license = "Apache-2.0"
+edition = "2021"
+
+[dependencies]
+
+[lib]
+crate-type = ["lib"]
+name = "solana_prometheus_utils"
+
+[package.metadata.docs.rs]
+targets = ["x86_64-unknown-linux-gnu"]
diff --git a/prometheus/utils/src/lib.rs b/prometheus/utils/src/lib.rs
new file mode 100644
index 00000000000000..b384fcdf23a2c1
--- /dev/null
+++ b/prometheus/utils/src/lib.rs
@@ -0,0 +1,307 @@
+// SPDX-FileCopyrightText: 2022 Chorus One AG
+// SPDX-License-Identifier: Apache-2.0
+
+//! Utilities for formatting Prometheus metrics.
+//!
+//! See also .
+
+use std::io;
+use std::io::Write;
+use std::time::SystemTime;
+
+#[derive(Clone, Copy)]
+pub struct Lamports(pub u64);
+
+pub struct MetricFamily<'a> {
+    /// Name of the metric, e.g. [`goats_teleported_total`](https://crbug.com/31482).
+    pub name: &'a str,
+    /// HELP line content.
+    pub help: &'a str,
+    /// TYPE line content. Most common are `counter`, `gauge`, and `histogram`.
+    pub type_: &'a str,
+    /// Values for this metric, possibly with labels or a suffix.
+    pub metrics: Vec<Metric<'a>>,
+}
+
+pub enum MetricValue {
+    /// Render the inner value as-is, as an integer.
+    Int(u64),
+
+    /// Divide the inner value by 1e9 and render as fixed-point number.
+    ///
+    /// E.g. `Nano(12)` renders as `0.000000012`.
+    Nano(u64),
+
+    Float(f64),
+}
+
+impl From<u64> for MetricValue {
+    fn from(v: u64) -> MetricValue {
+        MetricValue::Int(v)
+    }
+}
+
+impl From<f64> for MetricValue {
+    fn from(v: f64) -> MetricValue {
+        MetricValue::Float(v)
+    }
+}
+
+pub struct Metric<'a> {
+    /// Suffix to append to the metric name, useful for e.g. the `_bucket` suffix on histograms.
+    pub suffix: &'a str,
+
+    /// Name-value label pairs.
+    pub labels: Vec<(&'a str, String)>,
+
+    /// Metric value, either an integer, or a fixed-point number.
+    pub value: MetricValue,
+
+    /// Time at which this metric was observed, when proxying metrics.
+    pub timestamp: Option<SystemTime>,
+}
+
+impl<'a> Metric<'a> {
+    /// Construct a basic metric with just a value.
+    ///
+    /// Can be extended with the builder-style methods below.
+    pub fn new<T: Into<MetricValue>>(value: T) -> Metric<'a> {
+        Metric {
+            labels: Vec::new(),
+            suffix: "",
+            value: value.into(),
+            timestamp: None,
+        }
+    }
+
+    /// Construct a metric that measures an amount of SOL.
+    pub fn new_sol(amount: Lamports) -> Metric<'a> {
+        // One Lamport is 1e-9 SOL, so we use nano here.
+        Metric::new(MetricValue::Nano(amount.0))
+    }
+
+    pub fn with_label(mut self, label_key: &'a str, label_value: String) -> Metric<'a> {
+        self.labels.push((label_key, label_value));
+        self
+    }
+
+    /// Set the suffix.
+ pub fn with_suffix(mut self, suffix: &'a str) -> Metric<'a> { + self.suffix = suffix; + self + } +} + +pub fn write_metric(out: &mut W, family: &MetricFamily) -> io::Result<()> { + writeln!(out, "# HELP {} {}", family.name, family.help)?; + writeln!(out, "# TYPE {} {}", family.name, family.type_)?; + for metric in &family.metrics { + write!(out, "{}{}", family.name, metric.suffix)?; + + // If there are labels, write the key-value pairs between {}. + // Escaping of the value uses Rust's string syntax, which is + // not exactly what Prometheus wants, but it is identical for + // all of the values that we use it with; this is not a general + // Prometheus formatter, just a quick one for our use. + if !metric.labels.is_empty() { + write!(out, "{{")?; + let mut separator = ""; + for (key, value) in &metric.labels { + write!(out, "{}{}={:?}", separator, key, value)?; + separator = ","; + } + write!(out, "}}")?; + } + + match metric.value { + MetricValue::Int(v) => write!(out, " {}", v)?, + MetricValue::Nano(v) => { + write!(out, " {}.{:0>9}", v / 1_000_000_000, v % 1_000_000_000)? + } + MetricValue::Float(v) => write!(out, " {}", v)?, + } + + if let Some(timestamp) = metric.timestamp { + let unix_time_ms = match timestamp.duration_since(SystemTime::UNIX_EPOCH) { + Ok(duration) => duration.as_millis(), + Err(..) => panic!("Found a metric dated before UNIX_EPOCH."), + }; + // Timestamps in Prometheus are milliseconds since epoch, + // excluding leap seconds. (Which is what you get if your system + // clock tracks UTC.) + write!(out, " {}", unix_time_ms)?; + } + + writeln!(out)?; + } + + // Add a blank line for readability by humans. + writeln!(out) +} + +#[cfg(test)] +mod test { + use std::str; + + use super::{write_metric, Metric, MetricFamily, MetricValue}; + + #[test] + fn write_metric_without_labels() { + let mut out: Vec = Vec::new(); + write_metric( + &mut out, + &MetricFamily { + // The metric names are just for testing purposes. + // See also https://crbug.com/31482. 
+ name: "goats_teleported_total", + help: "Number of goats teleported since launch.", + type_: "counter", + metrics: vec![Metric::new(144)], + }, + ) + .unwrap(); + + assert_eq!( + str::from_utf8(&out[..]), + Ok( + "# HELP goats_teleported_total Number of goats teleported since launch.\n\ + # TYPE goats_teleported_total counter\n\ + goats_teleported_total 144\n\n\ + " + ) + ) + } + + #[test] + fn write_metric_histogram() { + let mut out: Vec = Vec::new(); + write_metric( + &mut out, + &MetricFamily { + name: "teleported_goat_weight_kg", + help: "Histogram of the weight of teleported goats.", + type_: "histogram", + metrics: vec![ + Metric::new(44) + .with_suffix("_bucket") + .with_label("le", "50.0".to_string()), + Metric::new(67) + .with_suffix("_bucket") + .with_label("le", "75.0".to_string()), + Metric::new(144) + .with_suffix("_bucket") + .with_label("le", "+Inf".to_string()), + Metric::new(11520).with_suffix("_sum"), + Metric::new(144).with_suffix("_count"), + ], + }, + ) + .unwrap(); + + assert_eq!( + str::from_utf8(&out[..]), + Ok( + "# HELP teleported_goat_weight_kg Histogram of the weight of teleported goats.\n\ + # TYPE teleported_goat_weight_kg histogram\n\ + teleported_goat_weight_kg_bucket{le=\"50.0\"} 44\n\ + teleported_goat_weight_kg_bucket{le=\"75.0\"} 67\n\ + teleported_goat_weight_kg_bucket{le=\"+Inf\"} 144\n\ + teleported_goat_weight_kg_sum 11520\n\ + teleported_goat_weight_kg_count 144\n\n\ + " + ) + ) + } + + #[test] + fn write_metric_multiple_labels() { + let mut out: Vec = Vec::new(); + write_metric( + &mut out, + &MetricFamily { + name: "goats_teleported_total", + help: "Number of goats teleported since launch by departure and arrival.", + type_: "counter", + metrics: vec![ + Metric::new(10) + .with_label("src", "AMS".to_string()) + .with_label("dst", "ZRH".to_string()), + Metric::new(53) + .with_label("src", "ZRH".to_string()) + .with_label("dst", "DXB".to_string()), + ], + }, + ) + .unwrap(); + + assert_eq!( + str::from_utf8(&out[..]), + Ok( + "# HELP goats_teleported_total Number of goats teleported since launch by departure and arrival.\n\ + # TYPE goats_teleported_total counter\n\ + goats_teleported_total{src=\"AMS\",dst=\"ZRH\"} 10\n\ + goats_teleported_total{src=\"ZRH\",dst=\"DXB\"} 53\n\n\ + " + ) + ) + } + + #[test] + fn write_metric_with_timestamp() { + use std::time::{Duration, SystemTime}; + + let mut out: Vec = Vec::new(); + let t = SystemTime::UNIX_EPOCH + Duration::from_secs(77); + write_metric( + &mut out, + &MetricFamily { + name: "goats_teleported_total", + help: "Number of goats teleported since launch.", + type_: "counter", + metrics: vec![Metric::new(10)], + }, + ) + .unwrap(); + + assert_eq!( + str::from_utf8(&out[..]), + Ok( + "# HELP goats_teleported_total Number of goats teleported since launch.\n\ + # TYPE goats_teleported_total counter\n\ + goats_teleported_total 10 77000\n\n\ + " + ) + ) + } + + #[test] + fn write_metric_nano_micro() { + let mut out: Vec = Vec::new(); + write_metric( + &mut out, + &MetricFamily { + name: "goat_weight_kg", + help: "Weight of the goat in kilograms.", + type_: "gauge", + metrics: vec![ + // One greater than 1, with no need for zero padding. + Metric::new(MetricValue::Nano(67_533_128_017)), + // One smaller than 1, with the need for zero padding. 
+ Metric::new(MetricValue::Nano(128_017)), + ], + }, + ) + .unwrap(); + + assert_eq!( + str::from_utf8(&out[..]), + Ok("# HELP goat_weight_kg Weight of the goat in kilograms.\n\ + # TYPE goat_weight_kg gauge\n\ + goat_weight_kg 67.533128017\n\ + goat_weight_kg 67.533128\n\ + goat_weight_kg 0.000128017\n\ + goat_weight_kg 0.000128\n\n\ + ") + ) + } +} diff --git a/replica-node/src/main.rs b/replica-node/src/main.rs index 07369ebd635c33..06cf8b2add6153 100644 --- a/replica-node/src/main.rs +++ b/replica-node/src/main.rs @@ -401,6 +401,7 @@ pub fn main() { account_indexes: AccountSecondaryIndexes::default(), accounts_db_caching_enabled: false, replica_exit: Arc::new(RwLock::new(Exit::default())), + vote_accounts_to_monitor: Arc::new(HashSet::default()), }; let replica = ReplicaNode::new(config); diff --git a/replica-node/src/replica_node.rs b/replica-node/src/replica_node.rs index 29ba56cf81f1fb..872d96c985842c 100644 --- a/replica-node/src/replica_node.rs +++ b/replica-node/src/replica_node.rs @@ -1,3 +1,7 @@ +use std::collections::HashSet; + +use solana_sdk::pubkey::Pubkey; + use { crate::accountsdb_repl_service::AccountsDbReplService, crossbeam_channel::unbounded, @@ -58,6 +62,7 @@ pub struct ReplicaNodeConfig { pub accounts_db_caching_enabled: bool, pub replica_exit: Arc>, pub socket_addr_space: SocketAddrSpace, + pub vote_accounts_to_monitor: Arc>, } pub struct ReplicaNode { @@ -251,6 +256,7 @@ fn start_client_rpc_services( leader_schedule_cache.clone(), connection_cache, max_complete_transaction_status_slot, + replica_config.vote_accounts_to_monitor.clone(), )), Some(pubsub_service), Some(OptimisticallyConfirmedBankTracker::new( diff --git a/rpc/Cargo.toml b/rpc/Cargo.toml index d5f24e4eddbb10..8af7af78758776 100644 --- a/rpc/Cargo.toml +++ b/rpc/Cargo.toml @@ -55,6 +55,8 @@ thiserror = "1.0" tokio = { version = "~1.14.1", features = ["full"] } tokio-util = { version = "0.6", features = ["codec", "compat"] } +solana-prometheus = { path = "../prometheus" } + [dev-dependencies] serial_test = "0.6.0" solana-address-lookup-table-program = { path = "../programs/address-lookup-table", version = "=1.10.32" } diff --git a/rpc/src/rpc_health.rs b/rpc/src/rpc_health.rs index 8f4b4dfc4c53d8..d5048ce3626840 100644 --- a/rpc/src/rpc_health.rs +++ b/rpc/src/rpc_health.rs @@ -18,7 +18,7 @@ pub enum RpcHealthStatus { } pub struct RpcHealth { - cluster_info: Arc, + pub cluster_info: Arc, known_validators: Option>, health_check_slot_distance: u64, override_health_check: Arc, diff --git a/rpc/src/rpc_service.rs b/rpc/src/rpc_service.rs index 7e32bf81ea195d..907a17458d8b91 100644 --- a/rpc/src/rpc_service.rs +++ b/rpc/src/rpc_service.rs @@ -28,6 +28,7 @@ use { solana_metrics::inc_new_counter_info, solana_perf::thread::renice_this_thread, solana_poh::poh_recorder::PohRecorder, + solana_prometheus::{banks_with_commitments::BanksWithCommitments, render_prometheus}, solana_runtime::{ bank_forks::BankForks, commitment::BlockCommitmentCache, snapshot_archive_info::SnapshotArchiveInfoGetter, snapshot_config::SnapshotConfig, @@ -72,6 +73,8 @@ struct RpcRequestMiddleware { snapshot_config: Option, bank_forks: Arc>, health: Arc, + block_commitment_cache: Arc>, + vote_accounts_to_monitor: Arc>, } impl RpcRequestMiddleware { @@ -80,6 +83,8 @@ impl RpcRequestMiddleware { snapshot_config: Option, bank_forks: Arc>, health: Arc, + block_commitment_cache: Arc>, + vote_accounts_to_monitor: Arc>, ) -> Self { Self { ledger_path, @@ -94,6 +99,8 @@ impl RpcRequestMiddleware { snapshot_config, bank_forks, health, + 
block_commitment_cache, + vote_accounts_to_monitor, } } @@ -282,14 +289,29 @@ impl RequestMiddleware for RpcRequestMiddleware { .into() } else if self.is_file_get_path(request.uri().path()) { self.process_file_get(request.uri().path()) - } else if request.uri().path() == "/health" { - hyper::Response::builder() - .status(hyper::StatusCode::OK) - .body(hyper::Body::from(self.health_check())) - .unwrap() - .into() } else { - request.into() + match request.uri().path() { + "/health" => hyper::Response::builder() + .status(hyper::StatusCode::OK) + .body(hyper::Body::from(self.health_check())) + .unwrap() + .into(), + "/metrics" => { + let banks_with_commitment = + BanksWithCommitments::new(&self.bank_forks, &self.block_commitment_cache); + hyper::Response::builder() + .status(hyper::StatusCode::OK) + .header("Content-Type", "text/plain; version=0.0.4; charset=UTF-8") + .body(hyper::Body::from(render_prometheus( + banks_with_commitment, + &self.health.cluster_info, + &self.vote_accounts_to_monitor, + ))) + .unwrap() + .into() + } + _ => request.into(), + } } } } @@ -339,6 +361,7 @@ impl JsonRpcService { leader_schedule_cache: Arc, connection_cache: Arc, current_transaction_status_slot: Arc, + vote_accounts_to_monitor: Arc>, ) -> Self { info!("rpc bound to {:?}", rpc_addr); info!("rpc configuration: {:?}", config); @@ -431,7 +454,7 @@ impl JsonRpcService { config, snapshot_config.clone(), bank_forks.clone(), - block_commitment_cache, + block_commitment_cache.clone(), blockstore, validator_exit.clone(), health.clone(), @@ -486,6 +509,8 @@ impl JsonRpcService { snapshot_config, bank_forks.clone(), health.clone(), + block_commitment_cache.clone(), + vote_accounts_to_monitor, ); let server = ServerBuilder::with_meta_extractor( io, @@ -623,6 +648,7 @@ mod tests { Arc::new(LeaderScheduleCache::default()), connection_cache, Arc::new(AtomicU64::default()), + None, ); let thread = rpc_service.thread_hdl.thread(); assert_eq!(thread.name().unwrap(), "solana-jsonrpc"); @@ -662,17 +688,22 @@ mod tests { #[test] fn test_is_file_get_path() { let bank_forks = create_bank_forks(); + let block_commitment_cache = Arc::new(RwLock::new(BlockCommitmentCache::default())); let rrm = RpcRequestMiddleware::new( PathBuf::from("/"), None, bank_forks.clone(), RpcHealth::stub(), + block_commitment_cache, + None, ); let rrm_with_snapshot_config = RpcRequestMiddleware::new( PathBuf::from("/"), Some(SnapshotConfig::default()), bank_forks, RpcHealth::stub(), + block_commitment_cache, + None, ); assert!(rrm.is_file_get_path(DEFAULT_GENESIS_DOWNLOAD_PATH)); @@ -743,6 +774,8 @@ mod tests { None, create_bank_forks(), RpcHealth::stub(), + Arc::new(RwLock::new(BlockCommitmentCache::default())), + None, ); // File does not exist => request should fail. 
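The new /metrics branch above returns the Prometheus text exposition format, which is why the handler sets Content-Type: text/plain; version=0.0.4. As a rough, self-contained sketch of what one family of that output looks like, using only the solana-prometheus-utils API introduced in this patch (the slot numbers are made up, and the crate is assumed to be available as a path dependency):

    use solana_prometheus_utils::{write_metric, Metric, MetricFamily};

    fn main() -> std::io::Result<()> {
        let mut out: Vec<u8> = Vec::new();
        // Mirrors how for_each_commitment emits one labeled value per commitment level.
        write_metric(
            &mut out,
            &MetricFamily {
                name: "solana_block_slot",
                help: "Block Slot",
                type_: "gauge",
                metrics: vec![
                    Metric::new(1000u64).with_label("commitment_level", "finalized".to_owned()),
                    Metric::new(1003u64).with_label("commitment_level", "confirmed".to_owned()),
                    Metric::new(1004u64).with_label("commitment_level", "processed".to_owned()),
                ],
            },
        )?;
        // Prints:
        //   # HELP solana_block_slot Block Slot
        //   # TYPE solana_block_slot gauge
        //   solana_block_slot{commitment_level="finalized"} 1000
        //   solana_block_slot{commitment_level="confirmed"} 1003
        //   solana_block_slot{commitment_level="processed"} 1004
        //   (plus a trailing blank line)
        print!("{}", String::from_utf8(out).expect("metrics output is valid UTF-8"));
        Ok(())
    }

The real endpoint writes one such family per metric defined in bank_metrics.rs and cluster_metrics.rs, plus one auto-generated counter family per GossipStats field; the commitment_level label is what lets a dashboard select, say, only the finalized series.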
@@ -798,6 +831,8 @@ mod tests { None, create_bank_forks(), RpcHealth::stub(), + Arc::new(RwLock::new(BlockCommitmentCache::default())), + None, ); assert_eq!(rm.health_check(), "ok"); } @@ -824,7 +859,14 @@ mod tests { override_health_check.clone(), )); - let rm = RpcRequestMiddleware::new(PathBuf::from("/"), None, create_bank_forks(), health); + let rm = RpcRequestMiddleware::new( + PathBuf::from("/"), + None, + create_bank_forks(), + health, + Arc::new(RwLock::new(BlockCommitmentCache::default())), + None, + ); // No account hashes for this node or any known validators assert_eq!(rm.health_check(), "unknown"); diff --git a/test-validator/src/lib.rs b/test-validator/src/lib.rs index 6afc8f5df41e37..995337e198d7a1 100644 --- a/test-validator/src/lib.rs +++ b/test-validator/src/lib.rs @@ -709,6 +709,7 @@ impl TestValidator { max_ledger_shreds: config.max_ledger_shreds, no_wait_for_vote_to_start_leader: true, accounts_db_config, + vote_accounts_to_monitor: Arc::new(HashSet::from_iter(vec![vote_account_address])), ..ValidatorConfig::default_for_test() }; if let Some(ref tower_storage) = config.tower_storage { diff --git a/validator/src/main.rs b/validator/src/main.rs index 947a0a0e22a2ed..24db1ca7018a58 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -378,6 +378,23 @@ fn hardforks_of(matches: &ArgMatches<'_>, name: &str) -> Option> { } } +fn get_vote_accounts_to_monitor(matches: &ArgMatches<'_>) -> HashSet { + let vote_account = if matches.is_present("vote_account") { + vec![pubkey_of(&matches, "vote_account") + .expect("Does not fail, as this is validated by Clap earlier.")] + } else { + vec![] + }; + let mut monitor_vote_accounts = if matches.is_present("monitor_vote_account") { + let accounts = values_t_or_exit!(matches, "monitor_vote_account", Pubkey); + accounts.into_iter().collect::>() + } else { + HashSet::new() + }; + monitor_vote_accounts.extend(vote_account.iter()); + monitor_vote_accounts +} + fn validators_set( identity_pubkey: &Pubkey, matches: &ArgMatches<'_>, @@ -1873,6 +1890,17 @@ pub fn main() { .after_help("Note: If this command exits with a non-zero status \ then this not a good time for a restart") ) + .arg( + Arg::with_name("monitor_vote_account") + .long("monitor-vote-account") + .takes_value(true) + .value_name("PUBKEY") + .validator(is_pubkey) + .multiple(true) + .help("Additional vote accounts expose Prometheus metrics about. \ + The validator's own vote account is always included implicitly \ + if there is one.") + ) .get_matches(); let socket_addr_space = SocketAddrSpace::new(matches.is_present("allow_private_addr")); @@ -2591,6 +2619,8 @@ pub fn main() { Keypair::new().pubkey() }); + validator_config.vote_accounts_to_monitor = Arc::new(get_vote_accounts_to_monitor(&matches)); + let dynamic_port_range = solana_net_utils::parse_port_range(matches.value_of("dynamic_port_range").unwrap()) .expect("invalid dynamic_port_range");
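Outside the CLI path, vote_accounts_to_monitor is plainly an Arc<HashSet<Pubkey>> on ValidatorConfig, as the test-validator change above shows. A small sketch of building that set programmatically when embedding a validator; the helper name is hypothetical and the address is only a syntactically valid placeholder (it is the vote program id, not a real vote account):

    use solana_sdk::pubkey::Pubkey;
    use std::collections::HashSet;
    use std::str::FromStr;
    use std::sync::Arc;

    /// Build the set that ValidatorConfig::vote_accounts_to_monitor expects.
    fn monitor_set(vote_accounts: &[&str]) -> Result<Arc<HashSet<Pubkey>>, String> {
        let mut set = HashSet::new();
        for s in vote_accounts {
            let pubkey =
                Pubkey::from_str(s).map_err(|e| format!("invalid pubkey {}: {:?}", s, e))?;
            set.insert(pubkey);
        }
        Ok(Arc::new(set))
    }

    fn main() {
        let set = monitor_set(&["Vote111111111111111111111111111111111111111"])
            .expect("valid base58 pubkeys");
        assert_eq!(set.len(), 1);
        // An embedder would then do:
        // validator_config.vote_accounts_to_monitor = set;
    }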
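A note for readers of prometheus/macros/src/lib.rs: the derive does nothing more than unroll one write_metric call per named struct field. Below is a hand-written, behaviorally equivalent sketch for a hypothetical two-field struct; ExampleStats and the local Counter newtype are illustrative stand-ins (not part of the patch), and the newtype-over-AtomicU64 shape is an assumption read off the generated self.<field>.0.load(Ordering::Relaxed) access — the real Counter lives in gossip/src/cluster_info_metrics.rs.

    use std::sync::atomic::AtomicU64;

    // Stand-in for gossip's Counter: a newtype whose first field is an atomic,
    // which is what the generated `.0.load(..)` relies on.
    #[derive(Default)]
    pub struct Counter(AtomicU64);

    #[derive(Default)]
    pub struct ExampleStats {
        pub all_tvu_peers: Counter,
        pub bad_prune_destination: Counter,
    }

    // Hand-written equivalent of the impl that #[derive(ExportPrometheus)] generates
    // (the macro unrolls the calls instead of looping, but the output is the same).
    impl ExampleStats {
        pub fn write_prometheus<W: std::io::Write>(&self, out: &mut W) -> std::io::Result<()> {
            use core::sync::atomic::Ordering;
            for (name, counter) in [
                ("all_tvu_peers", &self.all_tvu_peers),
                ("bad_prune_destination", &self.bad_prune_destination),
            ] {
                solana_prometheus_utils::write_metric(
                    out,
                    &solana_prometheus_utils::MetricFamily {
                        name: &format!("solana_gossip_{}", name),
                        help: "Auto generated with Prometheus macro",
                        type_: "counter",
                        metrics: vec![solana_prometheus_utils::Metric::new(
                            counter.0.load(Ordering::Relaxed),
                        )],
                    },
                )?;
            }
            Ok(())
        }
    }

With GossipStats deriving this, the cluster_info.stats.write_prometheus(&mut out) call in prometheus/src/lib.rs exports these counters over /metrics; the patch comments out the old submit_gossip_stats reporting in cluster_info.rs.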
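For anyone extending prometheus/src/bank_metrics.rs later: for_each_commitment is the extension point for additional per-commitment gauges. A sketch of one possible addition follows; it is written as if it lived inside the solana-prometheus crate, and it assumes Bank::transaction_count() from solana-runtime is available — the metric name and helper are hypothetical, not part of this patch.

    use crate::banks_with_commitments::BanksWithCommitments;
    use solana_prometheus_utils::{write_metric, Metric, MetricFamily};
    use std::io;

    /// Hypothetical extra family, shaped like the ones in write_bank_metrics.
    pub fn write_transaction_count_metric<W: io::Write>(
        banks_with_commitments: &BanksWithCommitments,
        out: &mut W,
    ) -> io::Result<()> {
        write_metric(
            out,
            &MetricFamily {
                name: "solana_block_transactions_total",
                help: "Number of transactions processed since genesis",
                type_: "counter",
                metrics: banks_with_commitments
                    .for_each_commitment(|bank| Some(Metric::new(bank.transaction_count()))),
            },
        )
    }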
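The *_sol gauges (identity balance, vote account balance, active stake) all go through Metric::new_sol, which keeps the raw lamport count and lets MetricValue::Nano do the fixed-point division at render time. A minimal sketch of that rendering; the label value is a placeholder:

    use solana_prometheus_utils::{write_metric, Lamports, Metric, MetricFamily};

    fn main() -> std::io::Result<()> {
        let mut out: Vec<u8> = Vec::new();
        write_metric(
            &mut out,
            &MetricFamily {
                name: "solana_validator_active_stake_sol",
                help: "The total amount of Sol actively staked to this validator",
                type_: "gauge",
                // 1_234_567_890 lamports; Nano divides by 1e9 when writing the value.
                metrics: vec![Metric::new_sol(Lamports(1_234_567_890))
                    .with_label("vote_account", "placeholder vote account".to_owned())],
            },
        )?;
        let text = String::from_utf8(out).expect("metrics output is valid UTF-8");
        // The value line ends in a fixed-point SOL amount:
        //   solana_validator_active_stake_sol{vote_account="placeholder vote account"} 1.234567890
        assert!(text.contains(" 1.234567890\n"));
        Ok(())
    }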