From 8dd58dc4ae5faf353aff2dc822a6fe1b017ca2d3 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Tue, 29 Oct 2024 17:08:02 +0100 Subject: [PATCH 01/60] feat: add dummy node version heathcheck --- Cargo.lock | 2 + core/node/house_keeper/Cargo.toml | 2 + core/node/house_keeper/src/lib.rs | 1 + core/node/house_keeper/src/node_metadata.rs | 41 +++++++++++++++++++ .../implementations/layers/house_keeper.rs | 22 +++++++++- 5 files changed, 66 insertions(+), 2 deletions(-) create mode 100644 core/node/house_keeper/src/node_metadata.rs diff --git a/Cargo.lock b/Cargo.lock index 597da3c1b31b..bd25dd35d4c2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10383,11 +10383,13 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", + "serde", "tokio", "tracing", "vise", "zksync_config", "zksync_dal", + "zksync_health_check", "zksync_shared_metrics", "zksync_types", ] diff --git a/core/node/house_keeper/Cargo.toml b/core/node/house_keeper/Cargo.toml index b2ed3c14c20f..99a88306facc 100644 --- a/core/node/house_keeper/Cargo.toml +++ b/core/node/house_keeper/Cargo.toml @@ -11,8 +11,10 @@ keywords.workspace = true categories.workspace = true [dependencies] +serde.workspace = true vise.workspace = true zksync_dal.workspace = true +zksync_health_check.workspace = true zksync_shared_metrics.workspace = true zksync_types.workspace = true zksync_config.workspace = true diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 4e0d1962fc02..eb18e474990c 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,3 +1,4 @@ pub mod blocks_state_reporter; mod metrics; +pub mod node_metadata; pub mod periodic_job; diff --git a/core/node/house_keeper/src/node_metadata.rs b/core/node/house_keeper/src/node_metadata.rs new file mode 100644 index 000000000000..e3e201f25afd --- /dev/null +++ b/core/node/house_keeper/src/node_metadata.rs @@ -0,0 +1,41 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use 
zksync_health_check::{CheckHealth, Health, HealthStatus}; + +/// General information about the node. +#[derive(Debug, Serialize, Deserialize)] +pub struct NodeInfo { + pub version: String, +} + +/// Health details for a node. +#[derive(Debug, Serialize)] +#[serde(tag = "stage", rename_all = "snake_case")] +pub enum NodeHealth { + Initializing, + Running(NodeInfo), +} + +impl From for Health { + fn from(details: NodeHealth) -> Self { + let status = match &details { + NodeHealth::Initializing => HealthStatus::Affected, + NodeHealth::Running(_) => HealthStatus::Ready, + }; + Self::from(status).with_details(details) + } +} + +#[async_trait] +impl CheckHealth for NodeHealth { + fn name(&self) -> &'static str { + "node" + } + + async fn check_health(&self) -> Health { + NodeHealth::Running(NodeInfo { + version: "0.1.0-yolo".to_string(), + }) + .into() + } +} diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 1e2bc568d50f..2a977781e402 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -1,10 +1,17 @@ +use std::sync::Arc; + use zksync_config::configs::house_keeper::HouseKeeperConfig; use zksync_house_keeper::{ - blocks_state_reporter::L1BatchMetricsReporter, periodic_job::PeriodicJob, + blocks_state_reporter::L1BatchMetricsReporter, + node_metadata::{NodeHealth, NodeInfo}, + periodic_job::PeriodicJob, }; use crate::{ - implementations::resources::pools::{PoolResource, ReplicaPool}, + implementations::resources::{ + healthcheck::AppHealthCheckResource, + pools::{PoolResource, ReplicaPool}, + }, service::StopReceiver, task::{Task, TaskId}, wiring_layer::{WiringError, WiringLayer}, @@ -22,6 +29,8 @@ pub struct HouseKeeperLayer { #[context(crate = crate)] pub struct Input { pub replica_pool: PoolResource, + #[context(default)] + pub app_health: AppHealthCheckResource, 
} #[derive(Debug, IntoContext)] @@ -59,6 +68,15 @@ impl WiringLayer for HouseKeeperLayer { replica_pool.clone(), ); + let node_health = NodeHealth::Running(NodeInfo { + version: "".to_string(), + }); + + let app_health = input.app_health.0; + app_health + .insert_custom_component(Arc::new(node_health)) + .map_err(WiringError::internal)?; + Ok(Output { l1_batch_metrics_reporter, }) From 98167a13df5b666d27dec9a1478c149bd934b441 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 30 Oct 2024 13:30:29 +0100 Subject: [PATCH 02/60] feat: add version information to healthcheck --- Cargo.lock | 8 ++ Cargo.toml | 2 + core/lib/git_version_macro/Cargo.toml | 16 ++++ core/lib/git_version_macro/src/lib.rs | 81 +++++++++++++++++++ core/node/house_keeper/Cargo.toml | 1 + core/node/house_keeper/src/node_metadata.rs | 12 ++- .../implementations/layers/house_keeper.rs | 3 +- 7 files changed, 119 insertions(+), 4 deletions(-) create mode 100644 core/lib/git_version_macro/Cargo.toml create mode 100644 core/lib/git_version_macro/src/lib.rs diff --git a/Cargo.lock b/Cargo.lock index bd25dd35d4c2..1982463d9c27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10362,6 +10362,13 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "zksync_git_version_macro" +version = "0.1.0" +dependencies = [ + "chrono", +] + [[package]] name = "zksync_health_check" version = "0.1.0" @@ -10389,6 +10396,7 @@ dependencies = [ "vise", "zksync_config", "zksync_dal", + "zksync_git_version_macro", "zksync_health_check", "zksync_shared_metrics", "zksync_types", diff --git a/Cargo.toml b/Cargo.toml index 6d51e5060aa8..b8880db7b400 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,6 +53,7 @@ members = [ "core/lib/da_client", "core/lib/eth_client", "core/lib/eth_signer", + "core/lib/git_version_macro", "core/lib/l1_contract_interface", "core/lib/mempool", "core/lib/merkle_tree", @@ -262,6 +263,7 @@ zksync_eth_client = { version = "0.1.0", path = "core/lib/eth_client" } zksync_da_client = { version = "0.1.0", path 
= "core/lib/da_client" } zksync_eth_signer = { version = "0.1.0", path = "core/lib/eth_signer" } zksync_health_check = { version = "0.1.0", path = "core/lib/health_check" } +zksync_git_version_macro = { version = "0.1.0", path = "core/lib/git_version_macro" } zksync_l1_contract_interface = { version = "0.1.0", path = "core/lib/l1_contract_interface" } zksync_mempool = { version = "0.1.0", path = "core/lib/mempool" } zksync_merkle_tree = { version = "0.1.0", path = "core/lib/merkle_tree" } diff --git a/core/lib/git_version_macro/Cargo.toml b/core/lib/git_version_macro/Cargo.toml new file mode 100644 index 000000000000..461a72060042 --- /dev/null +++ b/core/lib/git_version_macro/Cargo.toml @@ -0,0 +1,16 @@ +[package] +name = "zksync_git_version_macro" +edition = "2021" +description = "Procedural macro to generate metainformation about build in compile time" +version.workspace = true +homepage.workspace = true +license.workspace = true +authors.workspace = true +repository.workspace = true +keywords.workspace = true + +[lib] +proc-macro = true + +[dependencies] +chrono.workspace = true diff --git a/core/lib/git_version_macro/src/lib.rs b/core/lib/git_version_macro/src/lib.rs new file mode 100644 index 000000000000..34b83efce195 --- /dev/null +++ b/core/lib/git_version_macro/src/lib.rs @@ -0,0 +1,81 @@ +extern crate proc_macro; +use std::{process::Command, str::FromStr}; + +use proc_macro::TokenStream; + +/// Outputs the current date and time as a string literal. +/// Can be used to include the build timestamp in the binary. +#[proc_macro] +pub fn build_timestamp(_item: TokenStream) -> TokenStream { + let now = chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(); + encode_as_str(&now) +} + +/// Outputs the current git branch as a string literal. 
+#[proc_macro] +pub fn build_git_branch(_item: TokenStream) -> TokenStream { + let out = run_cmd("git", &["rev-parse", "--abbrev-ref", "HEAD"]); + encode_as_str(&out) +} + +/// Outputs the current git commit hash as a string literal. +#[proc_macro] +pub fn build_git_revision(_item: TokenStream) -> TokenStream { + let out = run_cmd("git", &["rev-parse", "--short", "HEAD"]); + encode_as_str(&out) +} + +/// Creates a slice of `&[(&str, &str)]` tuples that correspond to +/// the submodule name -> revision. +/// Results in an empty list if there are no submodules or if +/// the command fails. +#[proc_macro] +pub fn build_git_submodules(_item: TokenStream) -> TokenStream { + let Some(out) = run_cmd_opt("git", &["submodule", "status"]) else { + return TokenStream::from_str("&[]").unwrap(); + }; + let submodules = out + .lines() + .filter_map(|line| { + let parts: Vec<&str> = line.split_whitespace().collect(); + // Index 0 is commit hash, index 1 is the path to the folder, and there + // may be some metainformation after that. + if parts.len() >= 2 { + let folder_name = parts[1].split('/').last().unwrap_or(parts[1]); + Some((folder_name, parts[0])) + } else { + None + } + }) + .collect::>(); + let submodules = submodules + .iter() + .map(|(name, rev)| format!("(\"{}\", \"{}\")", name, rev)) + .collect::>() + .join(", "); + TokenStream::from_str(format!("&[{}]", submodules).as_str()) + .unwrap_or_else(|_| panic!("Unable to encode submodules: {}", submodules)) +} + +/// Tries to run the command, only returns `Some` if the command +/// succeeded and the output was valid utf8. +fn run_cmd(cmd: &str, args: &[&str]) -> String { + run_cmd_opt(cmd, args).unwrap_or("unknown".to_string()) +} + +fn run_cmd_opt(cmd: &str, args: &[&str]) -> Option { + let output = Command::new(cmd).args(args).output().ok()?; + if output.status.success() { + String::from_utf8(output.stdout) + .ok() + .map(|s| s.trim().to_string()) + } else { + None + } +} + +/// Encodes string as a literal. 
+fn encode_as_str(s: &str) -> TokenStream { + TokenStream::from_str(format!("\"{}\"", s).as_str()) + .unwrap_or_else(|_| panic!("Unable to encode string: {}", s)) +} diff --git a/core/node/house_keeper/Cargo.toml b/core/node/house_keeper/Cargo.toml index 99a88306facc..f4ea38daa688 100644 --- a/core/node/house_keeper/Cargo.toml +++ b/core/node/house_keeper/Cargo.toml @@ -14,6 +14,7 @@ categories.workspace = true serde.workspace = true vise.workspace = true zksync_dal.workspace = true +zksync_git_version_macro.workspace = true zksync_health_check.workspace = true zksync_shared_metrics.workspace = true zksync_types.workspace = true diff --git a/core/node/house_keeper/src/node_metadata.rs b/core/node/house_keeper/src/node_metadata.rs index e3e201f25afd..bfea9ca17b61 100644 --- a/core/node/house_keeper/src/node_metadata.rs +++ b/core/node/house_keeper/src/node_metadata.rs @@ -2,16 +2,21 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; use zksync_health_check::{CheckHealth, Health, HealthStatus}; +const GIT_VERSION: &str = zksync_git_version_macro::build_git_revision!(); +const GIT_BRANCH: &str = zksync_git_version_macro::build_git_branch!(); + /// General information about the node. #[derive(Debug, Serialize, Deserialize)] pub struct NodeInfo { - pub version: String, + pub git_version: String, + pub git_branch: String, } /// Health details for a node. 
-#[derive(Debug, Serialize)] +#[derive(Debug, Default, Serialize)] #[serde(tag = "stage", rename_all = "snake_case")] pub enum NodeHealth { + #[default] Initializing, Running(NodeInfo), } @@ -34,7 +39,8 @@ impl CheckHealth for NodeHealth { async fn check_health(&self) -> Health { NodeHealth::Running(NodeInfo { - version: "0.1.0-yolo".to_string(), + git_version: GIT_VERSION.to_string(), + git_branch: GIT_BRANCH.to_string(), }) .into() } diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 2a977781e402..763841b0fd94 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -69,7 +69,8 @@ impl WiringLayer for HouseKeeperLayer { ); let node_health = NodeHealth::Running(NodeInfo { - version: "".to_string(), + git_version: "GIT_VERSION".to_string(), + git_branch: "GIT_BRANCH".to_string(), }); let app_health = input.app_health.0; From d08ecae9d506dc5e13b6ed62e53c6d4f20973c53 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 30 Oct 2024 14:02:45 +0100 Subject: [PATCH 03/60] refactor: simplify static health check --- core/node/house_keeper/src/lib.rs | 2 +- core/node/house_keeper/src/node_metadata.rs | 47 ------------------- core/node/house_keeper/src/version.rs | 39 +++++++++++++++ .../implementations/layers/house_keeper.rs | 12 ++--- 4 files changed, 43 insertions(+), 57 deletions(-) delete mode 100644 core/node/house_keeper/src/node_metadata.rs create mode 100644 core/node/house_keeper/src/version.rs diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index eb18e474990c..2326b0a6e2d2 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,4 +1,4 @@ pub mod blocks_state_reporter; mod metrics; -pub mod node_metadata; pub mod periodic_job; +pub mod version; diff --git 
a/core/node/house_keeper/src/node_metadata.rs b/core/node/house_keeper/src/node_metadata.rs deleted file mode 100644 index bfea9ca17b61..000000000000 --- a/core/node/house_keeper/src/node_metadata.rs +++ /dev/null @@ -1,47 +0,0 @@ -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use zksync_health_check::{CheckHealth, Health, HealthStatus}; - -const GIT_VERSION: &str = zksync_git_version_macro::build_git_revision!(); -const GIT_BRANCH: &str = zksync_git_version_macro::build_git_branch!(); - -/// General information about the node. -#[derive(Debug, Serialize, Deserialize)] -pub struct NodeInfo { - pub git_version: String, - pub git_branch: String, -} - -/// Health details for a node. -#[derive(Debug, Default, Serialize)] -#[serde(tag = "stage", rename_all = "snake_case")] -pub enum NodeHealth { - #[default] - Initializing, - Running(NodeInfo), -} - -impl From for Health { - fn from(details: NodeHealth) -> Self { - let status = match &details { - NodeHealth::Initializing => HealthStatus::Affected, - NodeHealth::Running(_) => HealthStatus::Ready, - }; - Self::from(status).with_details(details) - } -} - -#[async_trait] -impl CheckHealth for NodeHealth { - fn name(&self) -> &'static str { - "node" - } - - async fn check_health(&self) -> Health { - NodeHealth::Running(NodeInfo { - git_version: GIT_VERSION.to_string(), - git_branch: GIT_BRANCH.to_string(), - }) - .into() - } -} diff --git a/core/node/house_keeper/src/version.rs b/core/node/house_keeper/src/version.rs new file mode 100644 index 000000000000..a6cb330a6e75 --- /dev/null +++ b/core/node/house_keeper/src/version.rs @@ -0,0 +1,39 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use zksync_health_check::{CheckHealth, Health, HealthStatus}; + +const GIT_VERSION: &str = zksync_git_version_macro::build_git_revision!(); +const GIT_BRANCH: &str = zksync_git_version_macro::build_git_branch!(); + +/// This struct implements a static health check describing node's version 
information. +#[derive(Debug, Serialize, Deserialize)] +pub struct NodeVersionInfo { + git_version: String, + git_branch: String, +} + +impl Default for NodeVersionInfo { + fn default() -> Self { + Self { + git_version: GIT_VERSION.to_string(), + git_branch: GIT_BRANCH.to_string(), + } + } +} + +impl From<&NodeVersionInfo> for Health { + fn from(details: &NodeVersionInfo) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[async_trait] +impl CheckHealth for NodeVersionInfo { + fn name(&self) -> &'static str { + "version" + } + + async fn check_health(&self) -> Health { + self.into() + } +} diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 763841b0fd94..0f2f6f90a861 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -2,9 +2,8 @@ use std::sync::Arc; use zksync_config::configs::house_keeper::HouseKeeperConfig; use zksync_house_keeper::{ - blocks_state_reporter::L1BatchMetricsReporter, - node_metadata::{NodeHealth, NodeInfo}, - periodic_job::PeriodicJob, + blocks_state_reporter::L1BatchMetricsReporter, periodic_job::PeriodicJob, + version::NodeVersionInfo, }; use crate::{ @@ -68,14 +67,9 @@ impl WiringLayer for HouseKeeperLayer { replica_pool.clone(), ); - let node_health = NodeHealth::Running(NodeInfo { - git_version: "GIT_VERSION".to_string(), - git_branch: "GIT_BRANCH".to_string(), - }); - let app_health = input.app_health.0; app_health - .insert_custom_component(Arc::new(node_health)) + .insert_custom_component(Arc::new(NodeVersionInfo::default())) .map_err(WiringError::internal)?; Ok(Output { From a05838843ba500fcea8047460037d66a3c6f121f Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 30 Oct 2024 15:28:03 +0100 Subject: [PATCH 04/60] feat: add last migration to system_dal --- ...57124dfd63c26f107c927cb9be30fd7a970a8.json 
| 50 +++++++++++++++++++ core/lib/dal/src/system_dal.rs | 16 ++++++ 2 files changed, 66 insertions(+) create mode 100644 core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json diff --git a/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json b/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json new file mode 100644 index 000000000000..df0824ff4a95 --- /dev/null +++ b/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json @@ -0,0 +1,50 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT *\n FROM _sqlx_migrations\n ORDER BY version DESC\n LIMIT 1;\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "version", + "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "installed_on", + "type_info": "Timestamptz" + }, + { + "ordinal": 3, + "name": "success", + "type_info": "Bool" + }, + { + "ordinal": 4, + "name": "checksum", + "type_info": "Bytea" + }, + { + "ordinal": 5, + "name": "execution_time", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + false, + false, + false, + false, + false, + false + ] + }, + "hash": "c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8" +} diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index 105665fa2ec6..715548f9b17d 100644 --- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -86,4 +86,20 @@ impl SystemDal<'_, '_> { }); Ok(table_sizes.collect()) } + + pub async fn get_last_migration(&mut self) -> DalResult { + let row = sqlx::query!( + r#" + SELECT * + FROM _sqlx_migrations + ORDER BY version DESC + LIMIT 1; + "# + ) + .instrument("get_last_migration") + .fetch_one(self.storage) + .await?; + + Ok(row.version) + } } From a5f571bad6c25eeac7b665ca550c37d76c4dc7b2 Mon Sep 17 00:00:00 2001 
From: Manuel Mauro Date: Thu, 31 Oct 2024 10:32:42 +0100 Subject: [PATCH 05/60] feat: add database healthcheck --- core/node/house_keeper/src/database.rs | 46 +++++++++++++++++++ core/node/house_keeper/src/lib.rs | 1 + .../implementations/layers/house_keeper.rs | 37 +++++++++++++-- 3 files changed, 81 insertions(+), 3 deletions(-) create mode 100644 core/node/house_keeper/src/database.rs diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs new file mode 100644 index 000000000000..2b12a5068609 --- /dev/null +++ b/core/node/house_keeper/src/database.rs @@ -0,0 +1,46 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater}; + +use crate::periodic_job::PeriodicJob; + +/// This struct implements a static health check describing node's version information. +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseInfo { + last_migration: i64, +} + +impl From for Health { + fn from(details: DatabaseInfo) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[derive(Debug)] +pub struct DatabaseHealthTask { + pub connection_pool: ConnectionPool, + pub database_health_updater: HealthUpdater, +} + +impl DatabaseHealthTask { + pub const POLLING_INTERVAL_MS: u64 = 10_000; +} + +#[async_trait] +impl PeriodicJob for DatabaseHealthTask { + const SERVICE_NAME: &'static str = "L1BatchMetricsReporter"; + + async fn run_routine_task(&mut self) -> anyhow::Result<()> { + let mut conn = self.connection_pool.connection().await.unwrap(); + let last_migration = conn.system_dal().get_last_migration().await.unwrap(); + + self.database_health_updater + .update(DatabaseInfo { last_migration }.into()); + Ok(()) + } + + fn polling_interval_ms(&self) -> u64 { + Self::POLLING_INTERVAL_MS + } +} diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 
2326b0a6e2d2..3401151c24c3 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,4 +1,5 @@ pub mod blocks_state_reporter; +pub mod database; mod metrics; pub mod periodic_job; pub mod version; diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 0f2f6f90a861..d1e3ab76d1f0 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -1,9 +1,10 @@ use std::sync::Arc; use zksync_config::configs::house_keeper::HouseKeeperConfig; +use zksync_health_check::ReactiveHealthCheck; use zksync_house_keeper::{ - blocks_state_reporter::L1BatchMetricsReporter, periodic_job::PeriodicJob, - version::NodeVersionInfo, + blocks_state_reporter::L1BatchMetricsReporter, database::DatabaseHealthTask, + periodic_job::PeriodicJob, version::NodeVersionInfo, }; use crate::{ @@ -12,7 +13,7 @@ use crate::{ pools::{PoolResource, ReplicaPool}, }, service::StopReceiver, - task::{Task, TaskId}, + task::{Task, TaskId, TaskKind}, wiring_layer::{WiringError, WiringLayer}, FromContext, IntoContext, }; @@ -37,6 +38,8 @@ pub struct Input { pub struct Output { #[context(task)] pub l1_batch_metrics_reporter: L1BatchMetricsReporter, + #[context(task)] + pub database_health_task: DatabaseHealthTask, } impl HouseKeeperLayer { @@ -72,8 +75,21 @@ impl WiringLayer for HouseKeeperLayer { .insert_custom_component(Arc::new(NodeVersionInfo::default())) .map_err(WiringError::internal)?; + let (database_health_check, database_health_updater) = + ReactiveHealthCheck::new("database_health"); + + app_health + .insert_component(database_health_check) + .map_err(WiringError::internal)?; + + let database_health_task = DatabaseHealthTask { + connection_pool: replica_pool.clone(), + database_health_updater, + }; + Ok(Output { l1_batch_metrics_reporter, + database_health_task, }) } } @@ 
-88,3 +104,18 @@ impl Task for L1BatchMetricsReporter { (*self).run(stop_receiver.0).await } } + +#[async_trait::async_trait] +impl Task for DatabaseHealthTask { + fn kind(&self) -> TaskKind { + TaskKind::UnconstrainedTask + } + + fn id(&self) -> TaskId { + "database_health".into() + } + + async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { + (*self).run(stop_receiver.0).await + } +} From a5db49a76aa89d0d3f535135fcb9afc74335ec73 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 11:06:52 +0100 Subject: [PATCH 06/60] feat: add more information to database heathcheck --- core/lib/dal/src/system_dal.rs | 24 ++++++++++++++++++++++-- core/node/house_keeper/src/database.rs | 4 ++-- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index 715548f9b17d..b221d619779b 100644 --- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -1,5 +1,7 @@ use std::{collections::HashMap, time::Duration}; +use chrono::DateTime; +use serde::{Deserialize, Serialize}; use zksync_db_connection::{connection::Connection, error::DalResult, instrument::InstrumentExt}; use crate::Core; @@ -12,6 +14,16 @@ pub(crate) struct TableSize { pub total_size: u64, } +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseMigration { + pub version: i64, + pub description: String, + pub installed_on: DateTime, + pub success: bool, + pub checksum: String, + pub execution_time: i64, +} + #[derive(Debug)] pub struct SystemDal<'a, 'c> { pub(crate) storage: &'a mut Connection<'c, Core>, @@ -87,7 +99,7 @@ impl SystemDal<'_, '_> { Ok(table_sizes.collect()) } - pub async fn get_last_migration(&mut self) -> DalResult { + pub async fn get_last_migration(&mut self) -> DalResult { let row = sqlx::query!( r#" SELECT * @@ -100,6 +112,14 @@ impl SystemDal<'_, '_> { .fetch_one(self.storage) .await?; - Ok(row.version) + Ok(DatabaseMigration { + version: row.version, + description: 
row.description, + installed_on: row.installed_on, + success: row.success, + // TODO improve presentation + checksum: String::from_utf8_lossy(row.checksum.as_ref()).to_string(), + execution_time: row.execution_time, + }) } } diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs index 2b12a5068609..26ed92208afc 100644 --- a/core/node/house_keeper/src/database.rs +++ b/core/node/house_keeper/src/database.rs @@ -1,6 +1,6 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; -use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_dal::{system_dal::DatabaseMigration, ConnectionPool, Core, CoreDal}; use zksync_health_check::{Health, HealthStatus, HealthUpdater}; use crate::periodic_job::PeriodicJob; @@ -8,7 +8,7 @@ use crate::periodic_job::PeriodicJob; /// This struct implements a static health check describing node's version information. #[derive(Debug, Serialize, Deserialize)] pub struct DatabaseInfo { - last_migration: i64, + last_migration: DatabaseMigration, } impl From for Health { From b57027c0a8fe87fc46f2a15ba6e91296c88f5f2e Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 11:08:05 +0100 Subject: [PATCH 07/60] style: format code --- core/lib/dal/src/system_dal.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index b221d619779b..a3af14f90fd3 100644 --- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -102,10 +102,10 @@ impl SystemDal<'_, '_> { pub async fn get_last_migration(&mut self) -> DalResult { let row = sqlx::query!( r#" - SELECT * - FROM _sqlx_migrations - ORDER BY version DESC - LIMIT 1; + SELECT * + FROM _sqlx_migrations + ORDER BY version DESC + LIMIT 1; "# ) .instrument("get_last_migration") From 5d23bd66a0fdf43c3175ec0eee89a7e9e3701088 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 11:10:49 +0100 Subject: [PATCH 08/60] chore: prepare sqlx 
queries --- ...5c0ad2579709ae355234811e09fdf5c40e3d2.json | 50 +++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json diff --git a/core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json b/core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json new file mode 100644 index 000000000000..d9d6b56bac8a --- /dev/null +++ b/core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json @@ -0,0 +1,50 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT *\n FROM _sqlx_migrations\n ORDER BY version DESC\n LIMIT 1;\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "version", + "type_info": "Int8" + }, + { + "ordinal": 1, + "name": "description", + "type_info": "Text" + }, + { + "ordinal": 2, + "name": "installed_on", + "type_info": "Timestamptz" + }, + { + "ordinal": 3, + "name": "success", + "type_info": "Bool" + }, + { + "ordinal": 4, + "name": "checksum", + "type_info": "Bytea" + }, + { + "ordinal": 5, + "name": "execution_time", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + false, + false, + false, + false, + false, + false + ] + }, + "hash": "ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2" +} From 959542bdae58f2bff60a46ba272b428c30e02e4f Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 11:11:20 +0100 Subject: [PATCH 09/60] fix: remove outdated query file --- ...57124dfd63c26f107c927cb9be30fd7a970a8.json | 50 ------------------- 1 file changed, 50 deletions(-) delete mode 100644 core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json diff --git a/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json 
b/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json deleted file mode 100644 index df0824ff4a95..000000000000 --- a/core/lib/dal/.sqlx/query-c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "\n SELECT *\n FROM _sqlx_migrations\n ORDER BY version DESC\n LIMIT 1;\n ", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "version", - "type_info": "Int8" - }, - { - "ordinal": 1, - "name": "description", - "type_info": "Text" - }, - { - "ordinal": 2, - "name": "installed_on", - "type_info": "Timestamptz" - }, - { - "ordinal": 3, - "name": "success", - "type_info": "Bool" - }, - { - "ordinal": 4, - "name": "checksum", - "type_info": "Bytea" - }, - { - "ordinal": 5, - "name": "execution_time", - "type_info": "Int8" - } - ], - "parameters": { - "Left": [] - }, - "nullable": [ - false, - false, - false, - false, - false, - false - ] - }, - "hash": "c7b7eebed15f002a45b2dfaef3357124dfd63c26f107c927cb9be30fd7a970a8" -} From 28ecc96c1c92adde1d37988cb2e7e4db0c3bdc79 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 16:25:14 +0100 Subject: [PATCH 10/60] feat: improve bytes encoding in healthcheck --- core/lib/dal/src/system_dal.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index a3af14f90fd3..f54d5d13ce71 100644 --- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -117,8 +117,7 @@ impl SystemDal<'_, '_> { description: row.description, installed_on: row.installed_on, success: row.success, - // TODO improve presentation - checksum: String::from_utf8_lossy(row.checksum.as_ref()).to_string(), + checksum: hex::encode(row.checksum), execution_time: row.execution_time, }) } From de3a5e1ad68bb7719d95ddfc3d94101300aade26 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 18:12:05 +0100 Subject: [PATCH 
11/60] feat: add dummy health check tasks for state keeper and eth sender --- core/node/house_keeper/src/database.rs | 3 +- core/node/house_keeper/src/eth_sender.rs | 65 ++++++++++++++++++ core/node/house_keeper/src/lib.rs | 2 + core/node/house_keeper/src/state_keeper.rs | 53 +++++++++++++++ .../implementations/layers/house_keeper.rs | 66 ++++++++++++++++++- 5 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 core/node/house_keeper/src/eth_sender.rs create mode 100644 core/node/house_keeper/src/state_keeper.rs diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs index 26ed92208afc..8fe7b78b890f 100644 --- a/core/node/house_keeper/src/database.rs +++ b/core/node/house_keeper/src/database.rs @@ -5,7 +5,6 @@ use zksync_health_check::{Health, HealthStatus, HealthUpdater}; use crate::periodic_job::PeriodicJob; -/// This struct implements a static health check describing node's version information. #[derive(Debug, Serialize, Deserialize)] pub struct DatabaseInfo { last_migration: DatabaseMigration, @@ -29,7 +28,7 @@ impl DatabaseHealthTask { #[async_trait] impl PeriodicJob for DatabaseHealthTask { - const SERVICE_NAME: &'static str = "L1BatchMetricsReporter"; + const SERVICE_NAME: &'static str = "DatabaseHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await.unwrap(); diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs new file mode 100644 index 000000000000..19c58d6973eb --- /dev/null +++ b/core/node/house_keeper/src/eth_sender.rs @@ -0,0 +1,65 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater}; + +use crate::periodic_job::PeriodicJob; + +#[derive(Debug, Serialize, Deserialize)] +pub struct EthSenderInfo { + failed_l1_txns: Option<()>, + last_created_commit_batch: 
Option<()>, + last_created_prove_batch: Option<()>, + last_created_execute_batch: Option<()>, + last_executed_commit_batch: Option<()>, + last_executed_prove_batch: Option<()>, + last_executed_execute_batch: Option<()>, + current_nonce: Option<()>, + latest_operator_nonce: Option<()>, +} + +impl From for Health { + fn from(details: EthSenderInfo) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[derive(Debug)] +pub struct EthSenderHealthTask { + pub connection_pool: ConnectionPool, + pub eth_sender_health_updater: HealthUpdater, +} + +impl EthSenderHealthTask { + pub const POLLING_INTERVAL_MS: u64 = 10_000; +} + +#[async_trait] +impl PeriodicJob for EthSenderHealthTask { + const SERVICE_NAME: &'static str = "EthSenderHealth"; + + async fn run_routine_task(&mut self) -> anyhow::Result<()> { + let mut conn = self.connection_pool.connection().await.unwrap(); + let _last_migration = conn.system_dal().get_last_migration().await.unwrap(); + + self.eth_sender_health_updater.update( + EthSenderInfo { + failed_l1_txns: None, + last_created_commit_batch: None, + last_created_prove_batch: None, + last_created_execute_batch: None, + last_executed_commit_batch: None, + last_executed_prove_batch: None, + last_executed_execute_batch: None, + current_nonce: None, + latest_operator_nonce: None, + } + .into(), + ); + Ok(()) + } + + fn polling_interval_ms(&self) -> u64 { + Self::POLLING_INTERVAL_MS + } +} diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 3401151c24c3..8011716a86ce 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,5 +1,7 @@ pub mod blocks_state_reporter; pub mod database; +pub mod eth_sender; mod metrics; pub mod periodic_job; +pub mod state_keeper; pub mod version; diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs new file mode 100644 index 000000000000..8a255e967118 --- /dev/null +++ 
b/core/node/house_keeper/src/state_keeper.rs @@ -0,0 +1,53 @@ +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; +use zksync_dal::{ConnectionPool, Core, CoreDal}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater}; + +use crate::periodic_job::PeriodicJob; + +#[derive(Debug, Serialize, Deserialize)] +pub struct StateKeeperInfo { + last_miniblock_protocol_upgrade: Option<()>, + last_miniblock: Option<()>, + batch_number: Option<()>, +} + +impl From for Health { + fn from(details: StateKeeperInfo) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[derive(Debug)] +pub struct StateKeeperHealthTask { + pub connection_pool: ConnectionPool, + pub state_keeper_health_updater: HealthUpdater, +} + +impl StateKeeperHealthTask { + pub const POLLING_INTERVAL_MS: u64 = 10_000; +} + +#[async_trait] +impl PeriodicJob for StateKeeperHealthTask { + const SERVICE_NAME: &'static str = "StateKeeperHealth"; + + async fn run_routine_task(&mut self) -> anyhow::Result<()> { + let mut conn = self.connection_pool.connection().await.unwrap(); + let _last_migration = conn.system_dal().get_last_migration().await.unwrap(); + + self.state_keeper_health_updater.update( + StateKeeperInfo { + last_miniblock_protocol_upgrade: None, + last_miniblock: None, + batch_number: None, + } + .into(), + ); + Ok(()) + } + + fn polling_interval_ms(&self) -> u64 { + Self::POLLING_INTERVAL_MS + } +} diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index d1e3ab76d1f0..e48d47972280 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -4,7 +4,8 @@ use zksync_config::configs::house_keeper::HouseKeeperConfig; use zksync_health_check::ReactiveHealthCheck; use zksync_house_keeper::{ blocks_state_reporter::L1BatchMetricsReporter, database::DatabaseHealthTask, - 
periodic_job::PeriodicJob, version::NodeVersionInfo, + eth_sender::EthSenderHealthTask, periodic_job::PeriodicJob, + state_keeper::StateKeeperHealthTask, version::NodeVersionInfo, }; use crate::{ @@ -40,6 +41,10 @@ pub struct Output { pub l1_batch_metrics_reporter: L1BatchMetricsReporter, #[context(task)] pub database_health_task: DatabaseHealthTask, + #[context(task)] + pub eth_sender_health_task: EthSenderHealthTask, + #[context(task)] + pub state_keeper_health_task: StateKeeperHealthTask, } impl HouseKeeperLayer { @@ -75,8 +80,7 @@ impl WiringLayer for HouseKeeperLayer { .insert_custom_component(Arc::new(NodeVersionInfo::default())) .map_err(WiringError::internal)?; - let (database_health_check, database_health_updater) = - ReactiveHealthCheck::new("database_health"); + let (database_health_check, database_health_updater) = ReactiveHealthCheck::new("database"); app_health .insert_component(database_health_check) @@ -87,9 +91,35 @@ impl WiringLayer for HouseKeeperLayer { database_health_updater, }; + let (eth_sender_health_check, eth_sender_health_updater) = + ReactiveHealthCheck::new("eth_sender"); + + app_health + .insert_component(eth_sender_health_check) + .map_err(WiringError::internal)?; + + let eth_sender_health_task = EthSenderHealthTask { + connection_pool: replica_pool.clone(), + eth_sender_health_updater, + }; + + let (state_keeper_health_check, state_keeper_health_updater) = + ReactiveHealthCheck::new("state_keeper"); + + app_health + .insert_component(state_keeper_health_check) + .map_err(WiringError::internal)?; + + let state_keeper_health_task = StateKeeperHealthTask { + connection_pool: replica_pool.clone(), + state_keeper_health_updater, + }; + Ok(Output { l1_batch_metrics_reporter, database_health_task, + eth_sender_health_task, + state_keeper_health_task, }) } } @@ -119,3 +149,33 @@ impl Task for DatabaseHealthTask { (*self).run(stop_receiver.0).await } } + +#[async_trait::async_trait] +impl Task for EthSenderHealthTask { + fn kind(&self) -> 
TaskKind { + TaskKind::UnconstrainedTask + } + + fn id(&self) -> TaskId { + "eth_sender_health".into() + } + + async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { + (*self).run(stop_receiver.0).await + } +} + +#[async_trait::async_trait] +impl Task for StateKeeperHealthTask { + fn kind(&self) -> TaskKind { + TaskKind::UnconstrainedTask + } + + fn id(&self) -> TaskId { + "state_keeper_health".into() + } + + async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { + (*self).run(stop_receiver.0).await + } +} From 582754b9f49b9444f0c66e150d2664df52e28f8a Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 18:30:46 +0100 Subject: [PATCH 12/60] fix: do not unwrap --- core/node/house_keeper/src/database.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs index 8fe7b78b890f..0ee91d9dd8af 100644 --- a/core/node/house_keeper/src/database.rs +++ b/core/node/house_keeper/src/database.rs @@ -31,8 +31,8 @@ impl PeriodicJob for DatabaseHealthTask { const SERVICE_NAME: &'static str = "DatabaseHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self.connection_pool.connection().await.unwrap(); - let last_migration = conn.system_dal().get_last_migration().await.unwrap(); + let mut conn = self.connection_pool.connection().await?; + let last_migration = conn.system_dal().get_last_migration().await?; self.database_health_updater .update(DatabaseInfo { last_migration }.into()); From 1b8e50ece51f374081d346ad309c10c2783aaa3e Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 31 Oct 2024 18:31:13 +0100 Subject: [PATCH 13/60] feat: retrieve failed L1 transactions and next operator nonce --- core/node/house_keeper/src/eth_sender.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/core/node/house_keeper/src/eth_sender.rs 
b/core/node/house_keeper/src/eth_sender.rs index 19c58d6973eb..fc7277eb3d5d 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -7,15 +7,14 @@ use crate::periodic_job::PeriodicJob; #[derive(Debug, Serialize, Deserialize)] pub struct EthSenderInfo { - failed_l1_txns: Option<()>, + failed_l1_txns: i64, last_created_commit_batch: Option<()>, last_created_prove_batch: Option<()>, last_created_execute_batch: Option<()>, last_executed_commit_batch: Option<()>, last_executed_prove_batch: Option<()>, last_executed_execute_batch: Option<()>, - current_nonce: Option<()>, - latest_operator_nonce: Option<()>, + next_nonce: Option, } impl From for Health { @@ -39,20 +38,25 @@ impl PeriodicJob for EthSenderHealthTask { const SERVICE_NAME: &'static str = "EthSenderHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self.connection_pool.connection().await.unwrap(); - let _last_migration = conn.system_dal().get_last_migration().await.unwrap(); + let mut conn = self.connection_pool.connection().await?; + let failed_l1_txns = conn + .eth_sender_dal() + .get_number_of_failed_transactions() + .await?; + + // TODO retrieve SettlementMode from config + let next_nonce = conn.eth_sender_dal().get_next_nonce(None, false).await?; self.eth_sender_health_updater.update( EthSenderInfo { - failed_l1_txns: None, + failed_l1_txns, last_created_commit_batch: None, last_created_prove_batch: None, last_created_execute_batch: None, last_executed_commit_batch: None, last_executed_prove_batch: None, last_executed_execute_batch: None, - current_nonce: None, - latest_operator_nonce: None, + next_nonce, } .into(), ); From 68210aac1fb157892a7696dbb899dc25e464b06e Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 11:25:47 +0100 Subject: [PATCH 14/60] feat: add information on last saved/mined batches to healthcheck --- core/node/house_keeper/src/eth_sender.rs | 48 ++++++++++++++++++------ 1 file changed, 36 
insertions(+), 12 deletions(-) diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index fc7277eb3d5d..d6fd7f42cabf 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -1,19 +1,26 @@ +use std::cmp::max; + +use anyhow::Ok; use async_trait::async_trait; use serde::{Deserialize, Serialize}; use zksync_dal::{ConnectionPool, Core, CoreDal}; use zksync_health_check::{Health, HealthStatus, HealthUpdater}; +use zksync_types::{aggregated_operations::AggregatedActionType, L1BatchNumber}; use crate::periodic_job::PeriodicJob; +#[derive(Debug, Serialize, Deserialize)] +struct LastBatchIndex { + commit: Option, + prove: Option, + execute: Option, +} + #[derive(Debug, Serialize, Deserialize)] pub struct EthSenderInfo { failed_l1_txns: i64, - last_created_commit_batch: Option<()>, - last_created_prove_batch: Option<()>, - last_created_execute_batch: Option<()>, - last_executed_commit_batch: Option<()>, - last_executed_prove_batch: Option<()>, - last_executed_execute_batch: Option<()>, + last_saved_batches: LastBatchIndex, + last_mined_batches: LastBatchIndex, next_nonce: Option, } @@ -44,18 +51,16 @@ impl PeriodicJob for EthSenderHealthTask { .get_number_of_failed_transactions() .await?; + let eth_stats = conn.eth_sender_dal().get_eth_l1_batches().await?; + // TODO retrieve SettlementMode from config let next_nonce = conn.eth_sender_dal().get_next_nonce(None, false).await?; self.eth_sender_health_updater.update( EthSenderInfo { failed_l1_txns, - last_created_commit_batch: None, - last_created_prove_batch: None, - last_created_execute_batch: None, - last_executed_commit_batch: None, - last_executed_prove_batch: None, - last_executed_execute_batch: None, + last_saved_batches: get_latest_batches(eth_stats.saved), + last_mined_batches: get_latest_batches(eth_stats.mined), next_nonce, } .into(), @@ -67,3 +72,22 @@ impl PeriodicJob for EthSenderHealthTask { Self::POLLING_INTERVAL_MS } } + 
+fn get_latest_batches(batches: Vec<(AggregatedActionType, L1BatchNumber)>) -> LastBatchIndex { + let (commit_batch, prove_batch, execute_batch) = batches.into_iter().fold( + (None, None, None), + |(commit, prove, execute), (action_type, batch_number)| match action_type { + AggregatedActionType::Commit => (max(commit, Some(batch_number)), prove, execute), + AggregatedActionType::PublishProofOnchain => { + (commit, max(prove, Some(batch_number)), execute) + } + AggregatedActionType::Execute => (commit, prove, max(execute, Some(batch_number))), + }, + ); + + LastBatchIndex { + commit: commit_batch, + prove: prove_batch, + execute: execute_batch, + } +} From 2fc3da9ff690800ec5be58c443ccd4e5a6c01ded Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 12:19:32 +0100 Subject: [PATCH 15/60] feat: get last miniblock number from DB --- ...858621472b2964e6cb12338a739398f6e311d.json | 20 ++++++++++++++++ core/lib/dal/src/blocks_web3_dal.rs | 24 +++++++++++++++++++ core/node/house_keeper/src/state_keeper.rs | 11 ++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json diff --git a/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json b/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json new file mode 100644 index 000000000000..4f4df7c32522 --- /dev/null +++ b/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json @@ -0,0 +1,20 @@ +{ + "db_name": "PostgreSQL", + "query": "\n SELECT max(number) FROM miniblocks\n ", + "describe": { + "columns": [ + { + "ordinal": 0, + "name": "max", + "type_info": "Int8" + } + ], + "parameters": { + "Left": [] + }, + "nullable": [ + null + ] + }, + "hash": "99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d" +} diff --git a/core/lib/dal/src/blocks_web3_dal.rs 
b/core/lib/dal/src/blocks_web3_dal.rs index 4cb577986380..6ca861ea8997 100644 --- a/core/lib/dal/src/blocks_web3_dal.rs +++ b/core/lib/dal/src/blocks_web3_dal.rs @@ -32,6 +32,30 @@ pub struct BlocksWeb3Dal<'a, 'c> { } impl BlocksWeb3Dal<'_, '_> { + pub async fn get_last_miniblock_number(&mut self) -> DalResult> { + let record = sqlx::query!( + r#" + SELECT max(number) FROM miniblocks + "# + ) + .instrument("get_last_miniblock_number") + .fetch_one(self.storage) + .await?; + + // the database stores the miniblock numbers as i64, so we could safely unwrap here. + // Instead, we log the error to catch inconsistent states of the database. + let last_miniblock = record.max.and_then(|n| { + u32::try_from(n) + .map_err(|e| { + tracing::error!("Failed to convert i64 to u32 for miniblock number: {}", e); + e + }) + .ok() + }); + + Ok(last_miniblock.map(L2BlockNumber::from)) + } + pub async fn get_api_block( &mut self, block_number: L2BlockNumber, diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs index 8a255e967118..f3f8d8434fcd 100644 --- a/core/node/house_keeper/src/state_keeper.rs +++ b/core/node/house_keeper/src/state_keeper.rs @@ -2,13 +2,14 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; use zksync_dal::{ConnectionPool, Core, CoreDal}; use zksync_health_check::{Health, HealthStatus, HealthUpdater}; +use zksync_types::L2BlockNumber; use crate::periodic_job::PeriodicJob; #[derive(Debug, Serialize, Deserialize)] pub struct StateKeeperInfo { last_miniblock_protocol_upgrade: Option<()>, - last_miniblock: Option<()>, + last_miniblock: Option, batch_number: Option<()>, } @@ -34,12 +35,16 @@ impl PeriodicJob for StateKeeperHealthTask { async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await.unwrap(); - let _last_migration = conn.system_dal().get_last_migration().await.unwrap(); + let last_miniblock = conn + .blocks_web3_dal() + 
.get_last_miniblock_number() + .await + .unwrap(); self.state_keeper_health_updater.update( StateKeeperInfo { last_miniblock_protocol_upgrade: None, - last_miniblock: None, + last_miniblock, batch_number: None, } .into(), From 098e54fa5c97f624328138f35901503959f1bd64 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 12:44:32 +0100 Subject: [PATCH 16/60] feat: add protocol version information to healthcheck --- ...858621472b2964e6cb12338a739398f6e311d.json | 20 ------------ core/lib/dal/src/blocks_web3_dal.rs | 24 -------------- core/node/house_keeper/src/state_keeper.rs | 31 +++++++++++++------ 3 files changed, 21 insertions(+), 54 deletions(-) delete mode 100644 core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json diff --git a/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json b/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json deleted file mode 100644 index 4f4df7c32522..000000000000 --- a/core/lib/dal/.sqlx/query-99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "db_name": "PostgreSQL", - "query": "\n SELECT max(number) FROM miniblocks\n ", - "describe": { - "columns": [ - { - "ordinal": 0, - "name": "max", - "type_info": "Int8" - } - ], - "parameters": { - "Left": [] - }, - "nullable": [ - null - ] - }, - "hash": "99bddc8a045abaae298fc95ba9c858621472b2964e6cb12338a739398f6e311d" -} diff --git a/core/lib/dal/src/blocks_web3_dal.rs b/core/lib/dal/src/blocks_web3_dal.rs index 6ca861ea8997..4cb577986380 100644 --- a/core/lib/dal/src/blocks_web3_dal.rs +++ b/core/lib/dal/src/blocks_web3_dal.rs @@ -32,30 +32,6 @@ pub struct BlocksWeb3Dal<'a, 'c> { } impl BlocksWeb3Dal<'_, '_> { - pub async fn get_last_miniblock_number(&mut self) -> DalResult> { - let record = sqlx::query!( - r#" - SELECT max(number) FROM miniblocks - "# - ) - .instrument("get_last_miniblock_number") - 
.fetch_one(self.storage) - .await?; - - // the database stores the miniblock numbers as i64, so we could safely unwrap here. - // Instead, we log the error to catch inconsistent states of the database. - let last_miniblock = record.max.and_then(|n| { - u32::try_from(n) - .map_err(|e| { - tracing::error!("Failed to convert i64 to u32 for miniblock number: {}", e); - e - }) - .ok() - }); - - Ok(last_miniblock.map(L2BlockNumber::from)) - } - pub async fn get_api_block( &mut self, block_number: L2BlockNumber, diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs index f3f8d8434fcd..50fe2da04adb 100644 --- a/core/node/house_keeper/src/state_keeper.rs +++ b/core/node/house_keeper/src/state_keeper.rs @@ -2,14 +2,30 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; use zksync_dal::{ConnectionPool, Core, CoreDal}; use zksync_health_check::{Health, HealthStatus, HealthUpdater}; -use zksync_types::L2BlockNumber; +use zksync_types::{block::L2BlockHeader, L2BlockNumber, ProtocolVersionId}; use crate::periodic_job::PeriodicJob; +#[derive(Debug, Serialize, Deserialize)] +pub struct L2BlockHeaderInfo { + pub number: L2BlockNumber, + pub timestamp: u64, + pub protocol_version: Option, +} + +impl From for L2BlockHeaderInfo { + fn from(header: L2BlockHeader) -> Self { + Self { + number: header.number, + timestamp: header.timestamp, + protocol_version: header.protocol_version, + } + } +} + #[derive(Debug, Serialize, Deserialize)] pub struct StateKeeperInfo { - last_miniblock_protocol_upgrade: Option<()>, - last_miniblock: Option, + last_sealed_miniblock: Option, batch_number: Option<()>, } @@ -35,16 +51,11 @@ impl PeriodicJob for StateKeeperHealthTask { async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await.unwrap(); - let last_miniblock = conn - .blocks_web3_dal() - .get_last_miniblock_number() - .await - .unwrap(); + let last_sealed_miniblock = 
conn.blocks_dal().get_last_sealed_l2_block_header().await?; self.state_keeper_health_updater.update( StateKeeperInfo { - last_miniblock_protocol_upgrade: None, - last_miniblock, + last_sealed_miniblock: last_sealed_miniblock.map(L2BlockHeaderInfo::from), batch_number: None, } .into(), From 4cbb8904f8dccb8cd3924a6e43d47d7e00cdd710 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 15:30:39 +0100 Subject: [PATCH 17/60] feat: add last processed L1 batch to health check --- core/node/house_keeper/src/state_keeper.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs index 50fe2da04adb..7aff29bbc530 100644 --- a/core/node/house_keeper/src/state_keeper.rs +++ b/core/node/house_keeper/src/state_keeper.rs @@ -2,7 +2,7 @@ use async_trait::async_trait; use serde::{Deserialize, Serialize}; use zksync_dal::{ConnectionPool, Core, CoreDal}; use zksync_health_check::{Health, HealthStatus, HealthUpdater}; -use zksync_types::{block::L2BlockHeader, L2BlockNumber, ProtocolVersionId}; +use zksync_types::{block::L2BlockHeader, L1BatchNumber, L2BlockNumber, ProtocolVersionId}; use crate::periodic_job::PeriodicJob; @@ -26,7 +26,7 @@ impl From for L2BlockHeaderInfo { #[derive(Debug, Serialize, Deserialize)] pub struct StateKeeperInfo { last_sealed_miniblock: Option, - batch_number: Option<()>, + last_processed_l1_batch: L1BatchNumber, } impl From for Health { @@ -52,11 +52,15 @@ impl PeriodicJob for StateKeeperHealthTask { async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await.unwrap(); let last_sealed_miniblock = conn.blocks_dal().get_last_sealed_l2_block_header().await?; + let last_processed_l1_batch = conn + .blocks_dal() + .get_consistency_checker_last_processed_l1_batch() + .await?; self.state_keeper_health_updater.update( StateKeeperInfo { last_sealed_miniblock: 
last_sealed_miniblock.map(L2BlockHeaderInfo::from), - batch_number: None, + last_processed_l1_batch, } .into(), ); From f60e3a94d91f7ebc1c3dac9fc9f73af16f8cd20b Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 15:48:06 +0100 Subject: [PATCH 18/60] refactor: use SELECT MAX instead of ORDER BY --- ...d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json} | 4 ++-- core/lib/dal/src/system_dal.rs | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) rename core/lib/dal/.sqlx/{query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json => query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json} (77%) diff --git a/core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json b/core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json similarity index 77% rename from core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json rename to core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json index d9d6b56bac8a..41f5354b93fb 100644 --- a/core/lib/dal/.sqlx/query-ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2.json +++ b/core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT *\n FROM _sqlx_migrations\n ORDER BY version DESC\n LIMIT 1;\n ", + "query": "\n SELECT *\n FROM _sqlx_migrations\n WHERE _sqlx_migrations.version = (\n SELECT MAX(_sqlx_migrations.version)\n FROM _sqlx_migrations\n );\n ", "describe": { "columns": [ { @@ -46,5 +46,5 @@ false ] }, - "hash": "ee1bfbbbed5f28c8a30743770b55c0ad2579709ae355234811e09fdf5c40e3d2" + "hash": "77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1" } diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index f54d5d13ce71..4b935b544eb7 100644 --- a/core/lib/dal/src/system_dal.rs 
+++ b/core/lib/dal/src/system_dal.rs @@ -104,8 +104,10 @@ impl SystemDal<'_, '_> { r#" SELECT * FROM _sqlx_migrations - ORDER BY version DESC - LIMIT 1; + WHERE _sqlx_migrations.version = ( + SELECT MAX(_sqlx_migrations.version) + FROM _sqlx_migrations + ); "# ) .instrument("get_last_migration") From 927cddb9b16f0554d5a4d779007037a83fe9294c Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 4 Nov 2024 16:01:48 +0100 Subject: [PATCH 19/60] refactor: rename LastBatchIndex to BatchNumbers --- core/node/house_keeper/src/eth_sender.rs | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index d6fd7f42cabf..ecc66e754af2 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -10,7 +10,7 @@ use zksync_types::{aggregated_operations::AggregatedActionType, L1BatchNumber}; use crate::periodic_job::PeriodicJob; #[derive(Debug, Serialize, Deserialize)] -struct LastBatchIndex { +struct BatchNumbers { commit: Option, prove: Option, execute: Option, @@ -19,8 +19,8 @@ struct LastBatchIndex { #[derive(Debug, Serialize, Deserialize)] pub struct EthSenderInfo { failed_l1_txns: i64, - last_saved_batches: LastBatchIndex, - last_mined_batches: LastBatchIndex, + last_saved_batches: BatchNumbers, + last_mined_batches: BatchNumbers, next_nonce: Option, } @@ -43,6 +43,9 @@ impl EthSenderHealthTask { #[async_trait] impl PeriodicJob for EthSenderHealthTask { const SERVICE_NAME: &'static str = "EthSenderHealth"; + async fn run(&mut self) -> anyhow::Result<()> { + self.run_routine_task().await + } async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await?; @@ -73,7 +76,7 @@ impl PeriodicJob for EthSenderHealthTask { } } -fn get_latest_batches(batches: Vec<(AggregatedActionType, L1BatchNumber)>) -> LastBatchIndex { +fn get_latest_batches(batches: Vec<(AggregatedActionType, 
L1BatchNumber)>) -> BatchNumbers { let (commit_batch, prove_batch, execute_batch) = batches.into_iter().fold( (None, None, None), |(commit, prove, execute), (action_type, batch_number)| match action_type { @@ -85,7 +88,7 @@ fn get_latest_batches(batches: Vec<(AggregatedActionType, L1BatchNumber)>) -> La }, ); - LastBatchIndex { + BatchNumbers { commit: commit_batch, prove: prove_batch, execute: execute_batch, From 80db2ce732ea411de37496fc6260421603273c5c Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 6 Nov 2024 11:00:53 +0100 Subject: [PATCH 20/60] fix: revert code committed by mistake --- core/node/house_keeper/src/eth_sender.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index ecc66e754af2..580d8d24ea2e 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -43,9 +43,6 @@ impl EthSenderHealthTask { #[async_trait] impl PeriodicJob for EthSenderHealthTask { const SERVICE_NAME: &'static str = "EthSenderHealth"; - async fn run(&mut self) -> anyhow::Result<()> { - self.run_routine_task().await - } async fn run_routine_task(&mut self) -> anyhow::Result<()> { let mut conn = self.connection_pool.connection().await?; From 2c9ca8d3926e3f9c4ddf0ccc8c445e09c2d269f6 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 6 Nov 2024 11:53:16 +0100 Subject: [PATCH 21/60] feat: add config parameters for healthcheck polling intervals --- core/lib/config/src/configs/house_keeper.rs | 3 +++ core/lib/config/src/testonly.rs | 3 +++ core/lib/env_config/src/house_keeper.rs | 3 +++ core/lib/protobuf_config/src/house_keeper.rs | 20 +++++++++++++++++++ .../src/proto/config/house_keeper.proto | 3 +++ core/node/house_keeper/src/database.rs | 7 ++----- core/node/house_keeper/src/eth_sender.rs | 7 ++----- core/node/house_keeper/src/state_keeper.rs | 7 ++----- .../implementations/layers/house_keeper.rs | 7 +++++++ etc/env/base/house_keeper.toml | 3 
+++ etc/env/file_based/general.yaml | 11 +++++----- 11 files changed, 53 insertions(+), 21 deletions(-) diff --git a/core/lib/config/src/configs/house_keeper.rs b/core/lib/config/src/configs/house_keeper.rs index 39e304562fa0..86be6cb0e807 100644 --- a/core/lib/config/src/configs/house_keeper.rs +++ b/core/lib/config/src/configs/house_keeper.rs @@ -4,4 +4,7 @@ use serde::Deserialize; #[derive(Debug, Deserialize, Clone, PartialEq)] pub struct HouseKeeperConfig { pub l1_batch_metrics_reporting_interval_ms: u64, + pub database_health_polling_interval_ms: u64, + pub eth_sender_health_polling_interval_ms: u64, + pub state_keeper_health_polling_interval_ms: u64, } diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs index 49c5cff1dca0..311c3362e02e 100644 --- a/core/lib/config/src/testonly.rs +++ b/core/lib/config/src/testonly.rs @@ -637,6 +637,9 @@ impl Distribution for EncodeDist { fn sample(&self, rng: &mut R) -> configs::house_keeper::HouseKeeperConfig { configs::house_keeper::HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: self.sample(rng), + database_health_polling_interval_ms: self.sample(rng), + eth_sender_health_polling_interval_ms: self.sample(rng), + state_keeper_health_polling_interval_ms: self.sample(rng), } } } diff --git a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index 1a1ff4d27de2..960d3b50a91b 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -18,6 +18,9 @@ mod tests { fn expected_config() -> HouseKeeperConfig { HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: 10_000, + database_health_polling_interval_ms: 10_000, + eth_sender_health_polling_interval_ms: 10_000, + state_keeper_health_polling_interval_ms: 10_000, } } diff --git a/core/lib/protobuf_config/src/house_keeper.rs b/core/lib/protobuf_config/src/house_keeper.rs index e40fd1b37dc7..cbd6a3274737 100644 --- a/core/lib/protobuf_config/src/house_keeper.rs +++ 
b/core/lib/protobuf_config/src/house_keeper.rs @@ -12,6 +12,21 @@ impl ProtoRepr for proto::HouseKeeper { &self.l1_batch_metrics_reporting_interval_ms, ) .context("l1_batch_metrics_reporting_interval_ms")?, + + database_health_polling_interval_ms: *required( + &self.database_health_polling_interval_ms, + ) + .context("database_health_polling_interval_ms")?, + + eth_sender_health_polling_interval_ms: *required( + &self.eth_sender_health_polling_interval_ms, + ) + .context("eth_sender_health_polling_interval_ms")?, + + state_keeper_health_polling_interval_ms: *required( + &self.state_keeper_health_polling_interval_ms, + ) + .context("state_keeper_health_polling_interval_ms")?, }) } @@ -20,6 +35,11 @@ impl ProtoRepr for proto::HouseKeeper { l1_batch_metrics_reporting_interval_ms: Some( this.l1_batch_metrics_reporting_interval_ms, ), + database_health_polling_interval_ms: Some(this.database_health_polling_interval_ms), + eth_sender_health_polling_interval_ms: Some(this.eth_sender_health_polling_interval_ms), + state_keeper_health_polling_interval_ms: Some( + this.state_keeper_health_polling_interval_ms, + ), } } } diff --git a/core/lib/protobuf_config/src/proto/config/house_keeper.proto b/core/lib/protobuf_config/src/proto/config/house_keeper.proto index c3a4ca8ad672..aa3e12dd998a 100644 --- a/core/lib/protobuf_config/src/proto/config/house_keeper.proto +++ b/core/lib/protobuf_config/src/proto/config/house_keeper.proto @@ -17,4 +17,7 @@ message HouseKeeper { reserved 15; reserved "prover_job_archiver_archive_after_secs"; reserved 16; reserved "fri_gpu_prover_archiver_archiving_interval_ms"; reserved 17; reserved "fri_gpu_prover_archiver_archive_after_secs"; + optional uint64 database_health_polling_interval_ms = 18; // required; ms + optional uint64 eth_sender_health_polling_interval_ms = 19; // required; ms + optional uint64 state_keeper_health_polling_interval_ms = 20; // required; ms } diff --git a/core/node/house_keeper/src/database.rs 
b/core/node/house_keeper/src/database.rs index 0ee91d9dd8af..fc843d6ff6b9 100644 --- a/core/node/house_keeper/src/database.rs +++ b/core/node/house_keeper/src/database.rs @@ -18,14 +18,11 @@ impl From for Health { #[derive(Debug)] pub struct DatabaseHealthTask { + pub polling_interval_ms: u64, pub connection_pool: ConnectionPool, pub database_health_updater: HealthUpdater, } -impl DatabaseHealthTask { - pub const POLLING_INTERVAL_MS: u64 = 10_000; -} - #[async_trait] impl PeriodicJob for DatabaseHealthTask { const SERVICE_NAME: &'static str = "DatabaseHealth"; @@ -40,6 +37,6 @@ impl PeriodicJob for DatabaseHealthTask { } fn polling_interval_ms(&self) -> u64 { - Self::POLLING_INTERVAL_MS + self.polling_interval_ms } } diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index 580d8d24ea2e..4de093765a15 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -32,14 +32,11 @@ impl From for Health { #[derive(Debug)] pub struct EthSenderHealthTask { + pub polling_interval_ms: u64, pub connection_pool: ConnectionPool, pub eth_sender_health_updater: HealthUpdater, } -impl EthSenderHealthTask { - pub const POLLING_INTERVAL_MS: u64 = 10_000; -} - #[async_trait] impl PeriodicJob for EthSenderHealthTask { const SERVICE_NAME: &'static str = "EthSenderHealth"; @@ -69,7 +66,7 @@ impl PeriodicJob for EthSenderHealthTask { } fn polling_interval_ms(&self) -> u64 { - Self::POLLING_INTERVAL_MS + self.polling_interval_ms } } diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs index 7aff29bbc530..52998fa53edc 100644 --- a/core/node/house_keeper/src/state_keeper.rs +++ b/core/node/house_keeper/src/state_keeper.rs @@ -37,14 +37,11 @@ impl From for Health { #[derive(Debug)] pub struct StateKeeperHealthTask { + pub polling_interval_ms: u64, pub connection_pool: ConnectionPool, pub state_keeper_health_updater: HealthUpdater, } -impl 
StateKeeperHealthTask { - pub const POLLING_INTERVAL_MS: u64 = 10_000; -} - #[async_trait] impl PeriodicJob for StateKeeperHealthTask { const SERVICE_NAME: &'static str = "StateKeeperHealth"; @@ -68,6 +65,6 @@ impl PeriodicJob for StateKeeperHealthTask { } fn polling_interval_ms(&self) -> u64 { - Self::POLLING_INTERVAL_MS + self.polling_interval_ms } } diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index e48d47972280..51d457e344cf 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -87,6 +87,7 @@ impl WiringLayer for HouseKeeperLayer { .map_err(WiringError::internal)?; let database_health_task = DatabaseHealthTask { + polling_interval_ms: self.house_keeper_config.database_health_polling_interval_ms, connection_pool: replica_pool.clone(), database_health_updater, }; @@ -99,6 +100,9 @@ impl WiringLayer for HouseKeeperLayer { .map_err(WiringError::internal)?; let eth_sender_health_task = EthSenderHealthTask { + polling_interval_ms: self + .house_keeper_config + .eth_sender_health_polling_interval_ms, connection_pool: replica_pool.clone(), eth_sender_health_updater, }; @@ -111,6 +115,9 @@ impl WiringLayer for HouseKeeperLayer { .map_err(WiringError::internal)?; let state_keeper_health_task = StateKeeperHealthTask { + polling_interval_ms: self + .house_keeper_config + .state_keeper_health_polling_interval_ms, connection_pool: replica_pool.clone(), state_keeper_health_updater, }; diff --git a/etc/env/base/house_keeper.toml b/etc/env/base/house_keeper.toml index 6f86561d1c60..c97fa1c2b67c 100644 --- a/etc/env/base/house_keeper.toml +++ b/etc/env/base/house_keeper.toml @@ -1,2 +1,5 @@ [house_keeper] l1_batch_metrics_reporting_interval_ms = 10000 +database_health_polling_interval_ms = 10000 +eth_sender_health_polling_interval_ms = 10000 
+state_keeper_health_polling_interval_ms = 10000 diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index 94758d92e180..362e248ccec6 100644 --- a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -41,7 +41,7 @@ api: estimate_gas_scale_factor: 1.3 estimate_gas_acceptable_overestimation: 5000 max_tx_size: 1000000 - api_namespaces: [ en,eth,net,web3,zks,pubsub,debug ] + api_namespaces: [en, eth, net, web3, zks, pubsub, debug] state_keeper: transaction_slots: 8192 max_allowed_l2_tx_gas_limit: 15000000000 @@ -104,7 +104,7 @@ eth: aggregated_block_execute_deadline: 10 timestamp_criteria_max_allowed_lag: 30 max_eth_tx_data_size: 120000 - aggregated_proof_sizes: [ 1 ] + aggregated_proof_sizes: [1] max_aggregated_tx_gas: 15000000 max_acceptable_priority_fee_in_gwei: 100000000000 # typo: value is in wei (100 gwei) pubdata_sending_mode: BLOBS @@ -121,7 +121,6 @@ eth: confirmations_for_eth_event: 0 eth_node_poll_interval: 300 - snapshot_creator: object_store: file_backed: @@ -130,7 +129,6 @@ snapshot_creator: concurrent_queries_count: 25 storage_logs_chunk_size: 1000000 - prover: prover_object_store: file_backed: @@ -290,7 +288,6 @@ prover_job_monitor: witness_job_queuer_run_interval_ms: 10000 http_port: 3074 - base_token_adjuster: price_polling_interval_ms: 30000 price_cache_update_interval_ms: 2000 @@ -302,9 +299,11 @@ external_price_api_client: forced_numerator: 314 forced_denominator: 1000 - house_keeper: l1_batch_metrics_reporting_interval_ms: 10000 + database_health_polling_interval_ms: 10000 + eth_sender_health_polling_interval_ms: 10000 + state_keeper_health_polling_interval_ms: 10000 prometheus: listener_port: 3314 From 98f8d72cc6c4d13995734728a18d4d9d7b3a7ee9 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 6 Nov 2024 12:27:25 +0100 Subject: [PATCH 22/60] fix: fix house keeper config from env test --- core/lib/env_config/src/house_keeper.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index 960d3b50a91b..871620bbe1fd 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -29,6 +29,9 @@ mod tests { let mut lock = MUTEX.lock(); let config = r#" HOUSE_KEEPER_L1_BATCH_METRICS_REPORTING_INTERVAL_MS="10000" + DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" + ETH_SENDER_HEALTH_POLLING_INTERVAL_MS="10000" + STATE_KEEPER_HEALTH_POLLING_INTERVAL_MS="10000" "#; lock.set_env(config); From 988b957eabd488af4807184039974ae74eb331ad Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 6 Nov 2024 13:19:19 +0100 Subject: [PATCH 23/60] fix: fix house keeper config parameters naming in unit test --- core/lib/env_config/src/house_keeper.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index 871620bbe1fd..a1b956e83ee9 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -29,9 +29,9 @@ mod tests { let mut lock = MUTEX.lock(); let config = r#" HOUSE_KEEPER_L1_BATCH_METRICS_REPORTING_INTERVAL_MS="10000" - DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" - ETH_SENDER_HEALTH_POLLING_INTERVAL_MS="10000" - STATE_KEEPER_HEALTH_POLLING_INTERVAL_MS="10000" + HOUSE_KEEPER_DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" + HOUSE_KEEPER_ETH_SENDER_HEALTH_POLLING_INTERVAL_MS="10000" + HOUSE_KEEPER_STATE_KEEPER_HEALTH_POLLING_INTERVAL_MS="10000" "#; lock.set_env(config); From 293484dde1b13261a7c81fce9389889a49afd2bd Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 10:50:55 +0100 Subject: [PATCH 24/60] fix: use u64 for failed_l1_txns --- core/node/house_keeper/src/eth_sender.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index 4de093765a15..2ced334596f6 100644 --- 
a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -18,7 +18,7 @@ struct BatchNumbers { #[derive(Debug, Serialize, Deserialize)] pub struct EthSenderInfo { - failed_l1_txns: i64, + failed_l1_txns: u64, last_saved_batches: BatchNumbers, last_mined_batches: BatchNumbers, next_nonce: Option, @@ -55,7 +55,7 @@ impl PeriodicJob for EthSenderHealthTask { self.eth_sender_health_updater.update( EthSenderInfo { - failed_l1_txns, + failed_l1_txns: failed_l1_txns as u64, last_saved_batches: get_latest_batches(eth_stats.saved), last_mined_batches: get_latest_batches(eth_stats.mined), next_nonce, From 3ad3fd25b2241f94343c55cb4d79e56c9203bf5d Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 11:02:40 +0100 Subject: [PATCH 25/60] fix: return u64 in get_number_of_failed_transactions --- core/lib/dal/src/eth_sender_dal.rs | 3 ++- core/node/house_keeper/src/eth_sender.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/core/lib/dal/src/eth_sender_dal.rs b/core/lib/dal/src/eth_sender_dal.rs index 4ce76547ac9b..4efead269d1b 100644 --- a/core/lib/dal/src/eth_sender_dal.rs +++ b/core/lib/dal/src/eth_sender_dal.rs @@ -669,7 +669,7 @@ impl EthSenderDal<'_, '_> { Ok(()) } - pub async fn get_number_of_failed_transactions(&mut self) -> anyhow::Result { + pub async fn get_number_of_failed_transactions(&mut self) -> anyhow::Result { sqlx::query!( r#" SELECT @@ -683,6 +683,7 @@ impl EthSenderDal<'_, '_> { .fetch_one(self.storage.conn()) .await? 
.count + .map(|c| c as u64) .context("count field is missing") } diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index 2ced334596f6..c95ad3696f81 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -55,7 +55,7 @@ impl PeriodicJob for EthSenderHealthTask { self.eth_sender_health_updater.update( EthSenderInfo { - failed_l1_txns: failed_l1_txns as u64, + failed_l1_txns, last_saved_batches: get_latest_batches(eth_stats.saved), last_mined_batches: get_latest_batches(eth_stats.mined), next_nonce, From b31233b1561a3a78058fab39f0eaee2070a54aae Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 11:08:13 +0100 Subject: [PATCH 26/60] feat: use connection_tagged for better code instrumentation --- core/node/house_keeper/src/blocks_state_reporter.rs | 5 ++++- core/node/house_keeper/src/database.rs | 5 ++++- core/node/house_keeper/src/eth_sender.rs | 5 ++++- core/node/house_keeper/src/state_keeper.rs | 6 +++++- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/core/node/house_keeper/src/blocks_state_reporter.rs b/core/node/house_keeper/src/blocks_state_reporter.rs index 6f85aa0fbb09..abd2c6e8802d 100644 --- a/core/node/house_keeper/src/blocks_state_reporter.rs +++ b/core/node/house_keeper/src/blocks_state_reporter.rs @@ -22,7 +22,10 @@ impl L1BatchMetricsReporter { async fn report_metrics(&self) -> anyhow::Result<()> { let mut block_metrics = vec![]; - let mut conn = self.connection_pool.connection().await?; + let mut conn = self + .connection_pool + .connection_tagged("house_keeper") + .await?; let last_l1_batch = conn.blocks_dal().get_sealed_l1_batch_number().await?; if let Some(number) = last_l1_batch { block_metrics.push((number, BlockStage::Sealed)); diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs index fc843d6ff6b9..26416be7ef07 100644 --- a/core/node/house_keeper/src/database.rs +++
b/core/node/house_keeper/src/database.rs @@ -28,7 +28,10 @@ impl PeriodicJob for DatabaseHealthTask { const SERVICE_NAME: &'static str = "DatabaseHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self.connection_pool.connection().await?; + let mut conn = self + .connection_pool + .connection_tagged("house_keeper") + .await?; let last_migration = conn.system_dal().get_last_migration().await?; self.database_health_updater diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs index c95ad3696f81..6c82bd32ca5f 100644 --- a/core/node/house_keeper/src/eth_sender.rs +++ b/core/node/house_keeper/src/eth_sender.rs @@ -42,7 +42,10 @@ impl PeriodicJob for EthSenderHealthTask { const SERVICE_NAME: &'static str = "EthSenderHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self.connection_pool.connection().await?; + let mut conn = self + .connection_pool + .connection_tagged("house_keeper") + .await?; let failed_l1_txns = conn .eth_sender_dal() .get_number_of_failed_transactions() diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs index 52998fa53edc..fcaabd55db41 100644 --- a/core/node/house_keeper/src/state_keeper.rs +++ b/core/node/house_keeper/src/state_keeper.rs @@ -47,7 +47,11 @@ impl PeriodicJob for StateKeeperHealthTask { const SERVICE_NAME: &'static str = "StateKeeperHealth"; async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self.connection_pool.connection().await.unwrap(); + let mut conn = self + .connection_pool + .connection_tagged("house_keeper") + .await + .unwrap(); let last_sealed_miniblock = conn.blocks_dal().get_last_sealed_l2_block_header().await?; let last_processed_l1_batch = conn .blocks_dal() From 5396e5ac703ae8c3cca86d58fd3faa103fa54b2a Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 14:48:49 +0100 Subject: [PATCH 27/60] feat: add reactive health 
check to state keeper --- Cargo.lock | 2 + core/lib/config/src/configs/house_keeper.rs | 1 - core/lib/config/src/testonly.rs | 1 - core/lib/env_config/src/house_keeper.rs | 2 - core/lib/protobuf_config/src/house_keeper.rs | 8 -- .../src/proto/config/house_keeper.proto | 1 - core/node/consensus/src/testonly.rs | 6 +- core/node/house_keeper/src/lib.rs | 1 - core/node/house_keeper/src/state_keeper.rs | 74 --------------- .../implementations/layers/house_keeper.rs | 36 +------ .../layers/state_keeper/mod.rs | 59 ++++++++---- core/node/node_sync/src/tests.rs | 3 +- core/node/state_keeper/Cargo.toml | 2 + core/node/state_keeper/src/health.rs | 30 ++++++ core/node/state_keeper/src/keeper.rs | 94 +++++++++++++------ core/node/state_keeper/src/lib.rs | 1 + .../src/testonly/test_batch_executor.rs | 3 +- core/node/state_keeper/src/tests/mod.rs | 3 +- core/node/state_keeper/src/utils.rs | 5 + etc/env/base/house_keeper.toml | 1 - etc/env/file_based/general.yaml | 1 - 21 files changed, 153 insertions(+), 181 deletions(-) delete mode 100644 core/node/house_keeper/src/state_keeper.rs create mode 100644 core/node/state_keeper/src/health.rs diff --git a/Cargo.lock b/Cargo.lock index 231f963d9bf3..bcf29610cc03 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12042,6 +12042,7 @@ dependencies = [ "itertools 0.10.5", "once_cell", "rand 0.8.5", + "serde", "tempfile", "test-casing", "thiserror", @@ -12053,6 +12054,7 @@ dependencies = [ "zksync_contracts", "zksync_dal", "zksync_eth_client", + "zksync_health_check", "zksync_mempool", "zksync_multivm", "zksync_node_fee_model", diff --git a/core/lib/config/src/configs/house_keeper.rs b/core/lib/config/src/configs/house_keeper.rs index 86be6cb0e807..364277690709 100644 --- a/core/lib/config/src/configs/house_keeper.rs +++ b/core/lib/config/src/configs/house_keeper.rs @@ -6,5 +6,4 @@ pub struct HouseKeeperConfig { pub l1_batch_metrics_reporting_interval_ms: u64, pub database_health_polling_interval_ms: u64, pub 
eth_sender_health_polling_interval_ms: u64, - pub state_keeper_health_polling_interval_ms: u64, } diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs index 311c3362e02e..9cc50da47e5f 100644 --- a/core/lib/config/src/testonly.rs +++ b/core/lib/config/src/testonly.rs @@ -639,7 +639,6 @@ impl Distribution for EncodeDist { l1_batch_metrics_reporting_interval_ms: self.sample(rng), database_health_polling_interval_ms: self.sample(rng), eth_sender_health_polling_interval_ms: self.sample(rng), - state_keeper_health_polling_interval_ms: self.sample(rng), } } } diff --git a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index a1b956e83ee9..ed1fcdc18f47 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -20,7 +20,6 @@ mod tests { l1_batch_metrics_reporting_interval_ms: 10_000, database_health_polling_interval_ms: 10_000, eth_sender_health_polling_interval_ms: 10_000, - state_keeper_health_polling_interval_ms: 10_000, } } @@ -31,7 +30,6 @@ mod tests { HOUSE_KEEPER_L1_BATCH_METRICS_REPORTING_INTERVAL_MS="10000" HOUSE_KEEPER_DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" HOUSE_KEEPER_ETH_SENDER_HEALTH_POLLING_INTERVAL_MS="10000" - HOUSE_KEEPER_STATE_KEEPER_HEALTH_POLLING_INTERVAL_MS="10000" "#; lock.set_env(config); diff --git a/core/lib/protobuf_config/src/house_keeper.rs b/core/lib/protobuf_config/src/house_keeper.rs index cbd6a3274737..4626b7eb4c83 100644 --- a/core/lib/protobuf_config/src/house_keeper.rs +++ b/core/lib/protobuf_config/src/house_keeper.rs @@ -22,11 +22,6 @@ impl ProtoRepr for proto::HouseKeeper { &self.eth_sender_health_polling_interval_ms, ) .context("eth_sender_health_polling_interval_ms")?, - - state_keeper_health_polling_interval_ms: *required( - &self.state_keeper_health_polling_interval_ms, - ) - .context("state_keeper_health_polling_interval_ms")?, }) } @@ -37,9 +32,6 @@ impl ProtoRepr for proto::HouseKeeper { ), 
database_health_polling_interval_ms: Some(this.database_health_polling_interval_ms), eth_sender_health_polling_interval_ms: Some(this.eth_sender_health_polling_interval_ms), - state_keeper_health_polling_interval_ms: Some( - this.state_keeper_health_polling_interval_ms, - ), } } } diff --git a/core/lib/protobuf_config/src/proto/config/house_keeper.proto b/core/lib/protobuf_config/src/proto/config/house_keeper.proto index aa3e12dd998a..c2ddb5983023 100644 --- a/core/lib/protobuf_config/src/proto/config/house_keeper.proto +++ b/core/lib/protobuf_config/src/proto/config/house_keeper.proto @@ -19,5 +19,4 @@ message HouseKeeper { reserved 17; reserved "fri_gpu_prover_archiver_archive_after_secs"; optional uint64 database_health_polling_interval_ms = 18; // required; ms optional uint64 eth_sender_health_polling_interval_ms = 19; // required; ms - optional uint64 state_keeper_health_polling_interval_ms = 20; // required; ms } diff --git a/core/node/consensus/src/testonly.rs b/core/node/consensus/src/testonly.rs index ef4226c915f0..fb89f88fafd5 100644 --- a/core/node/consensus/src/testonly.rs +++ b/core/node/consensus/src/testonly.rs @@ -620,7 +620,6 @@ impl StateKeeperRunner { let stop_recv = stop_recv.clone(); async { ZkSyncStateKeeper::new( - stop_recv, Box::new(io), Box::new(executor_factory), OutputHandler::new(Box::new(persistence.with_tx_insertion())) @@ -628,7 +627,7 @@ impl StateKeeperRunner { Arc::new(NoopSealer), Arc::new(async_cache), ) - .run() + .run(stop_recv) .await .context("ZkSyncStateKeeper::run()")?; Ok(()) @@ -701,7 +700,6 @@ impl StateKeeperRunner { let stop_recv = stop_recv.clone(); async { ZkSyncStateKeeper::new( - stop_recv, Box::new(io), Box::new(MockBatchExecutor), OutputHandler::new(Box::new(persistence.with_tx_insertion())) @@ -710,7 +708,7 @@ impl StateKeeperRunner { Arc::new(NoopSealer), Arc::new(MockReadStorageFactory), ) - .run() + .run(stop_recv) .await .context("ZkSyncStateKeeper::run()")?; Ok(()) diff --git 
a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 8011716a86ce..e42b3b626f03 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -3,5 +3,4 @@ pub mod database; pub mod eth_sender; mod metrics; pub mod periodic_job; -pub mod state_keeper; pub mod version; diff --git a/core/node/house_keeper/src/state_keeper.rs b/core/node/house_keeper/src/state_keeper.rs deleted file mode 100644 index fcaabd55db41..000000000000 --- a/core/node/house_keeper/src/state_keeper.rs +++ /dev/null @@ -1,74 +0,0 @@ -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use zksync_dal::{ConnectionPool, Core, CoreDal}; -use zksync_health_check::{Health, HealthStatus, HealthUpdater}; -use zksync_types::{block::L2BlockHeader, L1BatchNumber, L2BlockNumber, ProtocolVersionId}; - -use crate::periodic_job::PeriodicJob; - -#[derive(Debug, Serialize, Deserialize)] -pub struct L2BlockHeaderInfo { - pub number: L2BlockNumber, - pub timestamp: u64, - pub protocol_version: Option, -} - -impl From for L2BlockHeaderInfo { - fn from(header: L2BlockHeader) -> Self { - Self { - number: header.number, - timestamp: header.timestamp, - protocol_version: header.protocol_version, - } - } -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct StateKeeperInfo { - last_sealed_miniblock: Option, - last_processed_l1_batch: L1BatchNumber, -} - -impl From for Health { - fn from(details: StateKeeperInfo) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - -#[derive(Debug)] -pub struct StateKeeperHealthTask { - pub polling_interval_ms: u64, - pub connection_pool: ConnectionPool, - pub state_keeper_health_updater: HealthUpdater, -} - -#[async_trait] -impl PeriodicJob for StateKeeperHealthTask { - const SERVICE_NAME: &'static str = "StateKeeperHealth"; - - async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self - .connection_pool - .connection_tagged("house_keeper") - .await - .unwrap(); - let 
last_sealed_miniblock = conn.blocks_dal().get_last_sealed_l2_block_header().await?; - let last_processed_l1_batch = conn - .blocks_dal() - .get_consistency_checker_last_processed_l1_batch() - .await?; - - self.state_keeper_health_updater.update( - StateKeeperInfo { - last_sealed_miniblock: last_sealed_miniblock.map(L2BlockHeaderInfo::from), - last_processed_l1_batch, - } - .into(), - ); - Ok(()) - } - - fn polling_interval_ms(&self) -> u64 { - self.polling_interval_ms - } -} diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 51d457e344cf..005f645f159d 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -4,8 +4,7 @@ use zksync_config::configs::house_keeper::HouseKeeperConfig; use zksync_health_check::ReactiveHealthCheck; use zksync_house_keeper::{ blocks_state_reporter::L1BatchMetricsReporter, database::DatabaseHealthTask, - eth_sender::EthSenderHealthTask, periodic_job::PeriodicJob, - state_keeper::StateKeeperHealthTask, version::NodeVersionInfo, + eth_sender::EthSenderHealthTask, periodic_job::PeriodicJob, version::NodeVersionInfo, }; use crate::{ @@ -43,8 +42,6 @@ pub struct Output { pub database_health_task: DatabaseHealthTask, #[context(task)] pub eth_sender_health_task: EthSenderHealthTask, - #[context(task)] - pub state_keeper_health_task: StateKeeperHealthTask, } impl HouseKeeperLayer { @@ -107,26 +104,10 @@ impl WiringLayer for HouseKeeperLayer { eth_sender_health_updater, }; - let (state_keeper_health_check, state_keeper_health_updater) = - ReactiveHealthCheck::new("state_keeper"); - - app_health - .insert_component(state_keeper_health_check) - .map_err(WiringError::internal)?; - - let state_keeper_health_task = StateKeeperHealthTask { - polling_interval_ms: self - .house_keeper_config - .state_keeper_health_polling_interval_ms, - connection_pool: 
replica_pool.clone(), - state_keeper_health_updater, - }; - Ok(Output { l1_batch_metrics_reporter, database_health_task, eth_sender_health_task, - state_keeper_health_task, }) } } @@ -171,18 +152,3 @@ impl Task for EthSenderHealthTask { (*self).run(stop_receiver.0).await } } - -#[async_trait::async_trait] -impl Task for StateKeeperHealthTask { - fn kind(&self) -> TaskKind { - TaskKind::UnconstrainedTask - } - - fn id(&self) -> TaskId { - "state_keeper_health".into() - } - - async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { - (*self).run(stop_receiver.0).await - } -} diff --git a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs index 55defd095be8..9a323a84e062 100644 --- a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs +++ b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs @@ -1,6 +1,7 @@ use std::sync::Arc; use anyhow::Context; +use zksync_health_check::ReactiveHealthCheck; pub use zksync_state::RocksdbStorageOptions; use zksync_state::{AsyncCatchupTask, OwnedStorage, ReadStorageFactory}; use zksync_state_keeper::{ @@ -12,6 +13,7 @@ use zksync_vm_executor::interface::BatchExecutorFactory; use crate::{ implementations::resources::{ + healthcheck::AppHealthCheckResource, pools::{MasterPool, PoolResource}, state_keeper::{ BatchExecutorResource, ConditionalSealerResource, OutputHandlerResource, @@ -44,6 +46,8 @@ pub struct Input { pub output_handler: OutputHandlerResource, pub conditional_sealer: ConditionalSealerResource, pub master_pool: PoolResource, + #[context(default)] + pub app_health: AppHealthCheckResource, } #[derive(Debug, IntoContext)] @@ -99,13 +103,19 @@ impl WiringLayer for StateKeeperLayer { self.rocksdb_options, ); - let state_keeper = StateKeeperTask { + let state_keeper = StateKeeperTask::new( io, - executor_factory: batch_executor_base, + batch_executor_base, output_handler, 
sealer, - storage_factory: Arc::new(storage_factory), - }; + Arc::new(storage_factory), + ); + + input + .app_health + .0 + .insert_component(state_keeper.health_check()) + .map_err(WiringError::internal)?; let rocksdb_termination_hook = ShutdownHook::new("rocksdb_terminaton", async { // Wait for all the instances of RocksDB to be destroyed. @@ -123,11 +133,32 @@ impl WiringLayer for StateKeeperLayer { #[derive(Debug)] pub struct StateKeeperTask { - io: Box, - executor_factory: Box>, - output_handler: OutputHandler, - sealer: Arc, - storage_factory: Arc, + state_keeper: ZkSyncStateKeeper, +} + +impl StateKeeperTask { + pub fn new( + io: Box, + executor_factory: Box>, + output_handler: OutputHandler, + sealer: Arc, + storage_factory: Arc, + ) -> Self { + let state_keeper = ZkSyncStateKeeper::new( + io, + executor_factory, + output_handler, + sealer, + storage_factory, + ); + + Self { state_keeper } + } + + /// Returns the health check for state keeper. + pub fn health_check(&self) -> ReactiveHealthCheck { + self.state_keeper.health_check() + } } #[async_trait::async_trait] @@ -137,15 +168,7 @@ impl Task for StateKeeperTask { } async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { - let state_keeper = ZkSyncStateKeeper::new( - stop_receiver.0, - self.io, - self.executor_factory, - self.output_handler, - self.sealer, - self.storage_factory, - ); - state_keeper.run().await + self.state_keeper.run(stop_receiver.0).await } } diff --git a/core/node/node_sync/src/tests.rs b/core/node/node_sync/src/tests.rs index 172a00e8c14c..21058144f778 100644 --- a/core/node/node_sync/src/tests.rs +++ b/core/node/node_sync/src/tests.rs @@ -132,7 +132,6 @@ impl StateKeeperHandles { } let state_keeper = ZkSyncStateKeeper::new( - stop_receiver, Box::new(io), Box::new(batch_executor), output_handler, @@ -143,7 +142,7 @@ impl StateKeeperHandles { Self { stop_sender, sync_state, - task: tokio::spawn(state_keeper.run()), + task: 
tokio::spawn(state_keeper.run(stop_receiver)), } } diff --git a/core/node/state_keeper/Cargo.toml b/core/node/state_keeper/Cargo.toml index 0e924b9f066d..7ebbe6565188 100644 --- a/core/node/state_keeper/Cargo.toml +++ b/core/node/state_keeper/Cargo.toml @@ -16,6 +16,7 @@ vise.workspace = true zksync_multivm.workspace = true zksync_types.workspace = true zksync_dal.workspace = true +zksync_health_check.workspace = true zksync_state.workspace = true zksync_storage.workspace = true zksync_mempool.workspace = true @@ -40,6 +41,7 @@ tracing.workspace = true futures.workspace = true once_cell.workspace = true itertools.workspace = true +serde.workspace = true hex.workspace = true [dev-dependencies] diff --git a/core/node/state_keeper/src/health.rs b/core/node/state_keeper/src/health.rs new file mode 100644 index 000000000000..4fc86263e439 --- /dev/null +++ b/core/node/state_keeper/src/health.rs @@ -0,0 +1,30 @@ +use serde::{Deserialize, Serialize}; +use zksync_health_check::{Health, HealthStatus}; +use zksync_types::{L1BatchNumber, L2BlockNumber, H256}; + +use crate::io::IoCursor; + +#[derive(Debug, Serialize, Deserialize)] +pub struct StateKeeperHealthDetails { + pub next_l2_block: L2BlockNumber, + pub prev_l2_block_hash: H256, + pub prev_l2_block_timestamp: u64, + pub l1_batch: L1BatchNumber, +} + +impl From for Health { + fn from(details: StateKeeperHealthDetails) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +impl From<&IoCursor> for StateKeeperHealthDetails { + fn from(details: &IoCursor) -> Self { + Self { + next_l2_block: details.next_l2_block, + prev_l2_block_hash: details.prev_l2_block_hash, + prev_l2_block_timestamp: details.prev_l2_block_timestamp, + l1_batch: details.l1_batch, + } + } +} diff --git a/core/node/state_keeper/src/keeper.rs b/core/node/state_keeper/src/keeper.rs index 523dd8ecebad..2621ae57a426 100644 --- a/core/node/state_keeper/src/keeper.rs +++ b/core/node/state_keeper/src/keeper.rs @@ -7,6 +7,7 @@ use std::{ use 
anyhow::Context as _; use tokio::sync::watch; use tracing::{info_span, Instrument}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater, ReactiveHealthCheck}; use zksync_multivm::{ interface::{ executor::{BatchExecutor, BatchExecutorFactory}, @@ -24,12 +25,13 @@ use zksync_types::{ use crate::{ executor::TxExecutionResult, + health::StateKeeperHealthDetails, io::{IoCursor, L1BatchParams, L2BlockParams, OutputHandler, PendingBatchData, StateKeeperIO}, metrics::{AGGREGATION_METRICS, KEEPER_METRICS, L1_BATCH_METRICS}, seal_criteria::{ConditionalSealer, SealData, SealResolution, UnexecutableReason}, types::ExecutionMetricsForCriteria, updates::UpdatesManager, - utils::gas_count_from_writes, + utils::{gas_count_from_writes, is_canceled}, }; /// Amount of time to block on waiting for some resource. The exact value is not really important, @@ -65,17 +67,16 @@ impl Error { /// a sequence of executed L2 blocks and batches. #[derive(Debug)] pub struct ZkSyncStateKeeper { - stop_receiver: watch::Receiver, io: Box, output_handler: OutputHandler, batch_executor: Box>, sealer: Arc, storage_factory: Arc, + health_updater: HealthUpdater, } impl ZkSyncStateKeeper { pub fn new( - stop_receiver: watch::Receiver, sequencer: Box, batch_executor: Box>, output_handler: OutputHandler, @@ -83,20 +84,26 @@ impl ZkSyncStateKeeper { storage_factory: Arc, ) -> Self { Self { - stop_receiver, io: sequencer, batch_executor, output_handler, sealer, storage_factory, + health_updater: ReactiveHealthCheck::new("state_keeper").1, } } - pub async fn run(mut self) -> anyhow::Result<()> { - match self.run_inner().await { + pub async fn run(mut self, stop_receiver: watch::Receiver) -> anyhow::Result<()> { + match self.run_inner(stop_receiver).await { Ok(_) => unreachable!(), - Err(Error::Fatal(err)) => Err(err).context("state_keeper failed"), + Err(Error::Fatal(err)) => { + self.health_updater + .update(Health::from(HealthStatus::ShuttingDown)); + Err(err).context("state_keeper failed") + } 
Err(Error::Canceled) => { + self.health_updater + .update(Health::from(HealthStatus::ShuttingDown)); tracing::info!("Stop signal received, state keeper is shutting down"); Ok(()) } @@ -104,9 +111,14 @@ impl ZkSyncStateKeeper { } /// Fallible version of `run` routine that allows to easily exit upon cancellation. - async fn run_inner(&mut self) -> Result { + async fn run_inner( + &mut self, + mut stop_receiver: watch::Receiver, + ) -> Result { let (cursor, pending_batch_params) = self.io.initialize().await?; self.output_handler.initialize(&cursor).await?; + self.health_updater + .update(Health::from(HealthStatus::Ready)); tracing::info!( "Starting state keeper. Next l1 batch to seal: {}, next L2 block to seal: {}", cursor.l1_batch, @@ -135,7 +147,7 @@ impl ZkSyncStateKeeper { None => { tracing::info!("There is no open pending batch, starting a new empty batch"); let (system_env, l1_batch_env, pubdata_params) = self - .wait_for_new_batch_env(&cursor) + .wait_for_new_batch_env(&cursor, &mut stop_receiver) .await .map_err(|e| e.context("wait_for_new_batch_params()"))?; PendingBatchData { @@ -154,22 +166,29 @@ impl ZkSyncStateKeeper { .await?; let mut batch_executor = self - .create_batch_executor(l1_batch_env.clone(), system_env.clone(), pubdata_params) + .create_batch_executor( + l1_batch_env.clone(), + system_env.clone(), + pubdata_params, + &stop_receiver, + ) .await?; self.restore_state( &mut *batch_executor, &mut updates_manager, pending_l2_blocks, + &stop_receiver, ) .await?; let mut l1_batch_seal_delta: Option = None; - while !self.is_canceled() { + while !is_canceled(&stop_receiver) { // This function will run until the batch can be sealed. self.process_l1_batch( &mut *batch_executor, &mut updates_manager, protocol_upgrade_tx, + &stop_receiver, ) .await?; @@ -178,8 +197,9 @@ impl ZkSyncStateKeeper { self.seal_l2_block(&updates_manager).await?; // We've sealed the L2 block that we had, but we still need to set up the timestamp // for the fictive L2 block. 
- let new_l2_block_params = - self.wait_for_new_l2_block_params(&updates_manager).await?; + let new_l2_block_params = self + .wait_for_new_l2_block_params(&updates_manager, &stop_receiver) + .await?; Self::start_next_l2_block( new_l2_block_params, &mut updates_manager, @@ -204,11 +224,17 @@ impl ZkSyncStateKeeper { // Start the new batch. next_cursor.l1_batch += 1; - (system_env, l1_batch_env, pubdata_params) = - self.wait_for_new_batch_env(&next_cursor).await?; + (system_env, l1_batch_env, pubdata_params) = self + .wait_for_new_batch_env(&next_cursor, &mut stop_receiver) + .await?; updates_manager = UpdatesManager::new(&l1_batch_env, &system_env, pubdata_params); batch_executor = self - .create_batch_executor(l1_batch_env.clone(), system_env.clone(), pubdata_params) + .create_batch_executor( + l1_batch_env.clone(), + system_env.clone(), + pubdata_params, + &stop_receiver, + ) .await?; let version_changed = system_env.version != sealed_batch_protocol_version; @@ -217,6 +243,9 @@ impl ZkSyncStateKeeper { } else { None }; + + self.health_updater + .update(StateKeeperHealthDetails::from(&cursor).into()); } Err(Error::Canceled) } @@ -226,10 +255,11 @@ impl ZkSyncStateKeeper { l1_batch_env: L1BatchEnv, system_env: SystemEnv, pubdata_params: PubdataParams, + stop_receiver: &watch::Receiver, ) -> Result>, Error> { let storage = self .storage_factory - .access_storage(&self.stop_receiver, l1_batch_env.number - 1) + .access_storage(stop_receiver, l1_batch_env.number - 1) .await .context("failed creating VM storage")? 
.ok_or(Error::Canceled)?; @@ -287,10 +317,6 @@ impl ZkSyncStateKeeper { Ok(protocol_upgrade_tx) } - fn is_canceled(&self) -> bool { - *self.stop_receiver.borrow() - } - async fn load_upgrade_tx( &mut self, protocol_version: ProtocolVersionId, @@ -310,8 +336,9 @@ impl ZkSyncStateKeeper { async fn wait_for_new_batch_params( &mut self, cursor: &IoCursor, + stop_receiver: &watch::Receiver, ) -> Result { - while !self.is_canceled() { + while !is_canceled(stop_receiver) { if let Some(params) = self .io .wait_for_new_batch_params(cursor, POLL_WAIT_DURATION) @@ -332,10 +359,13 @@ impl ZkSyncStateKeeper { async fn wait_for_new_batch_env( &mut self, cursor: &IoCursor, + stop_receiver: &mut watch::Receiver, ) -> Result<(SystemEnv, L1BatchEnv, PubdataParams), Error> { // `io.wait_for_new_batch_params(..)` is not cancel-safe; once we get new batch params, we must hold onto them // until we get the rest of parameters from I/O or receive a stop signal. - let params = self.wait_for_new_batch_params(cursor).await?; + let params = self + .wait_for_new_batch_params(cursor, stop_receiver) + .await?; let contracts = self .io .load_base_system_contracts(params.protocol_version, cursor) @@ -353,7 +383,7 @@ impl ZkSyncStateKeeper { let previous_batch_hash = hash_result.context("cannot load state hash for previous L1 batch")?; Ok(params.into_env(self.io.chain_id(), contracts, cursor, previous_batch_hash)) } - _ = self.stop_receiver.changed() => Err(Error::Canceled), + _ = stop_receiver.changed() => Err(Error::Canceled), } } @@ -367,10 +397,11 @@ impl ZkSyncStateKeeper { async fn wait_for_new_l2_block_params( &mut self, updates: &UpdatesManager, + stop_receiver: &watch::Receiver, ) -> Result { let latency = KEEPER_METRICS.wait_for_l2_block_params.start(); let cursor = updates.io_cursor(); - while !self.is_canceled() { + while !is_canceled(stop_receiver) { if let Some(params) = self .io .wait_for_new_l2_block_params(&cursor, POLL_WAIT_DURATION) @@ -439,6 +470,7 @@ impl ZkSyncStateKeeper { 
batch_executor: &mut dyn BatchExecutor, updates_manager: &mut UpdatesManager, l2_blocks_to_reexecute: Vec, + stop_receiver: &watch::Receiver, ) -> Result<(), Error> { if l2_blocks_to_reexecute.is_empty() { return Ok(()); @@ -531,7 +563,7 @@ impl ZkSyncStateKeeper { // We've processed all the L2 blocks, and right now we're initializing the next *actual* L2 block. let new_l2_block_params = self - .wait_for_new_l2_block_params(updates_manager) + .wait_for_new_l2_block_params(updates_manager, stop_receiver) .await .map_err(|e| e.context("wait_for_new_l2_block_params"))?; Self::start_next_l2_block(new_l2_block_params, updates_manager, batch_executor).await?; @@ -548,13 +580,14 @@ impl ZkSyncStateKeeper { batch_executor: &mut dyn BatchExecutor, updates_manager: &mut UpdatesManager, protocol_upgrade_tx: Option, + stop_receiver: &watch::Receiver, ) -> Result<(), Error> { if let Some(protocol_upgrade_tx) = protocol_upgrade_tx { self.process_upgrade_tx(batch_executor, updates_manager, protocol_upgrade_tx) .await?; } - while !self.is_canceled() { + while !is_canceled(stop_receiver) { let full_latency = KEEPER_METRICS.process_l1_batch_loop_iteration.start(); if self @@ -577,7 +610,7 @@ impl ZkSyncStateKeeper { self.seal_l2_block(updates_manager).await?; let new_l2_block_params = self - .wait_for_new_l2_block_params(updates_manager) + .wait_for_new_l2_block_params(updates_manager, stop_receiver) .await .map_err(|e| e.context("wait_for_new_l2_block_params"))?; tracing::debug!( @@ -878,4 +911,9 @@ impl ZkSyncStateKeeper { latency.observe(); Ok((resolution, exec_result)) } + + /// Returns the health check for state keeper. 
+ pub fn health_check(&self) -> ReactiveHealthCheck { + self.health_updater.subscribe() + } } diff --git a/core/node/state_keeper/src/lib.rs b/core/node/state_keeper/src/lib.rs index c12e4163fdd4..65637a8da6ec 100644 --- a/core/node/state_keeper/src/lib.rs +++ b/core/node/state_keeper/src/lib.rs @@ -12,6 +12,7 @@ pub use self::{ }; pub mod executor; +pub mod health; pub mod io; mod keeper; mod mempool_actor; diff --git a/core/node/state_keeper/src/testonly/test_batch_executor.rs b/core/node/state_keeper/src/testonly/test_batch_executor.rs index 45787b18f3c9..41515056feb4 100644 --- a/core/node/state_keeper/src/testonly/test_batch_executor.rs +++ b/core/node/state_keeper/src/testonly/test_batch_executor.rs @@ -204,14 +204,13 @@ impl TestScenario { let (stop_sender, stop_receiver) = watch::channel(false); let (io, output_handler) = TestIO::new(stop_sender, self); let state_keeper = ZkSyncStateKeeper::new( - stop_receiver, Box::new(io), Box::new(batch_executor), output_handler, Arc::new(sealer), Arc::new(MockReadStorageFactory), ); - let sk_thread = tokio::spawn(state_keeper.run()); + let sk_thread = tokio::spawn(state_keeper.run(stop_receiver)); // We must assume that *theoretically* state keeper may ignore the stop signal from IO once scenario is // completed, so we spawn it in a separate thread to not get test stuck. 
diff --git a/core/node/state_keeper/src/tests/mod.rs b/core/node/state_keeper/src/tests/mod.rs index 16eed0b2f7f7..3757d8242457 100644 --- a/core/node/state_keeper/src/tests/mod.rs +++ b/core/node/state_keeper/src/tests/mod.rs @@ -421,14 +421,13 @@ async fn load_upgrade_tx() { let sealer = SequencerSealer::default(); let scenario = TestScenario::new(); let batch_executor = TestBatchExecutorBuilder::new(&scenario); - let (stop_sender, stop_receiver) = watch::channel(false); + let (stop_sender, _stop_receiver) = watch::channel(false); let (mut io, output_handler) = TestIO::new(stop_sender, scenario); io.add_upgrade_tx(ProtocolVersionId::latest(), random_upgrade_tx(1)); io.add_upgrade_tx(ProtocolVersionId::next(), random_upgrade_tx(2)); let mut sk = ZkSyncStateKeeper::new( - stop_receiver, Box::new(io), Box::new(batch_executor), output_handler, diff --git a/core/node/state_keeper/src/utils.rs b/core/node/state_keeper/src/utils.rs index 4240ad306251..3093cb2104b6 100644 --- a/core/node/state_keeper/src/utils.rs +++ b/core/node/state_keeper/src/utils.rs @@ -1,3 +1,4 @@ +use tokio::sync::watch; use zksync_multivm::interface::{DeduplicatedWritesMetrics, VmExecutionMetrics}; use zksync_types::{ aggregated_operations::AggregatedActionType, block::BlockGasCount, ExecuteTransactionCommon, @@ -86,3 +87,7 @@ pub(super) fn gas_count_from_writes( execute: 0, } } + +pub(super) fn is_canceled(stop_receiver: &watch::Receiver) -> bool { + *stop_receiver.borrow() +} diff --git a/etc/env/base/house_keeper.toml b/etc/env/base/house_keeper.toml index c97fa1c2b67c..78364f7179ef 100644 --- a/etc/env/base/house_keeper.toml +++ b/etc/env/base/house_keeper.toml @@ -2,4 +2,3 @@ l1_batch_metrics_reporting_interval_ms = 10000 database_health_polling_interval_ms = 10000 eth_sender_health_polling_interval_ms = 10000 -state_keeper_health_polling_interval_ms = 10000 diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index 362e248ccec6..57ed996bdd44 100644 --- 
a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -303,7 +303,6 @@ house_keeper: l1_batch_metrics_reporting_interval_ms: 10000 database_health_polling_interval_ms: 10000 eth_sender_health_polling_interval_ms: 10000 - state_keeper_health_polling_interval_ms: 10000 prometheus: listener_port: 3314 From f51785c34fbeb71903f06e3e0c7437a78aae44eb Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 16:12:19 +0100 Subject: [PATCH 28/60] fix: update state keeper health at the right moment --- core/node/state_keeper/src/keeper.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/core/node/state_keeper/src/keeper.rs b/core/node/state_keeper/src/keeper.rs index 2621ae57a426..9c376489e3a3 100644 --- a/core/node/state_keeper/src/keeper.rs +++ b/core/node/state_keeper/src/keeper.rs @@ -243,9 +243,6 @@ impl ZkSyncStateKeeper { } else { None }; - - self.health_updater - .update(StateKeeperHealthDetails::from(&cursor).into()); } Err(Error::Canceled) } @@ -408,6 +405,9 @@ impl ZkSyncStateKeeper { .await .context("error waiting for new L2 block params")? 
{ + self.health_updater + .update(StateKeeperHealthDetails::from(&cursor).into()); + latency.observe(); return Ok(params); } From bb5acfc3a99676226aa28fc2d48d28b3a2852dda Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 7 Nov 2024 17:20:57 +0100 Subject: [PATCH 29/60] refactor: use ORDER BY to query last database migration --- ...bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8.json} | 4 ++-- core/lib/dal/src/system_dal.rs | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) rename core/lib/dal/.sqlx/{query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json => query-3566423188a5d6bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8.json} (77%) diff --git a/core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json b/core/lib/dal/.sqlx/query-3566423188a5d6bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8.json similarity index 77% rename from core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json rename to core/lib/dal/.sqlx/query-3566423188a5d6bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8.json index 41f5354b93fb..123afc6060a6 100644 --- a/core/lib/dal/.sqlx/query-77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1.json +++ b/core/lib/dal/.sqlx/query-3566423188a5d6bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8.json @@ -1,6 +1,6 @@ { "db_name": "PostgreSQL", - "query": "\n SELECT *\n FROM _sqlx_migrations\n WHERE _sqlx_migrations.version = (\n SELECT MAX(_sqlx_migrations.version)\n FROM _sqlx_migrations\n );\n ", + "query": "\n SELECT *\n FROM _sqlx_migrations\n ORDER BY _sqlx_migrations.version DESC\n LIMIT 1\n ", "describe": { "columns": [ { @@ -46,5 +46,5 @@ false ] }, - "hash": "77c394266974e2d66b26517394d5c8d7e96625d4e330ef38bc43f3c72a395ac1" + "hash": "3566423188a5d6bed7150f327d83613cd34138b59ef3b9271fd0bfdaddd086f8" } diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index 4b935b544eb7..3321a5af510f 100644 
--- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -104,10 +104,8 @@ impl SystemDal<'_, '_> { r#" SELECT * FROM _sqlx_migrations - WHERE _sqlx_migrations.version = ( - SELECT MAX(_sqlx_migrations.version) - FROM _sqlx_migrations - ); + ORDER BY _sqlx_migrations.version DESC + LIMIT 1 "# ) .instrument("get_last_migration") From fd35543fe875a662bdf88c68ea30490d088b6036 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 8 Nov 2024 14:18:12 +0100 Subject: [PATCH 30/60] feat: add reactive health check to eth sender --- Cargo.lock | 2 + core/lib/config/src/configs/house_keeper.rs | 1 - core/lib/config/src/testonly.rs | 1 - core/lib/env_config/src/house_keeper.rs | 2 - core/lib/protobuf_config/src/house_keeper.rs | 6 -- .../src/proto/config/house_keeper.proto | 1 - core/node/eth_sender/Cargo.toml | 2 + core/node/eth_sender/src/eth_tx_aggregator.rs | 21 +++++ core/node/eth_sender/src/eth_tx_manager.rs | 23 +++++ core/node/eth_sender/src/health.rs | 84 +++++++++++++++++ core/node/eth_sender/src/lib.rs | 1 + core/node/house_keeper/src/eth_sender.rs | 93 ------------------- core/node/house_keeper/src/lib.rs | 1 - .../layers/eth_sender/aggregator.rs | 9 ++ .../layers/eth_sender/manager.rs | 9 ++ .../implementations/layers/house_keeper.rs | 35 +------ etc/env/base/house_keeper.toml | 1 - etc/env/file_based/general.yaml | 1 - 18 files changed, 152 insertions(+), 141 deletions(-) create mode 100644 core/node/eth_sender/src/health.rs delete mode 100644 core/node/house_keeper/src/eth_sender.rs diff --git a/Cargo.lock b/Cargo.lock index bcf29610cc03..a6ae33afc5ad 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11070,6 +11070,7 @@ dependencies = [ "async-trait", "chrono", "once_cell", + "serde", "test-casing", "test-log", "thiserror", @@ -11080,6 +11081,7 @@ dependencies = [ "zksync_contracts", "zksync_dal", "zksync_eth_client", + "zksync_health_check", "zksync_l1_contract_interface", "zksync_node_fee_model", "zksync_node_test_utils", diff --git 
a/core/lib/config/src/configs/house_keeper.rs b/core/lib/config/src/configs/house_keeper.rs index 364277690709..ae2ee41a08d8 100644 --- a/core/lib/config/src/configs/house_keeper.rs +++ b/core/lib/config/src/configs/house_keeper.rs @@ -5,5 +5,4 @@ use serde::Deserialize; pub struct HouseKeeperConfig { pub l1_batch_metrics_reporting_interval_ms: u64, pub database_health_polling_interval_ms: u64, - pub eth_sender_health_polling_interval_ms: u64, } diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs index 9cc50da47e5f..63f936f6ffa6 100644 --- a/core/lib/config/src/testonly.rs +++ b/core/lib/config/src/testonly.rs @@ -638,7 +638,6 @@ impl Distribution for EncodeDist { configs::house_keeper::HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: self.sample(rng), database_health_polling_interval_ms: self.sample(rng), - eth_sender_health_polling_interval_ms: self.sample(rng), } } } diff --git a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index ed1fcdc18f47..e4c6f1bafa08 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -19,7 +19,6 @@ mod tests { HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: 10_000, database_health_polling_interval_ms: 10_000, - eth_sender_health_polling_interval_ms: 10_000, } } @@ -29,7 +28,6 @@ mod tests { let config = r#" HOUSE_KEEPER_L1_BATCH_METRICS_REPORTING_INTERVAL_MS="10000" HOUSE_KEEPER_DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" - HOUSE_KEEPER_ETH_SENDER_HEALTH_POLLING_INTERVAL_MS="10000" "#; lock.set_env(config); diff --git a/core/lib/protobuf_config/src/house_keeper.rs b/core/lib/protobuf_config/src/house_keeper.rs index 4626b7eb4c83..301b79d99a9a 100644 --- a/core/lib/protobuf_config/src/house_keeper.rs +++ b/core/lib/protobuf_config/src/house_keeper.rs @@ -17,11 +17,6 @@ impl ProtoRepr for proto::HouseKeeper { &self.database_health_polling_interval_ms, ) .context("database_health_polling_interval_ms")?, - - 
eth_sender_health_polling_interval_ms: *required( - &self.eth_sender_health_polling_interval_ms, - ) - .context("eth_sender_health_polling_interval_ms")?, }) } @@ -31,7 +26,6 @@ impl ProtoRepr for proto::HouseKeeper { this.l1_batch_metrics_reporting_interval_ms, ), database_health_polling_interval_ms: Some(this.database_health_polling_interval_ms), - eth_sender_health_polling_interval_ms: Some(this.eth_sender_health_polling_interval_ms), } } } diff --git a/core/lib/protobuf_config/src/proto/config/house_keeper.proto b/core/lib/protobuf_config/src/proto/config/house_keeper.proto index c2ddb5983023..a69e50c46627 100644 --- a/core/lib/protobuf_config/src/proto/config/house_keeper.proto +++ b/core/lib/protobuf_config/src/proto/config/house_keeper.proto @@ -18,5 +18,4 @@ message HouseKeeper { reserved 16; reserved "fri_gpu_prover_archiver_archiving_interval_ms"; reserved 17; reserved "fri_gpu_prover_archiver_archive_after_secs"; optional uint64 database_health_polling_interval_ms = 18; // required; ms - optional uint64 eth_sender_health_polling_interval_ms = 19; // required; ms } diff --git a/core/node/eth_sender/Cargo.toml b/core/node/eth_sender/Cargo.toml index a7aa88c3550e..c7264d7dca3c 100644 --- a/core/node/eth_sender/Cargo.toml +++ b/core/node/eth_sender/Cargo.toml @@ -11,12 +11,14 @@ keywords.workspace = true categories.workspace = true [dependencies] +serde.workspace = true vise.workspace = true zksync_types.workspace = true zksync_dal.workspace = true zksync_config.workspace = true zksync_contracts.workspace = true zksync_eth_client.workspace = true +zksync_health_check.workspace = true zksync_utils.workspace = true zksync_l1_contract_interface.workspace = true zksync_object_store.workspace = true diff --git a/core/node/eth_sender/src/eth_tx_aggregator.rs b/core/node/eth_sender/src/eth_tx_aggregator.rs index ac9ed4aaaadb..9696eeb9e9e0 100644 --- a/core/node/eth_sender/src/eth_tx_aggregator.rs +++ b/core/node/eth_sender/src/eth_tx_aggregator.rs @@ -3,6 +3,7 @@ 
use zksync_config::configs::eth_sender::SenderConfig; use zksync_contracts::BaseSystemContractsHashes; use zksync_dal::{Connection, ConnectionPool, Core, CoreDal}; use zksync_eth_client::{BoundEthInterface, CallFunctionArgs}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater, ReactiveHealthCheck}; use zksync_l1_contract_interface::{ i_executor::{ commit::kzg::{KzgInfo, ZK_SYNC_BYTES_PER_BLOB}, @@ -27,6 +28,7 @@ use zksync_types::{ use super::aggregated_operations::AggregatedOperation; use crate::{ + health::EthTxAggregatorHealthDetails, metrics::{PubdataKind, METRICS}, utils::agg_l1_batch_base_cost, zksync_functions::ZkSyncFunctions, @@ -65,6 +67,7 @@ pub struct EthTxAggregator { pool: ConnectionPool, settlement_mode: SettlementMode, sl_chain_id: SLChainId, + health_updater: HealthUpdater, } struct TxData { @@ -119,10 +122,14 @@ impl EthTxAggregator { pool, settlement_mode, sl_chain_id, + health_updater: ReactiveHealthCheck::new("eth_tx_aggregator").1, } } pub async fn run(mut self, stop_receiver: watch::Receiver) -> anyhow::Result<()> { + self.health_updater + .update(Health::from(HealthStatus::Ready)); + let pool = self.pool.clone(); loop { let mut storage = pool.connection_tagged("eth_sender").await.unwrap(); @@ -135,6 +142,8 @@ impl EthTxAggregator { if let Err(err) = self.loop_iteration(&mut storage).await { // Web3 API request failures can cause this, // and anything more important is already properly reported. + self.health_updater + .update(Health::from(HealthStatus::ShuttingDown)); tracing::warn!("eth_sender error {err:?}"); } @@ -431,6 +440,13 @@ impl EthTxAggregator { ) .await?; Self::report_eth_tx_saving(storage, &agg_op, &tx).await; + + self.health_updater.update( + EthTxAggregatorHealthDetails { + last_saved_tx: (&tx).into(), + } + .into(), + ); } Ok(()) } @@ -670,4 +686,9 @@ impl EthTxAggregator { ) }) } + + /// Returns the health check for eth tx aggregator. 
+ pub fn health_check(&self) -> ReactiveHealthCheck { + self.health_updater.subscribe() + } } diff --git a/core/node/eth_sender/src/eth_tx_manager.rs b/core/node/eth_sender/src/eth_tx_manager.rs index 7de91a3b7736..f34f885371db 100644 --- a/core/node/eth_sender/src/eth_tx_manager.rs +++ b/core/node/eth_sender/src/eth_tx_manager.rs @@ -6,6 +6,7 @@ use zksync_dal::{Connection, ConnectionPool, Core, CoreDal}; use zksync_eth_client::{ encode_blob_tx_with_sidecar, BoundEthInterface, ExecutedTxStatus, RawTransactionBytes, }; +use zksync_health_check::{Health, HealthStatus, HealthUpdater, ReactiveHealthCheck}; use zksync_node_fee_model::l1_gas_price::TxParamsProvider; use zksync_shared_metrics::BlockL1Stage; use zksync_types::{eth_sender::EthTx, Address, L1BlockNumber, H256, U256}; @@ -17,6 +18,7 @@ use crate::{ AbstractL1Interface, L1BlockNumbers, OperatorNonce, OperatorType, RealL1Interface, }, eth_fees_oracle::{EthFees, EthFeesOracle, GasAdjusterFeesOracle}, + health::EthTxManagerHealthDetails, metrics::TransactionType, }; @@ -31,6 +33,7 @@ pub struct EthTxManager { config: SenderConfig, fees_oracle: Box, pool: ConnectionPool, + health_updater: HealthUpdater, } impl EthTxManager { @@ -65,6 +68,7 @@ impl EthTxManager { config, fees_oracle: Box::new(fees_oracle), pool, + health_updater: ReactiveHealthCheck::new("eth_tx_manager").1, } } @@ -415,6 +419,15 @@ impl EthTxManager { ) { let receipt_block_number = tx_status.receipt.block_number.unwrap().as_u32(); if receipt_block_number <= finalized_block.0 { + self.health_updater.update( + EthTxManagerHealthDetails { + last_mined_tx: tx.into(), + tx_status: (&tx_status).into(), + finalized_block: finalized_block, + } + .into(), + ); + if tx_status.success { self.confirm_tx(storage, tx, tx_status).await; } else { @@ -516,6 +529,9 @@ impl EthTxManager { } pub async fn run(mut self, stop_receiver: watch::Receiver) -> anyhow::Result<()> { + self.health_updater + .update(Health::from(HealthStatus::Ready)); + let pool = 
self.pool.clone(); loop { @@ -523,6 +539,8 @@ impl EthTxManager { if *stop_receiver.borrow() { tracing::info!("Stop signal received, eth_tx_manager is shutting down"); + self.health_updater + .update(Health::from(HealthStatus::ShuttingDown)); break; } let operator_to_track = self.l1_interface.supported_operator_types()[0]; @@ -676,4 +694,9 @@ impl EthTxManager { } } } + + /// Returns the health check for eth tx manager. + pub fn health_check(&self) -> ReactiveHealthCheck { + self.health_updater.subscribe() + } } diff --git a/core/node/eth_sender/src/health.rs b/core/node/eth_sender/src/health.rs new file mode 100644 index 000000000000..4845192eab42 --- /dev/null +++ b/core/node/eth_sender/src/health.rs @@ -0,0 +1,84 @@ +use serde::{Deserialize, Serialize}; +use zksync_eth_client::ExecutedTxStatus; +use zksync_health_check::{Health, HealthStatus}; +use zksync_types::{ + aggregated_operations::AggregatedActionType, eth_sender::EthTx, web3::TransactionReceipt, + L1BlockNumber, Nonce, H256, +}; + +#[derive(Debug, Serialize, Deserialize)] +pub enum ActionType { + Commit, + PublishProofOnchain, + Execute, +} + +impl From for ActionType { + fn from(action_type: AggregatedActionType) -> Self { + match action_type { + AggregatedActionType::Commit => Self::Commit, + AggregatedActionType::PublishProofOnchain => Self::PublishProofOnchain, + AggregatedActionType::Execute => Self::Execute, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TxStatus { + pub tx_hash: H256, + pub success: bool, + pub receipt: TransactionReceipt, +} + +impl From<&ExecutedTxStatus> for TxStatus { + fn from(status: &ExecutedTxStatus) -> Self { + Self { + tx_hash: status.tx_hash, + success: status.success, + receipt: status.receipt.clone(), + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct EthTxAggregatorHealthDetails { + pub last_saved_tx: EthTxDetails, +} + +impl From for Health { + fn from(details: EthTxAggregatorHealthDetails) -> Self { + 
Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct EthTxDetails { + pub nonce: Nonce, + pub tx_type: ActionType, + pub created_at_timestamp: u64, + pub predicted_gas_cost: u64, +} + +impl From<&EthTx> for EthTxDetails { + fn from(tx: &EthTx) -> Self { + Self { + nonce: tx.nonce, + tx_type: tx.tx_type.into(), + created_at_timestamp: tx.created_at_timestamp, + predicted_gas_cost: tx.predicted_gas_cost, + } + } +} + +#[derive(Debug, Serialize, Deserialize)] +pub struct EthTxManagerHealthDetails { + pub last_mined_tx: EthTxDetails, + pub tx_status: TxStatus, + pub finalized_block: L1BlockNumber, +} + +impl From for Health { + fn from(details: EthTxManagerHealthDetails) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} diff --git a/core/node/eth_sender/src/lib.rs b/core/node/eth_sender/src/lib.rs index 747ece93b811..fc6076ec640b 100644 --- a/core/node/eth_sender/src/lib.rs +++ b/core/node/eth_sender/src/lib.rs @@ -3,6 +3,7 @@ mod aggregator; mod error; mod eth_tx_aggregator; mod eth_tx_manager; +mod health; mod metrics; mod publish_criterion; mod utils; diff --git a/core/node/house_keeper/src/eth_sender.rs b/core/node/house_keeper/src/eth_sender.rs deleted file mode 100644 index 6c82bd32ca5f..000000000000 --- a/core/node/house_keeper/src/eth_sender.rs +++ /dev/null @@ -1,93 +0,0 @@ -use std::cmp::max; - -use anyhow::Ok; -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use zksync_dal::{ConnectionPool, Core, CoreDal}; -use zksync_health_check::{Health, HealthStatus, HealthUpdater}; -use zksync_types::{aggregated_operations::AggregatedActionType, L1BatchNumber}; - -use crate::periodic_job::PeriodicJob; - -#[derive(Debug, Serialize, Deserialize)] -struct BatchNumbers { - commit: Option, - prove: Option, - execute: Option, -} - -#[derive(Debug, Serialize, Deserialize)] -pub struct EthSenderInfo { - failed_l1_txns: u64, - last_saved_batches: BatchNumbers, - 
last_mined_batches: BatchNumbers, - next_nonce: Option, -} - -impl From for Health { - fn from(details: EthSenderInfo) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - -#[derive(Debug)] -pub struct EthSenderHealthTask { - pub polling_interval_ms: u64, - pub connection_pool: ConnectionPool, - pub eth_sender_health_updater: HealthUpdater, -} - -#[async_trait] -impl PeriodicJob for EthSenderHealthTask { - const SERVICE_NAME: &'static str = "EthSenderHealth"; - - async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self - .connection_pool - .connection_tagged("house_keeper") - .await?; - let failed_l1_txns = conn - .eth_sender_dal() - .get_number_of_failed_transactions() - .await?; - - let eth_stats = conn.eth_sender_dal().get_eth_l1_batches().await?; - - // TODO retrieve SettlementMode from config - let next_nonce = conn.eth_sender_dal().get_next_nonce(None, false).await?; - - self.eth_sender_health_updater.update( - EthSenderInfo { - failed_l1_txns, - last_saved_batches: get_latest_batches(eth_stats.saved), - last_mined_batches: get_latest_batches(eth_stats.mined), - next_nonce, - } - .into(), - ); - Ok(()) - } - - fn polling_interval_ms(&self) -> u64 { - self.polling_interval_ms - } -} - -fn get_latest_batches(batches: Vec<(AggregatedActionType, L1BatchNumber)>) -> BatchNumbers { - let (commit_batch, prove_batch, execute_batch) = batches.into_iter().fold( - (None, None, None), - |(commit, prove, execute), (action_type, batch_number)| match action_type { - AggregatedActionType::Commit => (max(commit, Some(batch_number)), prove, execute), - AggregatedActionType::PublishProofOnchain => { - (commit, max(prove, Some(batch_number)), execute) - } - AggregatedActionType::Execute => (commit, prove, max(execute, Some(batch_number))), - }, - ); - - BatchNumbers { - commit: commit_batch, - prove: prove_batch, - execute: execute_batch, - } -} diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs 
index e42b3b626f03..3401151c24c3 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,6 +1,5 @@ pub mod blocks_state_reporter; pub mod database; -pub mod eth_sender; mod metrics; pub mod periodic_job; pub mod version; diff --git a/core/node/node_framework/src/implementations/layers/eth_sender/aggregator.rs b/core/node/node_framework/src/implementations/layers/eth_sender/aggregator.rs index 310580aeb3a3..235158544c54 100644 --- a/core/node/node_framework/src/implementations/layers/eth_sender/aggregator.rs +++ b/core/node/node_framework/src/implementations/layers/eth_sender/aggregator.rs @@ -9,6 +9,7 @@ use crate::{ implementations::resources::{ circuit_breakers::CircuitBreakersResource, eth_interface::{BoundEthInterfaceForBlobsResource, BoundEthInterfaceResource}, + healthcheck::AppHealthCheckResource, object_store::ObjectStoreResource, pools::{MasterPool, PoolResource, ReplicaPool}, }, @@ -55,6 +56,8 @@ pub struct Input { pub object_store: ObjectStoreResource, #[context(default)] pub circuit_breakers: CircuitBreakersResource, + #[context(default)] + pub app_health: AppHealthCheckResource, } #[derive(Debug, IntoContext)] @@ -133,6 +136,12 @@ impl WiringLayer for EthTxAggregatorLayer { .insert(Box::new(FailedL1TransactionChecker { pool: replica_pool })) .await; + input + .app_health + .0 + .insert_component(eth_tx_aggregator.health_check()) + .map_err(WiringError::internal)?; + Ok(Output { eth_tx_aggregator }) } } diff --git a/core/node/node_framework/src/implementations/layers/eth_sender/manager.rs b/core/node/node_framework/src/implementations/layers/eth_sender/manager.rs index 5462fa575f94..e9ce4cc19e1a 100644 --- a/core/node/node_framework/src/implementations/layers/eth_sender/manager.rs +++ b/core/node/node_framework/src/implementations/layers/eth_sender/manager.rs @@ -8,6 +8,7 @@ use crate::{ circuit_breakers::CircuitBreakersResource, eth_interface::{BoundEthInterfaceForBlobsResource, BoundEthInterfaceResource}, 
gas_adjuster::GasAdjusterResource, + healthcheck::AppHealthCheckResource, pools::{MasterPool, PoolResource, ReplicaPool}, }, service::StopReceiver, @@ -48,6 +49,8 @@ pub struct Input { pub gas_adjuster: GasAdjusterResource, #[context(default)] pub circuit_breakers: CircuitBreakersResource, + #[context(default)] + pub app_health: AppHealthCheckResource, } #[derive(Debug, IntoContext)] @@ -114,6 +117,12 @@ impl WiringLayer for EthTxManagerLayer { .insert(Box::new(FailedL1TransactionChecker { pool: replica_pool })) .await; + input + .app_health + .0 + .insert_component(eth_tx_manager.health_check()) + .map_err(WiringError::internal)?; + Ok(Output { eth_tx_manager }) } } diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 005f645f159d..820b8d649972 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -4,7 +4,7 @@ use zksync_config::configs::house_keeper::HouseKeeperConfig; use zksync_health_check::ReactiveHealthCheck; use zksync_house_keeper::{ blocks_state_reporter::L1BatchMetricsReporter, database::DatabaseHealthTask, - eth_sender::EthSenderHealthTask, periodic_job::PeriodicJob, version::NodeVersionInfo, + periodic_job::PeriodicJob, version::NodeVersionInfo, }; use crate::{ @@ -40,8 +40,6 @@ pub struct Output { pub l1_batch_metrics_reporter: L1BatchMetricsReporter, #[context(task)] pub database_health_task: DatabaseHealthTask, - #[context(task)] - pub eth_sender_health_task: EthSenderHealthTask, } impl HouseKeeperLayer { @@ -89,25 +87,9 @@ impl WiringLayer for HouseKeeperLayer { database_health_updater, }; - let (eth_sender_health_check, eth_sender_health_updater) = - ReactiveHealthCheck::new("eth_sender"); - - app_health - .insert_component(eth_sender_health_check) - .map_err(WiringError::internal)?; - - let eth_sender_health_task = EthSenderHealthTask { - 
polling_interval_ms: self - .house_keeper_config - .eth_sender_health_polling_interval_ms, - connection_pool: replica_pool.clone(), - eth_sender_health_updater, - }; - Ok(Output { l1_batch_metrics_reporter, database_health_task, - eth_sender_health_task, }) } } @@ -137,18 +119,3 @@ impl Task for DatabaseHealthTask { (*self).run(stop_receiver.0).await } } - -#[async_trait::async_trait] -impl Task for EthSenderHealthTask { - fn kind(&self) -> TaskKind { - TaskKind::UnconstrainedTask - } - - fn id(&self) -> TaskId { - "eth_sender_health".into() - } - - async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { - (*self).run(stop_receiver.0).await - } -} diff --git a/etc/env/base/house_keeper.toml b/etc/env/base/house_keeper.toml index 78364f7179ef..649a8ea87118 100644 --- a/etc/env/base/house_keeper.toml +++ b/etc/env/base/house_keeper.toml @@ -1,4 +1,3 @@ [house_keeper] l1_batch_metrics_reporting_interval_ms = 10000 database_health_polling_interval_ms = 10000 -eth_sender_health_polling_interval_ms = 10000 diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index 57ed996bdd44..146570ba0fdb 100644 --- a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -302,7 +302,6 @@ external_price_api_client: house_keeper: l1_batch_metrics_reporting_interval_ms: 10000 database_health_polling_interval_ms: 10000 - eth_sender_health_polling_interval_ms: 10000 prometheus: listener_port: 3314 From 416ea5032c90a7921ea450b89de83b27d75fd6a6 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 8 Nov 2024 17:44:35 +0100 Subject: [PATCH 31/60] style: clippy --- core/node/eth_sender/src/eth_tx_manager.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/node/eth_sender/src/eth_tx_manager.rs b/core/node/eth_sender/src/eth_tx_manager.rs index f34f885371db..0f2f60a3a42f 100644 --- a/core/node/eth_sender/src/eth_tx_manager.rs +++ b/core/node/eth_sender/src/eth_tx_manager.rs @@ -423,7 +423,7 @@ impl 
EthTxManager { EthTxManagerHealthDetails { last_mined_tx: tx.into(), tx_status: (&tx_status).into(), - finalized_block: finalized_block, + finalized_block, } .into(), ); From 8d4c17617317586cca9af1993ca2670d0461a540 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 10:26:46 +0100 Subject: [PATCH 32/60] style: move field before reserved ones --- core/lib/protobuf_config/src/proto/config/house_keeper.proto | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/lib/protobuf_config/src/proto/config/house_keeper.proto b/core/lib/protobuf_config/src/proto/config/house_keeper.proto index a69e50c46627..924678f09868 100644 --- a/core/lib/protobuf_config/src/proto/config/house_keeper.proto +++ b/core/lib/protobuf_config/src/proto/config/house_keeper.proto @@ -4,6 +4,7 @@ package zksync.config.house_keeper; message HouseKeeper { optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms + optional uint64 database_health_polling_interval_ms = 18; // required; ms reserved 2; reserved "gpu_prover_queue_reporting_interval_ms"; reserved 3; reserved "prover_job_retrying_interval_ms"; reserved 4; reserved "prover_stats_reporting_interval_ms"; @@ -17,5 +18,4 @@ message HouseKeeper { reserved 15; reserved "prover_job_archiver_archive_after_secs"; reserved 16; reserved "fri_gpu_prover_archiver_archiving_interval_ms"; reserved 17; reserved "fri_gpu_prover_archiver_archive_after_secs"; - optional uint64 database_health_polling_interval_ms = 18; // required; ms } From eb2fb68bfbda2841386ac3c7e31a0907e06a6bed Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 10:32:04 +0100 Subject: [PATCH 33/60] refactor: rename PostgresMetricsLayer to PostgresLayer --- core/bin/external_node/src/node_builder.rs | 4 ++-- core/bin/zksync_server/src/node_builder.rs | 4 ++-- .../src/implementations/layers/postgres_metrics.rs | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/core/bin/external_node/src/node_builder.rs 
b/core/bin/external_node/src/node_builder.rs index b7f6f8039025..9d1516f6fac5 100644 --- a/core/bin/external_node/src/node_builder.rs +++ b/core/bin/external_node/src/node_builder.rs @@ -33,7 +33,7 @@ use zksync_node_framework::{ NodeStorageInitializerLayer, }, pools_layer::PoolsLayerBuilder, - postgres_metrics::PostgresMetricsLayer, + postgres_metrics::PostgresLayer, prometheus_exporter::PrometheusExporterLayer, pruning::PruningLayer, query_eth_client::QueryEthClientLayer, @@ -126,7 +126,7 @@ impl ExternalNodeBuilder { } fn add_postgres_metrics_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresMetricsLayer); + self.node.add_layer(PostgresLayer); Ok(self) } diff --git a/core/bin/zksync_server/src/node_builder.rs b/core/bin/zksync_server/src/node_builder.rs index 32478ede5bf8..e463291cc27d 100644 --- a/core/bin/zksync_server/src/node_builder.rs +++ b/core/bin/zksync_server/src/node_builder.rs @@ -48,7 +48,7 @@ use zksync_node_framework::{ object_store::ObjectStoreLayer, pk_signing_eth_client::PKSigningEthClientLayer, pools_layer::PoolsLayerBuilder, - postgres_metrics::PostgresMetricsLayer, + postgres_metrics::PostgresLayer, prometheus_exporter::PrometheusExporterLayer, proof_data_handler::ProofDataHandlerLayer, query_eth_client::QueryEthClientLayer, @@ -139,7 +139,7 @@ impl MainNodeBuilder { } fn add_postgres_metrics_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresMetricsLayer); + self.node.add_layer(PostgresLayer); Ok(self) } diff --git a/core/node/node_framework/src/implementations/layers/postgres_metrics.rs b/core/node/node_framework/src/implementations/layers/postgres_metrics.rs index 238bee578678..11b9503bad18 100644 --- a/core/node/node_framework/src/implementations/layers/postgres_metrics.rs +++ b/core/node/node_framework/src/implementations/layers/postgres_metrics.rs @@ -14,7 +14,7 @@ const SCRAPE_INTERVAL: Duration = Duration::from_secs(60); /// Wiring layer for the Postgres metrics exporter. 
#[derive(Debug)] -pub struct PostgresMetricsLayer; +pub struct PostgresLayer; #[derive(Debug, FromContext)] #[context(crate = crate)] @@ -30,7 +30,7 @@ pub struct Output { } #[async_trait::async_trait] -impl WiringLayer for PostgresMetricsLayer { +impl WiringLayer for PostgresLayer { type Input = Input; type Output = Output; From 80986e496fdd38d63cc1cda1d525ef4f8eb22975 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 10:38:13 +0100 Subject: [PATCH 34/60] refactor: rename postgres_metrics_layer to postgres_layer --- core/bin/external_node/src/node_builder.rs | 4 ++-- core/bin/zksync_server/src/node_builder.rs | 6 ++---- .../src/implementations/layers/postgres_metrics.rs | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/core/bin/external_node/src/node_builder.rs b/core/bin/external_node/src/node_builder.rs index 9d1516f6fac5..7d0ede0b064b 100644 --- a/core/bin/external_node/src/node_builder.rs +++ b/core/bin/external_node/src/node_builder.rs @@ -125,7 +125,7 @@ impl ExternalNodeBuilder { Ok(self) } - fn add_postgres_metrics_layer(mut self) -> anyhow::Result { + fn add_postgres_layer(mut self) -> anyhow::Result { self.node.add_layer(PostgresLayer); Ok(self) } @@ -577,7 +577,7 @@ impl ExternalNodeBuilder { // so until we have a dedicated component for "auxiliary" tasks, // it's responsible for things like metrics. self = self - .add_postgres_metrics_layer()? + .add_postgres_layer()? .add_external_node_metrics_layer()?; // We assign the storage initialization to the core, as it's considered to be // the "main" component. 
diff --git a/core/bin/zksync_server/src/node_builder.rs b/core/bin/zksync_server/src/node_builder.rs index e463291cc27d..27b4480f5261 100644 --- a/core/bin/zksync_server/src/node_builder.rs +++ b/core/bin/zksync_server/src/node_builder.rs @@ -138,7 +138,7 @@ impl MainNodeBuilder { Ok(self) } - fn add_postgres_metrics_layer(mut self) -> anyhow::Result { + fn add_postgres_layer(mut self) -> anyhow::Result { self.node.add_layer(PostgresLayer); Ok(self) } @@ -752,9 +752,7 @@ impl MainNodeBuilder { self = self.add_eth_tx_manager_layer()?; } Component::Housekeeper => { - self = self - .add_house_keeper_layer()? - .add_postgres_metrics_layer()?; + self = self.add_house_keeper_layer()?.add_postgres_layer()?; } Component::ProofDataHandler => { self = self.add_proof_data_handler_layer()?; diff --git a/core/node/node_framework/src/implementations/layers/postgres_metrics.rs b/core/node/node_framework/src/implementations/layers/postgres_metrics.rs index 11b9503bad18..e24bc663df31 100644 --- a/core/node/node_framework/src/implementations/layers/postgres_metrics.rs +++ b/core/node/node_framework/src/implementations/layers/postgres_metrics.rs @@ -35,7 +35,7 @@ impl WiringLayer for PostgresLayer { type Output = Output; fn layer_name(&self) -> &'static str { - "postgres_metrics_layer" + "postgres_layer" } async fn wire(self, input: Self::Input) -> Result { From 9bffd99bdd1ce1866bdf3b2b7a57586995bbaf0e Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 10:40:58 +0100 Subject: [PATCH 35/60] refactor: rename module postgres_layer to postgres --- core/bin/external_node/src/node_builder.rs | 2 +- core/bin/zksync_server/src/node_builder.rs | 2 +- core/node/node_framework/src/implementations/layers/mod.rs | 2 +- .../implementations/layers/{postgres_metrics.rs => postgres.rs} | 0 4 files changed, 3 insertions(+), 3 deletions(-) rename core/node/node_framework/src/implementations/layers/{postgres_metrics.rs => postgres.rs} (100%) diff --git 
a/core/bin/external_node/src/node_builder.rs b/core/bin/external_node/src/node_builder.rs index 7d0ede0b064b..769d19f29e2b 100644 --- a/core/bin/external_node/src/node_builder.rs +++ b/core/bin/external_node/src/node_builder.rs @@ -33,7 +33,7 @@ use zksync_node_framework::{ NodeStorageInitializerLayer, }, pools_layer::PoolsLayerBuilder, - postgres_metrics::PostgresLayer, + postgres::PostgresLayer, prometheus_exporter::PrometheusExporterLayer, pruning::PruningLayer, query_eth_client::QueryEthClientLayer, diff --git a/core/bin/zksync_server/src/node_builder.rs b/core/bin/zksync_server/src/node_builder.rs index 27b4480f5261..a9a8dfa6373a 100644 --- a/core/bin/zksync_server/src/node_builder.rs +++ b/core/bin/zksync_server/src/node_builder.rs @@ -48,7 +48,7 @@ use zksync_node_framework::{ object_store::ObjectStoreLayer, pk_signing_eth_client::PKSigningEthClientLayer, pools_layer::PoolsLayerBuilder, - postgres_metrics::PostgresLayer, + postgres::PostgresLayer, prometheus_exporter::PrometheusExporterLayer, proof_data_handler::ProofDataHandlerLayer, query_eth_client::QueryEthClientLayer, diff --git a/core/node/node_framework/src/implementations/layers/mod.rs b/core/node/node_framework/src/implementations/layers/mod.rs index 11a62c9333b2..28a6f65600ab 100644 --- a/core/node/node_framework/src/implementations/layers/mod.rs +++ b/core/node/node_framework/src/implementations/layers/mod.rs @@ -24,7 +24,7 @@ pub mod node_storage_init; pub mod object_store; pub mod pk_signing_eth_client; pub mod pools_layer; -pub mod postgres_metrics; +pub mod postgres; pub mod prometheus_exporter; pub mod proof_data_handler; pub mod pruning; diff --git a/core/node/node_framework/src/implementations/layers/postgres_metrics.rs b/core/node/node_framework/src/implementations/layers/postgres.rs similarity index 100% rename from core/node/node_framework/src/implementations/layers/postgres_metrics.rs rename to core/node/node_framework/src/implementations/layers/postgres.rs From 
5e235b13c54baac79e8dbc05b0bdd59c1650b407 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 13:54:22 +0100 Subject: [PATCH 36/60] refactor: move database health check task to postgres layer --- Cargo.lock | 1 + core/bin/external_node/src/node_builder.rs | 2 +- core/bin/zksync_server/src/node_builder.rs | 2 +- core/lib/config/src/configs/house_keeper.rs | 1 - core/lib/config/src/testonly.rs | 1 - core/lib/env_config/src/house_keeper.rs | 2 - core/lib/protobuf_config/src/house_keeper.rs | 6 - .../src/proto/config/house_keeper.proto | 1 - core/node/house_keeper/src/database.rs | 45 ------- core/node/house_keeper/src/lib.rs | 1 - core/node/node_framework/Cargo.toml | 1 + .../implementations/layers/house_keeper.rs | 52 +------- .../src/implementations/layers/postgres.rs | 121 ++++++++++++++++-- etc/env/base/house_keeper.toml | 1 - etc/env/file_based/general.yaml | 1 - 15 files changed, 121 insertions(+), 117 deletions(-) delete mode 100644 core/node/house_keeper/src/database.rs diff --git a/Cargo.lock b/Cargo.lock index 3dcd7982ea40..5e71ea310a7d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11598,6 +11598,7 @@ dependencies = [ "futures 0.3.31", "pin-project-lite", "semver 1.0.23", + "serde", "thiserror", "tokio", "tracing", diff --git a/core/bin/external_node/src/node_builder.rs b/core/bin/external_node/src/node_builder.rs index 769d19f29e2b..ebecbf513d22 100644 --- a/core/bin/external_node/src/node_builder.rs +++ b/core/bin/external_node/src/node_builder.rs @@ -126,7 +126,7 @@ impl ExternalNodeBuilder { } fn add_postgres_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresLayer); + self.node.add_layer(PostgresLayer::default()); Ok(self) } diff --git a/core/bin/zksync_server/src/node_builder.rs b/core/bin/zksync_server/src/node_builder.rs index a9a8dfa6373a..48211f61e293 100644 --- a/core/bin/zksync_server/src/node_builder.rs +++ b/core/bin/zksync_server/src/node_builder.rs @@ -139,7 +139,7 @@ impl MainNodeBuilder { } fn 
add_postgres_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresLayer); + self.node.add_layer(PostgresLayer::default()); Ok(self) } diff --git a/core/lib/config/src/configs/house_keeper.rs b/core/lib/config/src/configs/house_keeper.rs index ae2ee41a08d8..39e304562fa0 100644 --- a/core/lib/config/src/configs/house_keeper.rs +++ b/core/lib/config/src/configs/house_keeper.rs @@ -4,5 +4,4 @@ use serde::Deserialize; #[derive(Debug, Deserialize, Clone, PartialEq)] pub struct HouseKeeperConfig { pub l1_batch_metrics_reporting_interval_ms: u64, - pub database_health_polling_interval_ms: u64, } diff --git a/core/lib/config/src/testonly.rs b/core/lib/config/src/testonly.rs index c0421d8fe8c3..93d502cc4e8a 100644 --- a/core/lib/config/src/testonly.rs +++ b/core/lib/config/src/testonly.rs @@ -636,7 +636,6 @@ impl Distribution for EncodeDist { fn sample(&self, rng: &mut R) -> configs::house_keeper::HouseKeeperConfig { configs::house_keeper::HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: self.sample(rng), - database_health_polling_interval_ms: self.sample(rng), } } } diff --git a/core/lib/env_config/src/house_keeper.rs b/core/lib/env_config/src/house_keeper.rs index e4c6f1bafa08..1a1ff4d27de2 100644 --- a/core/lib/env_config/src/house_keeper.rs +++ b/core/lib/env_config/src/house_keeper.rs @@ -18,7 +18,6 @@ mod tests { fn expected_config() -> HouseKeeperConfig { HouseKeeperConfig { l1_batch_metrics_reporting_interval_ms: 10_000, - database_health_polling_interval_ms: 10_000, } } @@ -27,7 +26,6 @@ mod tests { let mut lock = MUTEX.lock(); let config = r#" HOUSE_KEEPER_L1_BATCH_METRICS_REPORTING_INTERVAL_MS="10000" - HOUSE_KEEPER_DATABASE_HEALTH_POLLING_INTERVAL_MS="10000" "#; lock.set_env(config); diff --git a/core/lib/protobuf_config/src/house_keeper.rs b/core/lib/protobuf_config/src/house_keeper.rs index 301b79d99a9a..e40fd1b37dc7 100644 --- a/core/lib/protobuf_config/src/house_keeper.rs +++ b/core/lib/protobuf_config/src/house_keeper.rs @@ -12,11 
+12,6 @@ impl ProtoRepr for proto::HouseKeeper { &self.l1_batch_metrics_reporting_interval_ms, ) .context("l1_batch_metrics_reporting_interval_ms")?, - - database_health_polling_interval_ms: *required( - &self.database_health_polling_interval_ms, - ) - .context("database_health_polling_interval_ms")?, }) } @@ -25,7 +20,6 @@ impl ProtoRepr for proto::HouseKeeper { l1_batch_metrics_reporting_interval_ms: Some( this.l1_batch_metrics_reporting_interval_ms, ), - database_health_polling_interval_ms: Some(this.database_health_polling_interval_ms), } } } diff --git a/core/lib/protobuf_config/src/proto/config/house_keeper.proto b/core/lib/protobuf_config/src/proto/config/house_keeper.proto index 924678f09868..c3a4ca8ad672 100644 --- a/core/lib/protobuf_config/src/proto/config/house_keeper.proto +++ b/core/lib/protobuf_config/src/proto/config/house_keeper.proto @@ -4,7 +4,6 @@ package zksync.config.house_keeper; message HouseKeeper { optional uint64 l1_batch_metrics_reporting_interval_ms = 1; // required; ms - optional uint64 database_health_polling_interval_ms = 18; // required; ms reserved 2; reserved "gpu_prover_queue_reporting_interval_ms"; reserved 3; reserved "prover_job_retrying_interval_ms"; reserved 4; reserved "prover_stats_reporting_interval_ms"; diff --git a/core/node/house_keeper/src/database.rs b/core/node/house_keeper/src/database.rs deleted file mode 100644 index 26416be7ef07..000000000000 --- a/core/node/house_keeper/src/database.rs +++ /dev/null @@ -1,45 +0,0 @@ -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use zksync_dal::{system_dal::DatabaseMigration, ConnectionPool, Core, CoreDal}; -use zksync_health_check::{Health, HealthStatus, HealthUpdater}; - -use crate::periodic_job::PeriodicJob; - -#[derive(Debug, Serialize, Deserialize)] -pub struct DatabaseInfo { - last_migration: DatabaseMigration, -} - -impl From for Health { - fn from(details: DatabaseInfo) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - 
-#[derive(Debug)] -pub struct DatabaseHealthTask { - pub polling_interval_ms: u64, - pub connection_pool: ConnectionPool, - pub database_health_updater: HealthUpdater, -} - -#[async_trait] -impl PeriodicJob for DatabaseHealthTask { - const SERVICE_NAME: &'static str = "DatabaseHealth"; - - async fn run_routine_task(&mut self) -> anyhow::Result<()> { - let mut conn = self - .connection_pool - .connection_tagged("house_keeper") - .await?; - let last_migration = conn.system_dal().get_last_migration().await?; - - self.database_health_updater - .update(DatabaseInfo { last_migration }.into()); - Ok(()) - } - - fn polling_interval_ms(&self) -> u64 { - self.polling_interval_ms - } -} diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 3401151c24c3..2326b0a6e2d2 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,5 +1,4 @@ pub mod blocks_state_reporter; -pub mod database; mod metrics; pub mod periodic_job; pub mod version; diff --git a/core/node/node_framework/Cargo.toml b/core/node/node_framework/Cargo.toml index d85f3dc7c8e9..2899b532038e 100644 --- a/core/node/node_framework/Cargo.toml +++ b/core/node/node_framework/Cargo.toml @@ -61,6 +61,7 @@ thiserror.workspace = true async-trait.workspace = true futures.workspace = true anyhow.workspace = true +serde.workspace = true tokio = { workspace = true, features = ["rt"] } ctrlc.workspace = true semver.workspace = true diff --git a/core/node/node_framework/src/implementations/layers/house_keeper.rs b/core/node/node_framework/src/implementations/layers/house_keeper.rs index 820b8d649972..af59a73554ac 100644 --- a/core/node/node_framework/src/implementations/layers/house_keeper.rs +++ b/core/node/node_framework/src/implementations/layers/house_keeper.rs @@ -1,19 +1,12 @@ -use std::sync::Arc; - use zksync_config::configs::house_keeper::HouseKeeperConfig; -use zksync_health_check::ReactiveHealthCheck; use zksync_house_keeper::{ - 
blocks_state_reporter::L1BatchMetricsReporter, database::DatabaseHealthTask, - periodic_job::PeriodicJob, version::NodeVersionInfo, + blocks_state_reporter::L1BatchMetricsReporter, periodic_job::PeriodicJob, }; use crate::{ - implementations::resources::{ - healthcheck::AppHealthCheckResource, - pools::{PoolResource, ReplicaPool}, - }, + implementations::resources::pools::{PoolResource, ReplicaPool}, service::StopReceiver, - task::{Task, TaskId, TaskKind}, + task::{Task, TaskId}, wiring_layer::{WiringError, WiringLayer}, FromContext, IntoContext, }; @@ -29,8 +22,6 @@ pub struct HouseKeeperLayer { #[context(crate = crate)] pub struct Input { pub replica_pool: PoolResource, - #[context(default)] - pub app_health: AppHealthCheckResource, } #[derive(Debug, IntoContext)] @@ -38,8 +29,6 @@ pub struct Input { pub struct Output { #[context(task)] pub l1_batch_metrics_reporter: L1BatchMetricsReporter, - #[context(task)] - pub database_health_task: DatabaseHealthTask, } impl HouseKeeperLayer { @@ -67,29 +56,11 @@ impl WiringLayer for HouseKeeperLayer { let l1_batch_metrics_reporter = L1BatchMetricsReporter::new( self.house_keeper_config .l1_batch_metrics_reporting_interval_ms, - replica_pool.clone(), + replica_pool, ); - let app_health = input.app_health.0; - app_health - .insert_custom_component(Arc::new(NodeVersionInfo::default())) - .map_err(WiringError::internal)?; - - let (database_health_check, database_health_updater) = ReactiveHealthCheck::new("database"); - - app_health - .insert_component(database_health_check) - .map_err(WiringError::internal)?; - - let database_health_task = DatabaseHealthTask { - polling_interval_ms: self.house_keeper_config.database_health_polling_interval_ms, - connection_pool: replica_pool.clone(), - database_health_updater, - }; - Ok(Output { l1_batch_metrics_reporter, - database_health_task, }) } } @@ -104,18 +75,3 @@ impl Task for L1BatchMetricsReporter { (*self).run(stop_receiver.0).await } } - -#[async_trait::async_trait] -impl Task for 
DatabaseHealthTask { - fn kind(&self) -> TaskKind { - TaskKind::UnconstrainedTask - } - - fn id(&self) -> TaskId { - "database_health".into() - } - - async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { - (*self).run(stop_receiver.0).await - } -} diff --git a/core/node/node_framework/src/implementations/layers/postgres.rs b/core/node/node_framework/src/implementations/layers/postgres.rs index e24bc663df31..d5521db252c7 100644 --- a/core/node/node_framework/src/implementations/layers/postgres.rs +++ b/core/node/node_framework/src/implementations/layers/postgres.rs @@ -1,9 +1,17 @@ use std::time::Duration; -use zksync_dal::{metrics::PostgresMetrics, ConnectionPool, Core}; +use serde::{Deserialize, Serialize}; +use tokio::sync::watch; +use zksync_dal::{ + metrics::PostgresMetrics, system_dal::DatabaseMigration, ConnectionPool, Core, CoreDal, +}; +use zksync_health_check::{Health, HealthStatus, HealthUpdater, ReactiveHealthCheck}; use crate::{ - implementations::resources::pools::{PoolResource, ReplicaPool}, + implementations::resources::{ + healthcheck::AppHealthCheckResource, + pools::{PoolResource, ReplicaPool}, + }, service::StopReceiver, task::{Task, TaskId, TaskKind}, wiring_layer::{WiringError, WiringLayer}, @@ -12,21 +20,40 @@ use crate::{ const SCRAPE_INTERVAL: Duration = Duration::from_secs(60); +#[derive(Debug, Deserialize, Clone, PartialEq)] +pub struct Config { + pub polling_interval_ms: u64, +} + +impl Default for Config { + fn default() -> Self { + Self { + polling_interval_ms: 10_000, + } + } +} + /// Wiring layer for the Postgres metrics exporter. 
-#[derive(Debug)] -pub struct PostgresLayer; +#[derive(Debug, Default)] +pub struct PostgresLayer { + config: Config, +} #[derive(Debug, FromContext)] #[context(crate = crate)] pub struct Input { pub replica_pool: PoolResource, + #[context(default)] + pub app_health: AppHealthCheckResource, } #[derive(Debug, IntoContext)] #[context(crate = crate)] pub struct Output { #[context(task)] - pub task: PostgresMetricsScrapingTask, + pub metrics_task: PostgresMetricsScrapingTask, + #[context(task)] + pub health_task: DatabaseHealthTask, } #[async_trait::async_trait] @@ -39,10 +66,28 @@ impl WiringLayer for PostgresLayer { } async fn wire(self, input: Self::Input) -> Result { - let pool_for_metrics = input.replica_pool.get_singleton().await?; - let task = PostgresMetricsScrapingTask { pool_for_metrics }; + let pool = input.replica_pool.get().await?; + let metrics_task = PostgresMetricsScrapingTask { + pool_for_metrics: pool.clone(), + }; - Ok(Output { task }) + let app_health = input.app_health.0; + let (database_health_check, updater) = ReactiveHealthCheck::new("database"); + + app_health + .insert_component(database_health_check) + .map_err(WiringError::internal)?; + + let health_task = DatabaseHealthTask { + polling_interval_ms: self.config.polling_interval_ms, + connection_pool: pool, + updater, + }; + + Ok(Output { + metrics_task, + health_task, + }) } } @@ -73,3 +118,63 @@ impl Task for PostgresMetricsScrapingTask { Ok(()) } } + +#[derive(Debug, Serialize, Deserialize)] +pub struct DatabaseInfo { + last_migration: DatabaseMigration, +} + +impl From for Health { + fn from(details: DatabaseInfo) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[derive(Debug)] +pub struct DatabaseHealthTask { + pub polling_interval_ms: u64, + pub connection_pool: ConnectionPool, + pub updater: HealthUpdater, +} + +impl DatabaseHealthTask { + async fn run(self, mut stop_receiver: watch::Receiver) -> anyhow::Result<()> + where + Self: Sized, + { + let timeout = 
Duration::from_millis(self.polling_interval_ms); + let mut conn = self + .connection_pool + .connection_tagged("postgres_healthcheck") + .await?; + + tracing::info!("Starting database healthcheck with frequency: {timeout:?}",); + + while !*stop_receiver.borrow_and_update() { + let last_migration = conn.system_dal().get_last_migration().await?; + self.updater.update(DatabaseInfo { last_migration }.into()); + + // Error here corresponds to a timeout w/o `stop_receiver` changed; we're OK with this. + tokio::time::timeout(timeout, stop_receiver.changed()) + .await + .ok(); + } + tracing::info!("Stop signal received; database healthcheck is shut down"); + Ok(()) + } +} + +#[async_trait::async_trait] +impl Task for DatabaseHealthTask { + fn kind(&self) -> TaskKind { + TaskKind::UnconstrainedTask + } + + fn id(&self) -> TaskId { + "database_health".into() + } + + async fn run(self: Box, stop_receiver: StopReceiver) -> anyhow::Result<()> { + (*self).run(stop_receiver.0).await + } +} diff --git a/etc/env/base/house_keeper.toml b/etc/env/base/house_keeper.toml index 649a8ea87118..6f86561d1c60 100644 --- a/etc/env/base/house_keeper.toml +++ b/etc/env/base/house_keeper.toml @@ -1,3 +1,2 @@ [house_keeper] l1_batch_metrics_reporting_interval_ms = 10000 -database_health_polling_interval_ms = 10000 diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index c41b34423920..50f1be4e6e74 100644 --- a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -298,7 +298,6 @@ external_price_api_client: house_keeper: l1_batch_metrics_reporting_interval_ms: 10000 - database_health_polling_interval_ms: 10000 prometheus: listener_port: 3314 From 884b8649362189211c72b31b9048f512e6125367 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 14:05:06 +0100 Subject: [PATCH 37/60] feat: implement Serialize and Deserialize directly on AggregatedActionType --- core/lib/types/src/aggregated_operations.rs | 4 +++- 
core/node/eth_sender/src/health.rs | 19 +------------------ 2 files changed, 4 insertions(+), 19 deletions(-) diff --git a/core/lib/types/src/aggregated_operations.rs b/core/lib/types/src/aggregated_operations.rs index dadfad265cb2..44b730c929a3 100644 --- a/core/lib/types/src/aggregated_operations.rs +++ b/core/lib/types/src/aggregated_operations.rs @@ -1,6 +1,8 @@ use std::{fmt, str::FromStr}; -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] pub enum AggregatedActionType { Commit, PublishProofOnchain, diff --git a/core/node/eth_sender/src/health.rs b/core/node/eth_sender/src/health.rs index 4845192eab42..c23d81a4c537 100644 --- a/core/node/eth_sender/src/health.rs +++ b/core/node/eth_sender/src/health.rs @@ -6,23 +6,6 @@ use zksync_types::{ L1BlockNumber, Nonce, H256, }; -#[derive(Debug, Serialize, Deserialize)] -pub enum ActionType { - Commit, - PublishProofOnchain, - Execute, -} - -impl From for ActionType { - fn from(action_type: AggregatedActionType) -> Self { - match action_type { - AggregatedActionType::Commit => Self::Commit, - AggregatedActionType::PublishProofOnchain => Self::PublishProofOnchain, - AggregatedActionType::Execute => Self::Execute, - } - } -} - #[derive(Debug, Clone, Serialize, Deserialize)] pub struct TxStatus { pub tx_hash: H256, @@ -54,7 +37,7 @@ impl From for Health { #[derive(Debug, Serialize, Deserialize)] pub struct EthTxDetails { pub nonce: Nonce, - pub tx_type: ActionType, + pub tx_type: AggregatedActionType, pub created_at_timestamp: u64, pub predicted_gas_cost: u64, } From c701399409ed99ccc4eb19f03e7ce00a1a301e7d Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 14:18:29 +0100 Subject: [PATCH 38/60] refactor: make DatabaseHealthTask fields private --- .../node_framework/src/implementations/layers/postgres.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/core/node/node_framework/src/implementations/layers/postgres.rs b/core/node/node_framework/src/implementations/layers/postgres.rs index d5521db252c7..b4a22f8ef43b 100644 --- a/core/node/node_framework/src/implementations/layers/postgres.rs +++ b/core/node/node_framework/src/implementations/layers/postgres.rs @@ -132,9 +132,9 @@ impl From for Health { #[derive(Debug)] pub struct DatabaseHealthTask { - pub polling_interval_ms: u64, - pub connection_pool: ConnectionPool, - pub updater: HealthUpdater, + polling_interval_ms: u64, + connection_pool: ConnectionPool, + updater: HealthUpdater, } impl DatabaseHealthTask { From 53cca2fb8d510e0cea578a02c73a987bb48909be Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 14:19:36 +0100 Subject: [PATCH 39/60] refactor: remove redundant health status updates --- core/node/state_keeper/src/keeper.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/core/node/state_keeper/src/keeper.rs b/core/node/state_keeper/src/keeper.rs index 72bf61ca7f45..f32fc9815b39 100644 --- a/core/node/state_keeper/src/keeper.rs +++ b/core/node/state_keeper/src/keeper.rs @@ -96,14 +96,8 @@ impl ZkSyncStateKeeper { pub async fn run(mut self, stop_receiver: watch::Receiver) -> anyhow::Result<()> { match self.run_inner(stop_receiver).await { Ok(_) => unreachable!(), - Err(Error::Fatal(err)) => { - self.health_updater - .update(Health::from(HealthStatus::ShuttingDown)); - Err(err).context("state_keeper failed") - } + Err(Error::Fatal(err)) => Err(err).context("state_keeper failed"), Err(Error::Canceled) => { - self.health_updater - .update(Health::from(HealthStatus::ShuttingDown)); tracing::info!("Stop signal received, state keeper is shutting down"); Ok(()) } From 9fe98d2fd980f07bc741178d93fb07d78099b7ee Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 15:42:55 +0100 Subject: [PATCH 40/60] feat: make health mod private --- core/node/state_keeper/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/core/node/state_keeper/src/lib.rs b/core/node/state_keeper/src/lib.rs index 65637a8da6ec..323580ab0434 100644 --- a/core/node/state_keeper/src/lib.rs +++ b/core/node/state_keeper/src/lib.rs @@ -12,7 +12,7 @@ pub use self::{ }; pub mod executor; -pub mod health; +mod health; pub mod io; mod keeper; mod mempool_actor; From 36ad7c5c2c4247510c52767f41f08ee7591d02cc Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 15:57:11 +0100 Subject: [PATCH 41/60] refactor: remove StateKeeperTask constructor --- .../layers/state_keeper/mod.rs | 22 +++---------------- 1 file changed, 3 insertions(+), 19 deletions(-) diff --git a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs index 9a323a84e062..754fb67893f8 100644 --- a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs +++ b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs @@ -103,7 +103,7 @@ impl WiringLayer for StateKeeperLayer { self.rocksdb_options, ); - let state_keeper = StateKeeperTask::new( + let state_keeper = ZkSyncStateKeeper::new( io, batch_executor_base, output_handler, @@ -111,6 +111,8 @@ impl WiringLayer for StateKeeperLayer { Arc::new(storage_factory), ); + let state_keeper = StateKeeperTask { state_keeper }; + input .app_health .0 @@ -137,24 +139,6 @@ pub struct StateKeeperTask { } impl StateKeeperTask { - pub fn new( - io: Box, - executor_factory: Box>, - output_handler: OutputHandler, - sealer: Arc, - storage_factory: Arc, - ) -> Self { - let state_keeper = ZkSyncStateKeeper::new( - io, - executor_factory, - output_handler, - sealer, - storage_factory, - ); - - Self { state_keeper } - } - /// Returns the health check for state keeper. 
pub fn health_check(&self) -> ReactiveHealthCheck { self.state_keeper.health_check() From 1585d5dfbbe206e2d67a52c52d1b2910b244f409 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Wed, 13 Nov 2024 18:11:35 +0100 Subject: [PATCH 42/60] refactor: use getter for health updater --- .../src/implementations/layers/postgres.rs | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/core/node/node_framework/src/implementations/layers/postgres.rs b/core/node/node_framework/src/implementations/layers/postgres.rs index b4a22f8ef43b..f0bfe6b9fcc2 100644 --- a/core/node/node_framework/src/implementations/layers/postgres.rs +++ b/core/node/node_framework/src/implementations/layers/postgres.rs @@ -22,7 +22,7 @@ const SCRAPE_INTERVAL: Duration = Duration::from_secs(60); #[derive(Debug, Deserialize, Clone, PartialEq)] pub struct Config { - pub polling_interval_ms: u64, + polling_interval_ms: u64, } impl Default for Config { @@ -72,18 +72,12 @@ impl WiringLayer for PostgresLayer { }; let app_health = input.app_health.0; - let (database_health_check, updater) = ReactiveHealthCheck::new("database"); + let health_task = DatabaseHealthTask::new(self.config.polling_interval_ms, pool); app_health - .insert_component(database_health_check) + .insert_component(health_task.health_check()) .map_err(WiringError::internal)?; - let health_task = DatabaseHealthTask { - polling_interval_ms: self.config.polling_interval_ms, - connection_pool: pool, - updater, - }; - Ok(Output { metrics_task, health_task, @@ -138,6 +132,14 @@ pub struct DatabaseHealthTask { } impl DatabaseHealthTask { + fn new(polling_interval_ms: u64, connection_pool: ConnectionPool) -> Self { + Self { + polling_interval_ms, + connection_pool, + updater: ReactiveHealthCheck::new("database").1, + } + } + async fn run(self, mut stop_receiver: watch::Receiver) -> anyhow::Result<()> where Self: Sized, @@ -162,6 +164,10 @@ impl DatabaseHealthTask { tracing::info!("Stop signal received; database 
healthcheck is shut down"); Ok(()) } + + pub fn health_check(&self) -> ReactiveHealthCheck { + self.updater.subscribe() + } } #[async_trait::async_trait] From f9e2ebfba76da480a4997b6e205b5df5fdc49416 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 14 Nov 2024 10:15:17 +0100 Subject: [PATCH 43/60] refactor: clippy --- core/node/eth_sender/src/health.rs | 2 +- .../src/implementations/layers/state_keeper/mod.rs | 8 ++------ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/core/node/eth_sender/src/health.rs b/core/node/eth_sender/src/health.rs index c23d81a4c537..58d949ff905b 100644 --- a/core/node/eth_sender/src/health.rs +++ b/core/node/eth_sender/src/health.rs @@ -46,7 +46,7 @@ impl From<&EthTx> for EthTxDetails { fn from(tx: &EthTx) -> Self { Self { nonce: tx.nonce, - tx_type: tx.tx_type.into(), + tx_type: tx.tx_type, created_at_timestamp: tx.created_at_timestamp, predicted_gas_cost: tx.predicted_gas_cost, } diff --git a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs index 754fb67893f8..6f21a321eb1a 100644 --- a/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs +++ b/core/node/node_framework/src/implementations/layers/state_keeper/mod.rs @@ -2,14 +2,10 @@ use std::sync::Arc; use anyhow::Context; use zksync_health_check::ReactiveHealthCheck; +use zksync_state::AsyncCatchupTask; pub use zksync_state::RocksdbStorageOptions; -use zksync_state::{AsyncCatchupTask, OwnedStorage, ReadStorageFactory}; -use zksync_state_keeper::{ - seal_criteria::ConditionalSealer, AsyncRocksdbCache, OutputHandler, StateKeeperIO, - ZkSyncStateKeeper, -}; +use zksync_state_keeper::{AsyncRocksdbCache, ZkSyncStateKeeper}; use zksync_storage::RocksDB; -use zksync_vm_executor::interface::BatchExecutorFactory; use crate::{ implementations::resources::{ From 26aa8241480446c6df6f335d293c4bbb0e01dacd Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 14 Nov 
2024 13:14:28 +0100 Subject: [PATCH 44/60] feat: add git information to RustcMetadata --- Cargo.lock | 12 +-- Cargo.toml | 2 - core/lib/git_version_macro/Cargo.toml | 16 ---- core/lib/git_version_macro/src/lib.rs | 81 ------------------- core/node/house_keeper/Cargo.toml | 1 - core/node/house_keeper/src/lib.rs | 1 - core/node/house_keeper/src/version.rs | 39 --------- core/node/node_framework/Cargo.toml | 1 + .../layers/healtcheck_server.rs | 5 ++ core/node/shared_metrics/Cargo.toml | 3 + core/node/shared_metrics/build.rs | 36 ++++++++- core/node/shared_metrics/src/rustc.rs | 26 +++++- 12 files changed, 71 insertions(+), 152 deletions(-) delete mode 100644 core/lib/git_version_macro/Cargo.toml delete mode 100644 core/lib/git_version_macro/src/lib.rs delete mode 100644 core/node/house_keeper/src/version.rs diff --git a/Cargo.lock b/Cargo.lock index 1dbbe0dce459..515e8fa00ce3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11238,13 +11238,6 @@ dependencies = [ "syn 1.0.109", ] -[[package]] -name = "zksync_git_version_macro" -version = "0.1.0" -dependencies = [ - "chrono", -] - [[package]] name = "zksync_health_check" version = "0.1.0" @@ -11272,7 +11265,6 @@ dependencies = [ "vise", "zksync_config", "zksync_dal", - "zksync_git_version_macro", "zksync_health_check", "zksync_shared_metrics", "zksync_types", @@ -11630,6 +11622,7 @@ dependencies = [ "zksync_proof_data_handler", "zksync_queued_job_processor", "zksync_reorg_detector", + "zksync_shared_metrics", "zksync_state", "zksync_state_keeper", "zksync_storage", @@ -11945,10 +11938,13 @@ dependencies = [ name = "zksync_shared_metrics" version = "0.1.0" dependencies = [ + "async-trait", "rustc_version 0.4.1", + "serde", "tracing", "vise", "zksync_dal", + "zksync_health_check", "zksync_types", ] diff --git a/Cargo.toml b/Cargo.toml index 4e8bc95a2d61..e491c64605bc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,6 @@ members = [ "core/lib/da_client", "core/lib/eth_client", "core/lib/eth_signer", - 
"core/lib/git_version_macro", "core/lib/l1_contract_interface", "core/lib/mempool", "core/lib/merkle_tree", @@ -272,7 +271,6 @@ zksync_eth_client = { version = "0.1.0", path = "core/lib/eth_client" } zksync_da_client = { version = "0.1.0", path = "core/lib/da_client" } zksync_eth_signer = { version = "0.1.0", path = "core/lib/eth_signer" } zksync_health_check = { version = "0.1.0", path = "core/lib/health_check" } -zksync_git_version_macro = { version = "0.1.0", path = "core/lib/git_version_macro" } zksync_l1_contract_interface = { version = "0.1.0", path = "core/lib/l1_contract_interface" } zksync_mempool = { version = "0.1.0", path = "core/lib/mempool" } zksync_merkle_tree = { version = "0.1.0", path = "core/lib/merkle_tree" } diff --git a/core/lib/git_version_macro/Cargo.toml b/core/lib/git_version_macro/Cargo.toml deleted file mode 100644 index 461a72060042..000000000000 --- a/core/lib/git_version_macro/Cargo.toml +++ /dev/null @@ -1,16 +0,0 @@ -[package] -name = "zksync_git_version_macro" -edition = "2021" -description = "Procedural macro to generate metainformation about build in compile time" -version.workspace = true -homepage.workspace = true -license.workspace = true -authors.workspace = true -repository.workspace = true -keywords.workspace = true - -[lib] -proc-macro = true - -[dependencies] -chrono.workspace = true diff --git a/core/lib/git_version_macro/src/lib.rs b/core/lib/git_version_macro/src/lib.rs deleted file mode 100644 index 34b83efce195..000000000000 --- a/core/lib/git_version_macro/src/lib.rs +++ /dev/null @@ -1,81 +0,0 @@ -extern crate proc_macro; -use std::{process::Command, str::FromStr}; - -use proc_macro::TokenStream; - -/// Outputs the current date and time as a string literal. -/// Can be used to include the build timestamp in the binary. 
-#[proc_macro] -pub fn build_timestamp(_item: TokenStream) -> TokenStream { - let now = chrono::Local::now().format("%Y-%m-%d %H:%M:%S").to_string(); - encode_as_str(&now) -} - -/// Outputs the current git branch as a string literal. -#[proc_macro] -pub fn build_git_branch(_item: TokenStream) -> TokenStream { - let out = run_cmd("git", &["rev-parse", "--abbrev-ref", "HEAD"]); - encode_as_str(&out) -} - -/// Outputs the current git commit hash as a string literal. -#[proc_macro] -pub fn build_git_revision(_item: TokenStream) -> TokenStream { - let out = run_cmd("git", &["rev-parse", "--short", "HEAD"]); - encode_as_str(&out) -} - -/// Creates a slice of `&[(&str, &str)]` tuples that correspond to -/// the submodule name -> revision. -/// Results in an empty list if there are no submodules or if -/// the command fails. -#[proc_macro] -pub fn build_git_submodules(_item: TokenStream) -> TokenStream { - let Some(out) = run_cmd_opt("git", &["submodule", "status"]) else { - return TokenStream::from_str("&[]").unwrap(); - }; - let submodules = out - .lines() - .filter_map(|line| { - let parts: Vec<&str> = line.split_whitespace().collect(); - // Index 0 is commit hash, index 1 is the path to the folder, and there - // may be some metainformation after that. - if parts.len() >= 2 { - let folder_name = parts[1].split('/').last().unwrap_or(parts[1]); - Some((folder_name, parts[0])) - } else { - None - } - }) - .collect::>(); - let submodules = submodules - .iter() - .map(|(name, rev)| format!("(\"{}\", \"{}\")", name, rev)) - .collect::>() - .join(", "); - TokenStream::from_str(format!("&[{}]", submodules).as_str()) - .unwrap_or_else(|_| panic!("Unable to encode submodules: {}", submodules)) -} - -/// Tries to run the command, only returns `Some` if the command -/// succeeded and the output was valid utf8. 
-fn run_cmd(cmd: &str, args: &[&str]) -> String { - run_cmd_opt(cmd, args).unwrap_or("unknown".to_string()) -} - -fn run_cmd_opt(cmd: &str, args: &[&str]) -> Option { - let output = Command::new(cmd).args(args).output().ok()?; - if output.status.success() { - String::from_utf8(output.stdout) - .ok() - .map(|s| s.trim().to_string()) - } else { - None - } -} - -/// Encodes string as a literal. -fn encode_as_str(s: &str) -> TokenStream { - TokenStream::from_str(format!("\"{}\"", s).as_str()) - .unwrap_or_else(|_| panic!("Unable to encode string: {}", s)) -} diff --git a/core/node/house_keeper/Cargo.toml b/core/node/house_keeper/Cargo.toml index f4ea38daa688..99a88306facc 100644 --- a/core/node/house_keeper/Cargo.toml +++ b/core/node/house_keeper/Cargo.toml @@ -14,7 +14,6 @@ categories.workspace = true serde.workspace = true vise.workspace = true zksync_dal.workspace = true -zksync_git_version_macro.workspace = true zksync_health_check.workspace = true zksync_shared_metrics.workspace = true zksync_types.workspace = true diff --git a/core/node/house_keeper/src/lib.rs b/core/node/house_keeper/src/lib.rs index 2326b0a6e2d2..4e0d1962fc02 100644 --- a/core/node/house_keeper/src/lib.rs +++ b/core/node/house_keeper/src/lib.rs @@ -1,4 +1,3 @@ pub mod blocks_state_reporter; mod metrics; pub mod periodic_job; -pub mod version; diff --git a/core/node/house_keeper/src/version.rs b/core/node/house_keeper/src/version.rs deleted file mode 100644 index a6cb330a6e75..000000000000 --- a/core/node/house_keeper/src/version.rs +++ /dev/null @@ -1,39 +0,0 @@ -use async_trait::async_trait; -use serde::{Deserialize, Serialize}; -use zksync_health_check::{CheckHealth, Health, HealthStatus}; - -const GIT_VERSION: &str = zksync_git_version_macro::build_git_revision!(); -const GIT_BRANCH: &str = zksync_git_version_macro::build_git_branch!(); - -/// This struct implements a static health check describing node's version information. 
-#[derive(Debug, Serialize, Deserialize)] -pub struct NodeVersionInfo { - git_version: String, - git_branch: String, -} - -impl Default for NodeVersionInfo { - fn default() -> Self { - Self { - git_version: GIT_VERSION.to_string(), - git_branch: GIT_BRANCH.to_string(), - } - } -} - -impl From<&NodeVersionInfo> for Health { - fn from(details: &NodeVersionInfo) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - -#[async_trait] -impl CheckHealth for NodeVersionInfo { - fn name(&self) -> &'static str { - "version" - } - - async fn check_health(&self) -> Health { - self.into() - } -} diff --git a/core/node/node_framework/Cargo.toml b/core/node/node_framework/Cargo.toml index 2899b532038e..eec9b8ef4b7a 100644 --- a/core/node/node_framework/Cargo.toml +++ b/core/node/node_framework/Cargo.toml @@ -54,6 +54,7 @@ zksync_node_storage_init.workspace = true zksync_external_price_api.workspace = true zksync_external_proof_integration_api.workspace = true zksync_logs_bloom_backfill.workspace = true +zksync_shared_metrics.workspace = true pin-project-lite.workspace = true tracing.workspace = true diff --git a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs index 83a74c63cb45..af0a1c5dad13 100644 --- a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs +++ b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs @@ -3,6 +3,7 @@ use std::sync::Arc; use zksync_config::configs::api::HealthCheckConfig; use zksync_health_check::AppHealthCheck; use zksync_node_api_server::healthcheck::HealthCheckHandle; +use zksync_shared_metrics::rustc::RUSTC_METADATA; use crate::{ implementations::resources::healthcheck::AppHealthCheckResource, @@ -47,6 +48,10 @@ impl WiringLayer for HealthCheckLayer { let AppHealthCheckResource(app_health_check) = input.app_health_check; app_health_check.override_limits(self.0.slow_time_limit(), 
self.0.hard_time_limit()); + app_health_check + .insert_custom_component(Arc::new(RUSTC_METADATA)) + .map_err(WiringError::internal)?; + let health_check_task = HealthCheckTask { config: self.0, app_health_check, diff --git a/core/node/shared_metrics/Cargo.toml b/core/node/shared_metrics/Cargo.toml index f30a2ba35334..74eb5e925bf3 100644 --- a/core/node/shared_metrics/Cargo.toml +++ b/core/node/shared_metrics/Cargo.toml @@ -11,10 +11,13 @@ keywords.workspace = true categories.workspace = true [dependencies] +async-trait.workspace = true +serde.workspace = true vise.workspace = true tracing.workspace = true zksync_types.workspace = true zksync_dal.workspace = true +zksync_health_check.workspace = true [build-dependencies] rustc_version.workspace = true diff --git a/core/node/shared_metrics/build.rs b/core/node/shared_metrics/build.rs index d37fef0b1b0c..26a0a4054f74 100644 --- a/core/node/shared_metrics/build.rs +++ b/core/node/shared_metrics/build.rs @@ -4,6 +4,7 @@ use std::{ env, fs, io::{self, Write}, path::Path, + process::Command, }; use rustc_version::{Channel, LlvmVersion}; @@ -11,13 +12,15 @@ use rustc_version::{Channel, LlvmVersion}; fn print_rust_meta(out: &mut impl Write, meta: &rustc_version::VersionMeta) -> io::Result<()> { writeln!( out, - "pub(crate) const RUSTC_METADATA: RustcMetadata = RustcMetadata {{ \ + "pub const RUSTC_METADATA: RustcMetadata = RustcMetadata {{ \ version: {semver:?}, \ commit_hash: {commit_hash:?}, \ commit_date: {commit_date:?}, \ channel: {channel:?}, \ host: {host:?}, \ - llvm: {llvm:?} \ + llvm: {llvm:?}, \ + git_branch: {git_branch:?}, \ + git_revision: {git_revision:?} \ }};", semver = meta.semver.to_string(), commit_hash = meta.commit_hash, @@ -30,9 +33,38 @@ fn print_rust_meta(out: &mut impl Write, meta: &rustc_version::VersionMeta) -> i }, host = meta.host, llvm = meta.llvm_version.as_ref().map(LlvmVersion::to_string), + git_branch = git_branch(), + git_revision = git_revision() ) } +/// Outputs the current git branch 
as a string literal. +pub fn git_branch() -> String { + run_cmd("git", &["rev-parse", "--abbrev-ref", "HEAD"]) +} + +/// Outputs the current git commit hash as a string literal. +pub fn git_revision() -> String { + run_cmd("git", &["rev-parse", "--short", "HEAD"]) +} + +/// Tries to run the command, only returns `Some` if the command +/// succeeded and the output was valid utf8. +fn run_cmd(cmd: &str, args: &[&str]) -> String { + run_cmd_opt(cmd, args).unwrap_or("unknown".to_string()) +} + +fn run_cmd_opt(cmd: &str, args: &[&str]) -> Option { + let output = Command::new(cmd).args(args).output().ok()?; + if output.status.success() { + String::from_utf8(output.stdout) + .ok() + .map(|s| s.trim().to_string()) + } else { + None + } +} + fn main() { let out_dir = env::var("OUT_DIR").expect("`OUT_DIR` env var not set for build script"); let rustc_meta = rustc_version::version_meta().expect("Failed obtaining rustc metadata"); diff --git a/core/node/shared_metrics/src/rustc.rs b/core/node/shared_metrics/src/rustc.rs index 11165dbf51b0..eb41eb672160 100644 --- a/core/node/shared_metrics/src/rustc.rs +++ b/core/node/shared_metrics/src/rustc.rs @@ -1,3 +1,5 @@ +use async_trait::async_trait; +use serde::Serialize; use vise::{EncodeLabelSet, Info, Metrics}; mod values { @@ -5,10 +7,11 @@ mod values { include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); } -use values::RUSTC_METADATA; +pub use values::RUSTC_METADATA; +use zksync_health_check::{CheckHealth, Health, HealthStatus}; /// Metadata of Rust compiler used to compile the crate. 
-#[derive(Debug, EncodeLabelSet)] +#[derive(Debug, EncodeLabelSet, Serialize)] pub struct RustcMetadata { pub version: &'static str, pub commit_hash: Option<&'static str>, @@ -16,6 +19,8 @@ pub struct RustcMetadata { pub channel: &'static str, pub host: &'static str, pub llvm: Option<&'static str>, + pub git_branch: &'static str, + pub git_revision: &'static str, } #[derive(Debug, Metrics)] @@ -34,3 +39,20 @@ impl RustMetrics { #[vise::register] pub static RUST_METRICS: vise::Global = vise::Global::new(); + +impl From<&RustcMetadata> for Health { + fn from(details: &RustcMetadata) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[async_trait] +impl CheckHealth for RustcMetadata { + fn name(&self) -> &'static str { + "rustc_metadata" + } + + async fn check_health(&self) -> Health { + self.into() + } +} From f06a4158fb041794fbe24df7ebfd664bc6549b8c Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 14 Nov 2024 13:32:18 +0100 Subject: [PATCH 45/60] refactor: rename rustc module to binary --- .../external_node/src/metrics/framework.rs | 4 +- .../layers/healtcheck_server.rs | 4 +- core/node/shared_metrics/build.rs | 30 +++++----- core/node/shared_metrics/src/binary.rs | 58 +++++++++++++++++++ core/node/shared_metrics/src/lib.rs | 2 +- core/node/shared_metrics/src/rustc.rs | 58 ------------------- 6 files changed, 78 insertions(+), 78 deletions(-) create mode 100644 core/node/shared_metrics/src/binary.rs delete mode 100644 core/node/shared_metrics/src/rustc.rs diff --git a/core/bin/external_node/src/metrics/framework.rs b/core/bin/external_node/src/metrics/framework.rs index fc9d4fe51345..4afa6081a8d6 100644 --- a/core/bin/external_node/src/metrics/framework.rs +++ b/core/bin/external_node/src/metrics/framework.rs @@ -5,7 +5,7 @@ use zksync_node_framework::{ implementations::resources::pools::{MasterPool, PoolResource}, FromContext, IntoContext, StopReceiver, Task, TaskId, WiringError, WiringLayer, }; -use 
zksync_shared_metrics::rustc::RUST_METRICS; +use zksync_shared_metrics::binary::BIN_METRICS; use zksync_types::{L1ChainId, L2ChainId, SLChainId}; use super::EN_METRICS; @@ -39,7 +39,7 @@ impl WiringLayer for ExternalNodeMetricsLayer { } async fn wire(self, input: Self::Input) -> Result { - RUST_METRICS.initialize(); + BIN_METRICS.initialize(); EN_METRICS.observe_config( self.l1_chain_id, self.sl_chain_id, diff --git a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs index af0a1c5dad13..1ce55b60575a 100644 --- a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs +++ b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use zksync_config::configs::api::HealthCheckConfig; use zksync_health_check::AppHealthCheck; use zksync_node_api_server::healthcheck::HealthCheckHandle; -use zksync_shared_metrics::rustc::RUSTC_METADATA; +use zksync_shared_metrics::binary::BIN_METADATA; use crate::{ implementations::resources::healthcheck::AppHealthCheckResource, @@ -49,7 +49,7 @@ impl WiringLayer for HealthCheckLayer { app_health_check.override_limits(self.0.slow_time_limit(), self.0.hard_time_limit()); app_health_check - .insert_custom_component(Arc::new(RUSTC_METADATA)) + .insert_custom_component(Arc::new(BIN_METADATA)) .map_err(WiringError::internal)?; let health_check_task = HealthCheckTask { diff --git a/core/node/shared_metrics/build.rs b/core/node/shared_metrics/build.rs index 26a0a4054f74..4483cd03ac71 100644 --- a/core/node/shared_metrics/build.rs +++ b/core/node/shared_metrics/build.rs @@ -9,30 +9,32 @@ use std::{ use rustc_version::{Channel, LlvmVersion}; -fn print_rust_meta(out: &mut impl Write, meta: &rustc_version::VersionMeta) -> io::Result<()> { +fn print_binary_meta(out: &mut impl Write) -> io::Result<()> { + let rustc_meta = rustc_version::version_meta().expect("Failed obtaining rustc 
metadata"); + writeln!( out, - "pub const RUSTC_METADATA: RustcMetadata = RustcMetadata {{ \ - version: {semver:?}, \ - commit_hash: {commit_hash:?}, \ - commit_date: {commit_date:?}, \ - channel: {channel:?}, \ + "pub const BIN_METADATA: BinMetadata = BinMetadata {{ \ + rustc_version: {semver:?}, \ + rustc_commit_hash: {commit_hash:?}, \ + rustc_commit_date: {commit_date:?}, \ + rustc_channel: {channel:?}, \ host: {host:?}, \ llvm: {llvm:?}, \ git_branch: {git_branch:?}, \ git_revision: {git_revision:?} \ }};", - semver = meta.semver.to_string(), - commit_hash = meta.commit_hash, - commit_date = meta.commit_date, - channel = match meta.channel { + semver = rustc_meta.semver.to_string(), + commit_hash = rustc_meta.commit_hash, + commit_date = rustc_meta.commit_date, + channel = match rustc_meta.channel { Channel::Dev => "dev", Channel::Beta => "beta", Channel::Nightly => "nightly", Channel::Stable => "stable", }, - host = meta.host, - llvm = meta.llvm_version.as_ref().map(LlvmVersion::to_string), + host = rustc_meta.host, + llvm = rustc_meta.llvm_version.as_ref().map(LlvmVersion::to_string), git_branch = git_branch(), git_revision = git_revision() ) @@ -67,12 +69,10 @@ fn run_cmd_opt(cmd: &str, args: &[&str]) -> Option { fn main() { let out_dir = env::var("OUT_DIR").expect("`OUT_DIR` env var not set for build script"); - let rustc_meta = rustc_version::version_meta().expect("Failed obtaining rustc metadata"); - let metadata_module_path = Path::new(&out_dir).join("metadata_values.rs"); let metadata_module = fs::File::create(metadata_module_path).expect("cannot create metadata module"); let mut metadata_module = io::BufWriter::new(metadata_module); - print_rust_meta(&mut metadata_module, &rustc_meta).expect("failed printing rustc metadata"); + print_binary_meta(&mut metadata_module).expect("failed printing binary metadata"); } diff --git a/core/node/shared_metrics/src/binary.rs b/core/node/shared_metrics/src/binary.rs new file mode 100644 index 
000000000000..b673b42cdeff --- /dev/null +++ b/core/node/shared_metrics/src/binary.rs @@ -0,0 +1,58 @@ +use async_trait::async_trait; +use serde::Serialize; +use vise::{EncodeLabelSet, Info, Metrics}; + +mod values { + use super::BinMetadata; + include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); +} + +pub use values::BIN_METADATA; +use zksync_health_check::{CheckHealth, Health, HealthStatus}; + +/// Metadata of the compiled binary. +#[derive(Debug, EncodeLabelSet, Serialize)] +pub struct BinMetadata { + pub rustc_version: &'static str, + pub rustc_commit_hash: Option<&'static str>, + pub rustc_commit_date: Option<&'static str>, + pub rustc_channel: &'static str, + pub host: &'static str, + pub llvm: Option<&'static str>, + pub git_branch: &'static str, + pub git_revision: &'static str, +} + +#[derive(Debug, Metrics)] +#[metrics(prefix = "rust")] +pub struct BinMetrics { + /// General information about the compiled binary. + info: Info, +} + +impl BinMetrics { + pub fn initialize(&self) { + tracing::info!("Metadata for this binary: {BIN_METADATA:?}"); + self.info.set(BIN_METADATA).ok(); + } +} + +#[vise::register] +pub static BIN_METRICS: vise::Global = vise::Global::new(); + +impl From<&BinMetadata> for Health { + fn from(details: &BinMetadata) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[async_trait] +impl CheckHealth for BinMetadata { + fn name(&self) -> &'static str { + "metadata" + } + + async fn check_health(&self) -> Health { + self.into() + } +} diff --git a/core/node/shared_metrics/src/lib.rs b/core/node/shared_metrics/src/lib.rs index 2c41ec9293a0..9941eabcb18f 100644 --- a/core/node/shared_metrics/src/lib.rs +++ b/core/node/shared_metrics/src/lib.rs @@ -8,7 +8,7 @@ use vise::{ use zksync_dal::transactions_dal::L2TxSubmissionResult; use zksync_types::aggregated_operations::AggregatedActionType; -pub mod rustc; +pub mod binary; #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EncodeLabelValue, EncodeLabelSet)] 
#[metrics(label = "stage", rename_all = "snake_case")] diff --git a/core/node/shared_metrics/src/rustc.rs b/core/node/shared_metrics/src/rustc.rs deleted file mode 100644 index eb41eb672160..000000000000 --- a/core/node/shared_metrics/src/rustc.rs +++ /dev/null @@ -1,58 +0,0 @@ -use async_trait::async_trait; -use serde::Serialize; -use vise::{EncodeLabelSet, Info, Metrics}; - -mod values { - use super::RustcMetadata; - include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); -} - -pub use values::RUSTC_METADATA; -use zksync_health_check::{CheckHealth, Health, HealthStatus}; - -/// Metadata of Rust compiler used to compile the crate. -#[derive(Debug, EncodeLabelSet, Serialize)] -pub struct RustcMetadata { - pub version: &'static str, - pub commit_hash: Option<&'static str>, - pub commit_date: Option<&'static str>, - pub channel: &'static str, - pub host: &'static str, - pub llvm: Option<&'static str>, - pub git_branch: &'static str, - pub git_revision: &'static str, -} - -#[derive(Debug, Metrics)] -#[metrics(prefix = "rust")] -pub struct RustMetrics { - /// General information about the Rust compiler. 
- info: Info, -} - -impl RustMetrics { - pub fn initialize(&self) { - tracing::info!("Metadata for rustc that this binary was compiled with: {RUSTC_METADATA:?}"); - self.info.set(RUSTC_METADATA).ok(); - } -} - -#[vise::register] -pub static RUST_METRICS: vise::Global = vise::Global::new(); - -impl From<&RustcMetadata> for Health { - fn from(details: &RustcMetadata) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - -#[async_trait] -impl CheckHealth for RustcMetadata { - fn name(&self) -> &'static str { - "rustc_metadata" - } - - async fn check_health(&self) -> Health { - self.into() - } -} From 195aee48f0bbaebf408de53af8b27ff7d70fa9dd Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 15 Nov 2024 10:45:10 +0100 Subject: [PATCH 46/60] refactor: remove redundant health status update --- core/node/eth_sender/src/eth_tx_aggregator.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/node/eth_sender/src/eth_tx_aggregator.rs b/core/node/eth_sender/src/eth_tx_aggregator.rs index 9696eeb9e9e0..9c8c49e92744 100644 --- a/core/node/eth_sender/src/eth_tx_aggregator.rs +++ b/core/node/eth_sender/src/eth_tx_aggregator.rs @@ -142,8 +142,6 @@ impl EthTxAggregator { if let Err(err) = self.loop_iteration(&mut storage).await { // Web3 API request failures can cause this, // and anything more important is already properly reported. 
- self.health_updater - .update(Health::from(HealthStatus::ShuttingDown)); tracing::warn!("eth_sender error {err:?}"); } From 4e14447a4c3680dad393328c7b42cd840c8dc0b9 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 15 Nov 2024 10:49:25 +0100 Subject: [PATCH 47/60] refactor: remove redundant health check update --- core/node/eth_sender/src/eth_tx_manager.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/core/node/eth_sender/src/eth_tx_manager.rs b/core/node/eth_sender/src/eth_tx_manager.rs index 43bbd1141075..ca65c6d3ec11 100644 --- a/core/node/eth_sender/src/eth_tx_manager.rs +++ b/core/node/eth_sender/src/eth_tx_manager.rs @@ -545,8 +545,6 @@ impl EthTxManager { if *stop_receiver.borrow() { tracing::info!("Stop signal received, eth_tx_manager is shutting down"); - self.health_updater - .update(Health::from(HealthStatus::ShuttingDown)); break; } let operator_to_track = self.l1_interface.supported_operator_types()[0]; From 2acadca9dda54735c3909d79958eaf2b6218c5da Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 15 Nov 2024 10:53:15 +0100 Subject: [PATCH 48/60] fix: remove unused dependencies --- Cargo.lock | 2 -- core/node/house_keeper/Cargo.toml | 2 -- 2 files changed, 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a546f52234b1..fd98d44ffc5f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11258,13 +11258,11 @@ version = "0.1.0" dependencies = [ "anyhow", "async-trait", - "serde", "tokio", "tracing", "vise", "zksync_config", "zksync_dal", - "zksync_health_check", "zksync_shared_metrics", "zksync_types", ] diff --git a/core/node/house_keeper/Cargo.toml b/core/node/house_keeper/Cargo.toml index 99a88306facc..b2ed3c14c20f 100644 --- a/core/node/house_keeper/Cargo.toml +++ b/core/node/house_keeper/Cargo.toml @@ -11,10 +11,8 @@ keywords.workspace = true categories.workspace = true [dependencies] -serde.workspace = true vise.workspace = true zksync_dal.workspace = true -zksync_health_check.workspace = true zksync_shared_metrics.workspace = true 
zksync_types.workspace = true zksync_config.workspace = true From d5d78fcfd67b45d61d63b4b79f8fa6b06198b8f6 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Fri, 15 Nov 2024 11:17:03 +0100 Subject: [PATCH 49/60] revert: revert formatting changes --- etc/env/file_based/general.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/etc/env/file_based/general.yaml b/etc/env/file_based/general.yaml index 50f1be4e6e74..a4005e9477a8 100644 --- a/etc/env/file_based/general.yaml +++ b/etc/env/file_based/general.yaml @@ -41,7 +41,7 @@ api: estimate_gas_scale_factor: 1.3 estimate_gas_acceptable_overestimation: 5000 max_tx_size: 1000000 - api_namespaces: [en, eth, net, web3, zks, pubsub, debug] + api_namespaces: [ en,eth,net,web3,zks,pubsub,debug ] state_keeper: transaction_slots: 8192 max_allowed_l2_tx_gas_limit: 15000000000 @@ -101,7 +101,7 @@ eth: aggregated_block_execute_deadline: 10 timestamp_criteria_max_allowed_lag: 30 max_eth_tx_data_size: 120000 - aggregated_proof_sizes: [1] + aggregated_proof_sizes: [ 1 ] max_aggregated_tx_gas: 15000000 max_acceptable_priority_fee_in_gwei: 100000000000 # typo: value is in wei (100 gwei) pubdata_sending_mode: BLOBS @@ -118,6 +118,7 @@ eth: confirmations_for_eth_event: 0 eth_node_poll_interval: 300 + snapshot_creator: object_store: file_backed: @@ -126,6 +127,7 @@ snapshot_creator: concurrent_queries_count: 25 storage_logs_chunk_size: 1000000 + prover: prover_object_store: file_backed: @@ -285,6 +287,7 @@ prover_job_monitor: witness_job_queuer_run_interval_ms: 10000 http_port: 3074 + base_token_adjuster: price_polling_interval_ms: 30000 price_cache_update_interval_ms: 2000 @@ -296,6 +299,7 @@ external_price_api_client: forced_numerator: 314 forced_denominator: 1000 + house_keeper: l1_batch_metrics_reporting_interval_ms: 10000 From 9964ee7d62abf689bae57b45e5e0617c1d1e2042 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 11:13:26 +0100 Subject: [PATCH 50/60] feat: use Duration for migrations' 
execution_time --- core/lib/dal/src/system_dal.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/lib/dal/src/system_dal.rs b/core/lib/dal/src/system_dal.rs index 3321a5af510f..6f2e64b1c1c5 100644 --- a/core/lib/dal/src/system_dal.rs +++ b/core/lib/dal/src/system_dal.rs @@ -21,7 +21,7 @@ pub struct DatabaseMigration { pub installed_on: DateTime, pub success: bool, pub checksum: String, - pub execution_time: i64, + pub execution_time: Duration, } #[derive(Debug)] @@ -118,7 +118,7 @@ impl SystemDal<'_, '_> { installed_on: row.installed_on, success: row.success, checksum: hex::encode(row.checksum), - execution_time: row.execution_time, + execution_time: Duration::from_millis(u64::try_from(row.execution_time).unwrap_or(0)), }) } } From 19275b6681b6a1ff7fe84f6a11a088d6d4a137f2 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 11:31:46 +0100 Subject: [PATCH 51/60] refactor: use same interval for Postgres metrics exporter and healthcheck --- .../src/implementations/layers/postgres.rs | 34 ++++++------------- 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/core/node/node_framework/src/implementations/layers/postgres.rs b/core/node/node_framework/src/implementations/layers/postgres.rs index f0bfe6b9fcc2..1687387e698c 100644 --- a/core/node/node_framework/src/implementations/layers/postgres.rs +++ b/core/node/node_framework/src/implementations/layers/postgres.rs @@ -18,26 +18,12 @@ use crate::{ FromContext, IntoContext, }; -const SCRAPE_INTERVAL: Duration = Duration::from_secs(60); +/// Execution interval for Postrgres metrics and healthcheck tasks +const TASK_EXECUTION_INTERVAL: Duration = Duration::from_secs(60); -#[derive(Debug, Deserialize, Clone, PartialEq)] -pub struct Config { - polling_interval_ms: u64, -} - -impl Default for Config { - fn default() -> Self { - Self { - polling_interval_ms: 10_000, - } - } -} - -/// Wiring layer for the Postgres metrics exporter. 
+/// Wiring layer for the Postgres metrics exporter and healthcheck. #[derive(Debug, Default)] -pub struct PostgresLayer { - config: Config, -} +pub struct PostgresLayer; #[derive(Debug, FromContext)] #[context(crate = crate)] @@ -72,7 +58,7 @@ impl WiringLayer for PostgresLayer { }; let app_health = input.app_health.0; - let health_task = DatabaseHealthTask::new(self.config.polling_interval_ms, pool); + let health_task = DatabaseHealthTask::new(pool); app_health .insert_component(health_task.health_check()) @@ -102,7 +88,7 @@ impl Task for PostgresMetricsScrapingTask { async fn run(self: Box, mut stop_receiver: StopReceiver) -> anyhow::Result<()> { tokio::select! { - () = PostgresMetrics::run_scraping(self.pool_for_metrics, SCRAPE_INTERVAL) => { + () = PostgresMetrics::run_scraping(self.pool_for_metrics, TASK_EXECUTION_INTERVAL) => { tracing::warn!("Postgres metrics scraping unexpectedly stopped"); } _ = stop_receiver.0.changed() => { @@ -126,15 +112,15 @@ impl From for Health { #[derive(Debug)] pub struct DatabaseHealthTask { - polling_interval_ms: u64, + polling_interval: Duration, connection_pool: ConnectionPool, updater: HealthUpdater, } impl DatabaseHealthTask { - fn new(polling_interval_ms: u64, connection_pool: ConnectionPool) -> Self { + fn new(connection_pool: ConnectionPool) -> Self { Self { - polling_interval_ms, + polling_interval: TASK_EXECUTION_INTERVAL, connection_pool, updater: ReactiveHealthCheck::new("database").1, } @@ -144,7 +130,7 @@ impl DatabaseHealthTask { where Self: Sized, { - let timeout = Duration::from_millis(self.polling_interval_ms); + let timeout = self.polling_interval; let mut conn = self .connection_pool .connection_tagged("postgres_healthcheck") From 8911cc2e85289c31a4cdb0caf3750a17831bc808 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 11:43:32 +0100 Subject: [PATCH 52/60] refactor: do not split use and mod declarations --- .../src/implementations/layers/healtcheck_server.rs | 2 +- 
core/node/shared_metrics/src/binary.rs | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs index 1ce55b60575a..8d522187e01e 100644 --- a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs +++ b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use zksync_config::configs::api::HealthCheckConfig; use zksync_health_check::AppHealthCheck; use zksync_node_api_server::healthcheck::HealthCheckHandle; -use zksync_shared_metrics::binary::BIN_METADATA; +use zksync_shared_metrics::binary::values::BIN_METADATA; use crate::{ implementations::resources::healthcheck::AppHealthCheckResource, diff --git a/core/node/shared_metrics/src/binary.rs b/core/node/shared_metrics/src/binary.rs index b673b42cdeff..44e37835f736 100644 --- a/core/node/shared_metrics/src/binary.rs +++ b/core/node/shared_metrics/src/binary.rs @@ -1,15 +1,15 @@ use async_trait::async_trait; use serde::Serialize; use vise::{EncodeLabelSet, Info, Metrics}; +use zksync_health_check::{CheckHealth, Health, HealthStatus}; + +use self::values::BIN_METADATA; -mod values { +pub mod values { use super::BinMetadata; include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); } -pub use values::BIN_METADATA; -use zksync_health_check::{CheckHealth, Health, HealthStatus}; - /// Metadata of the compiled binary. 
#[derive(Debug, EncodeLabelSet, Serialize)] pub struct BinMetadata { From 6bc90e2f8364418fb8632548d59e397bf94ca29d Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 11:51:06 +0100 Subject: [PATCH 53/60] refactor: use Option type instead of "unknown" --- core/node/shared_metrics/build.rs | 14 ++++---------- core/node/shared_metrics/src/binary.rs | 4 ++-- 2 files changed, 6 insertions(+), 12 deletions(-) diff --git a/core/node/shared_metrics/build.rs b/core/node/shared_metrics/build.rs index 4483cd03ac71..6cf4ca4a32a0 100644 --- a/core/node/shared_metrics/build.rs +++ b/core/node/shared_metrics/build.rs @@ -41,19 +41,13 @@ fn print_binary_meta(out: &mut impl Write) -> io::Result<()> { } /// Outputs the current git branch as a string literal. -pub fn git_branch() -> String { - run_cmd("git", &["rev-parse", "--abbrev-ref", "HEAD"]) +pub fn git_branch() -> Option { + run_cmd_opt("git", &["rev-parse", "--abbrev-ref", "HEAD"]) } /// Outputs the current git commit hash as a string literal. -pub fn git_revision() -> String { - run_cmd("git", &["rev-parse", "--short", "HEAD"]) -} - -/// Tries to run the command, only returns `Some` if the command -/// succeeded and the output was valid utf8. 
-fn run_cmd(cmd: &str, args: &[&str]) -> String { - run_cmd_opt(cmd, args).unwrap_or("unknown".to_string()) +pub fn git_revision() -> Option { + run_cmd_opt("git", &["rev-parse", "--short", "HEAD"]) } fn run_cmd_opt(cmd: &str, args: &[&str]) -> Option { diff --git a/core/node/shared_metrics/src/binary.rs b/core/node/shared_metrics/src/binary.rs index 44e37835f736..a3ba8d7f5e1e 100644 --- a/core/node/shared_metrics/src/binary.rs +++ b/core/node/shared_metrics/src/binary.rs @@ -19,8 +19,8 @@ pub struct BinMetadata { pub rustc_channel: &'static str, pub host: &'static str, pub llvm: Option<&'static str>, - pub git_branch: &'static str, - pub git_revision: &'static str, + pub git_branch: Option<&'static str>, + pub git_revision: Option<&'static str>, } #[derive(Debug, Metrics)] From d2c983c9e3685e7958bdf4a40c82ada48090889c Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 12:21:53 +0100 Subject: [PATCH 54/60] feat: update state keeper health from cursor right away --- core/node/state_keeper/src/keeper.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/node/state_keeper/src/keeper.rs b/core/node/state_keeper/src/keeper.rs index f32fc9815b39..f77dfc9d89b5 100644 --- a/core/node/state_keeper/src/keeper.rs +++ b/core/node/state_keeper/src/keeper.rs @@ -112,7 +112,7 @@ impl ZkSyncStateKeeper { let (cursor, pending_batch_params) = self.io.initialize().await?; self.output_handler.initialize(&cursor).await?; self.health_updater - .update(Health::from(HealthStatus::Ready)); + .update(StateKeeperHealthDetails::from(&cursor).into()); tracing::info!( "Starting state keeper. 
Next l1 batch to seal: {}, next L2 block to seal: {}", cursor.l1_batch, From 25d8b26951f58b0be76ed85afa9dccc9b47e1f5f Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 13:03:58 +0100 Subject: [PATCH 55/60] refactor: merge tx status into tx details --- core/node/eth_sender/src/eth_tx_aggregator.rs | 4 ++-- core/node/eth_sender/src/eth_tx_manager.rs | 5 ++--- core/node/eth_sender/src/health.rs | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/core/node/eth_sender/src/eth_tx_aggregator.rs b/core/node/eth_sender/src/eth_tx_aggregator.rs index 9c8c49e92744..f0d7201d1503 100644 --- a/core/node/eth_sender/src/eth_tx_aggregator.rs +++ b/core/node/eth_sender/src/eth_tx_aggregator.rs @@ -28,7 +28,7 @@ use zksync_types::{ use super::aggregated_operations::AggregatedOperation; use crate::{ - health::EthTxAggregatorHealthDetails, + health::{EthTxAggregatorHealthDetails, EthTxDetails}, metrics::{PubdataKind, METRICS}, utils::agg_l1_batch_base_cost, zksync_functions::ZkSyncFunctions, @@ -441,7 +441,7 @@ impl EthTxAggregator { self.health_updater.update( EthTxAggregatorHealthDetails { - last_saved_tx: (&tx).into(), + last_saved_tx: EthTxDetails::new(&tx, None), } .into(), ); diff --git a/core/node/eth_sender/src/eth_tx_manager.rs b/core/node/eth_sender/src/eth_tx_manager.rs index ca65c6d3ec11..f411e9b3ae4b 100644 --- a/core/node/eth_sender/src/eth_tx_manager.rs +++ b/core/node/eth_sender/src/eth_tx_manager.rs @@ -20,7 +20,7 @@ use crate::{ AbstractL1Interface, L1BlockNumbers, OperatorNonce, OperatorType, RealL1Interface, }, eth_fees_oracle::{EthFees, EthFeesOracle, GasAdjusterFeesOracle}, - health::EthTxManagerHealthDetails, + health::{EthTxDetails, EthTxManagerHealthDetails}, metrics::TransactionType, }; @@ -423,8 +423,7 @@ impl EthTxManager { if receipt_block_number <= finalized_block.0 { self.health_updater.update( EthTxManagerHealthDetails { - last_mined_tx: tx.into(), - tx_status: (&tx_status).into(), + last_mined_tx: 
EthTxDetails::new(tx, Some((&tx_status).into())), finalized_block, } .into(), diff --git a/core/node/eth_sender/src/health.rs b/core/node/eth_sender/src/health.rs index 58d949ff905b..306105523bac 100644 --- a/core/node/eth_sender/src/health.rs +++ b/core/node/eth_sender/src/health.rs @@ -40,15 +40,17 @@ pub struct EthTxDetails { pub tx_type: AggregatedActionType, pub created_at_timestamp: u64, pub predicted_gas_cost: u64, + pub status: Option, } -impl From<&EthTx> for EthTxDetails { - fn from(tx: &EthTx) -> Self { +impl EthTxDetails { + pub fn new(tx: &EthTx, status: Option) -> Self { Self { nonce: tx.nonce, tx_type: tx.tx_type, created_at_timestamp: tx.created_at_timestamp, predicted_gas_cost: tx.predicted_gas_cost, + status: status, } } } @@ -56,7 +58,6 @@ impl From<&EthTx> for EthTxDetails { #[derive(Debug, Serialize, Deserialize)] pub struct EthTxManagerHealthDetails { pub last_mined_tx: EthTxDetails, - pub tx_status: TxStatus, pub finalized_block: L1BlockNumber, } From d7d62ec644fe42e66e470100efeb1570f13592f6 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Mon, 18 Nov 2024 13:05:19 +0100 Subject: [PATCH 56/60] refactor: clippy --- core/bin/external_node/src/node_builder.rs | 2 +- core/bin/zksync_server/src/node_builder.rs | 2 +- core/node/eth_sender/src/health.rs | 2 +- core/node/node_framework/src/implementations/layers/postgres.rs | 2 +- core/node/state_keeper/src/keeper.rs | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/core/bin/external_node/src/node_builder.rs b/core/bin/external_node/src/node_builder.rs index a51de27eb679..c5d9228e9930 100644 --- a/core/bin/external_node/src/node_builder.rs +++ b/core/bin/external_node/src/node_builder.rs @@ -126,7 +126,7 @@ impl ExternalNodeBuilder { } fn add_postgres_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresLayer::default()); + self.node.add_layer(PostgresLayer); Ok(self) } diff --git a/core/bin/zksync_server/src/node_builder.rs 
b/core/bin/zksync_server/src/node_builder.rs index 6058dc046a17..7e7d219856ca 100644 --- a/core/bin/zksync_server/src/node_builder.rs +++ b/core/bin/zksync_server/src/node_builder.rs @@ -139,7 +139,7 @@ impl MainNodeBuilder { } fn add_postgres_layer(mut self) -> anyhow::Result { - self.node.add_layer(PostgresLayer::default()); + self.node.add_layer(PostgresLayer); Ok(self) } diff --git a/core/node/eth_sender/src/health.rs b/core/node/eth_sender/src/health.rs index 306105523bac..1aff80dae6d2 100644 --- a/core/node/eth_sender/src/health.rs +++ b/core/node/eth_sender/src/health.rs @@ -50,7 +50,7 @@ impl EthTxDetails { tx_type: tx.tx_type, created_at_timestamp: tx.created_at_timestamp, predicted_gas_cost: tx.predicted_gas_cost, - status: status, + status, } } } diff --git a/core/node/node_framework/src/implementations/layers/postgres.rs b/core/node/node_framework/src/implementations/layers/postgres.rs index 1687387e698c..8a81b8709895 100644 --- a/core/node/node_framework/src/implementations/layers/postgres.rs +++ b/core/node/node_framework/src/implementations/layers/postgres.rs @@ -22,7 +22,7 @@ use crate::{ const TASK_EXECUTION_INTERVAL: Duration = Duration::from_secs(60); /// Wiring layer for the Postgres metrics exporter and healthcheck. 
-#[derive(Debug, Default)] +#[derive(Debug)] pub struct PostgresLayer; #[derive(Debug, FromContext)] diff --git a/core/node/state_keeper/src/keeper.rs b/core/node/state_keeper/src/keeper.rs index f77dfc9d89b5..7fcc53ad2d26 100644 --- a/core/node/state_keeper/src/keeper.rs +++ b/core/node/state_keeper/src/keeper.rs @@ -7,7 +7,7 @@ use std::{ use anyhow::Context as _; use tokio::sync::watch; use tracing::{info_span, Instrument}; -use zksync_health_check::{Health, HealthStatus, HealthUpdater, ReactiveHealthCheck}; +use zksync_health_check::{HealthUpdater, ReactiveHealthCheck}; use zksync_multivm::{ interface::{ executor::{BatchExecutor, BatchExecutorFactory}, From 16d13582c31a09f892c11d1b76c75edd7123f0f7 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Tue, 19 Nov 2024 17:00:43 +0100 Subject: [PATCH 57/60] feat: integrate binary metadata into AppHealth --- Cargo.lock | 16 ++++++++++--- Cargo.toml | 2 ++ .../external_node/src/metrics/framework.rs | 2 +- core/lib/bin_metadata/Cargo.toml | 18 +++++++++++++++ .../bin_metadata}/build.rs | 0 .../binary.rs => lib/bin_metadata/src/lib.rs} | 22 ------------------ core/lib/health_check/Cargo.toml | 1 + core/lib/health_check/src/binary.rs | 21 +++++++++++++++++ core/lib/health_check/src/lib.rs | 5 +++- core/node/node_framework/Cargo.toml | 1 + .../layers/healtcheck_server.rs | 5 ---- core/node/shared_metrics/Cargo.toml | 6 +---- core/node/shared_metrics/src/lib.rs | 23 ++++++++++++++++--- 13 files changed, 82 insertions(+), 40 deletions(-) create mode 100644 core/lib/bin_metadata/Cargo.toml rename core/{node/shared_metrics => lib/bin_metadata}/build.rs (100%) rename core/{node/shared_metrics/src/binary.rs => lib/bin_metadata/src/lib.rs} (65%) create mode 100644 core/lib/health_check/src/binary.rs diff --git a/Cargo.lock b/Cargo.lock index 9ec7a0d3e20c..1dd906d54ef8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11051,6 +11051,16 @@ dependencies = [ "zksync_pairing", ] +[[package]] +name = "zksync_bin_metadata" +version = "0.1.0" 
+dependencies = [ + "rustc_version 0.4.1", + "serde", + "tracing", + "vise", +] + [[package]] name = "zksync_block_reverter" version = "0.1.0" @@ -11813,6 +11823,7 @@ dependencies = [ "tokio", "tracing", "vise", + "zksync_bin_metadata", ] [[package]] @@ -12148,6 +12159,7 @@ dependencies = [ "tracing", "trybuild", "zksync_base_token_adjuster", + "zksync_bin_metadata", "zksync_block_reverter", "zksync_circuit_breaker", "zksync_commitment_generator", @@ -12495,13 +12507,11 @@ dependencies = [ name = "zksync_shared_metrics" version = "0.1.0" dependencies = [ - "async-trait", - "rustc_version 0.4.1", "serde", "tracing", "vise", + "zksync_bin_metadata", "zksync_dal", - "zksync_health_check", "zksync_types", ] diff --git a/Cargo.toml b/Cargo.toml index af7620a5216f..f13fc693d89a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -79,6 +79,7 @@ members = [ # Test infrastructure "core/tests/loadnext", "core/tests/vm-benchmark", + "core/lib/bin_metadata", ] resolver = "2" @@ -275,6 +276,7 @@ zksync_health_check = { version = "0.1.0", path = "core/lib/health_check" } zksync_l1_contract_interface = { version = "0.1.0", path = "core/lib/l1_contract_interface" } zksync_mempool = { version = "0.1.0", path = "core/lib/mempool" } zksync_merkle_tree = { version = "0.1.0", path = "core/lib/merkle_tree" } +zksync_bin_metadata = { version = "0.1.0", path = "core/lib/bin_metadata" } zksync_mini_merkle_tree = { version = "0.1.0", path = "core/lib/mini_merkle_tree" } zksync_object_store = { version = "0.1.0", path = "core/lib/object_store" } zksync_protobuf_config = { version = "0.1.0", path = "core/lib/protobuf_config" } diff --git a/core/bin/external_node/src/metrics/framework.rs b/core/bin/external_node/src/metrics/framework.rs index 4afa6081a8d6..20ebf52474ab 100644 --- a/core/bin/external_node/src/metrics/framework.rs +++ b/core/bin/external_node/src/metrics/framework.rs @@ -5,7 +5,7 @@ use zksync_node_framework::{ implementations::resources::pools::{MasterPool, PoolResource}, FromContext, 
IntoContext, StopReceiver, Task, TaskId, WiringError, WiringLayer, }; -use zksync_shared_metrics::binary::BIN_METRICS; +use zksync_shared_metrics::BIN_METRICS; use zksync_types::{L1ChainId, L2ChainId, SLChainId}; use super::EN_METRICS; diff --git a/core/lib/bin_metadata/Cargo.toml b/core/lib/bin_metadata/Cargo.toml new file mode 100644 index 000000000000..e529ecfb49a7 --- /dev/null +++ b/core/lib/bin_metadata/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "zksync_bin_metadata" +version.workspace = true +edition.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +license.workspace = true +keywords.workspace = true +categories.workspace = true + +[dependencies] +serde.workspace = true +vise.workspace = true +tracing.workspace = true + +[build-dependencies] +rustc_version.workspace = true diff --git a/core/node/shared_metrics/build.rs b/core/lib/bin_metadata/build.rs similarity index 100% rename from core/node/shared_metrics/build.rs rename to core/lib/bin_metadata/build.rs diff --git a/core/node/shared_metrics/src/binary.rs b/core/lib/bin_metadata/src/lib.rs similarity index 65% rename from core/node/shared_metrics/src/binary.rs rename to core/lib/bin_metadata/src/lib.rs index a3ba8d7f5e1e..8c332890329a 100644 --- a/core/node/shared_metrics/src/binary.rs +++ b/core/lib/bin_metadata/src/lib.rs @@ -1,7 +1,5 @@ -use async_trait::async_trait; use serde::Serialize; use vise::{EncodeLabelSet, Info, Metrics}; -use zksync_health_check::{CheckHealth, Health, HealthStatus}; use self::values::BIN_METADATA; @@ -36,23 +34,3 @@ impl BinMetrics { self.info.set(BIN_METADATA).ok(); } } - -#[vise::register] -pub static BIN_METRICS: vise::Global = vise::Global::new(); - -impl From<&BinMetadata> for Health { - fn from(details: &BinMetadata) -> Self { - Self::from(HealthStatus::Ready).with_details(details) - } -} - -#[async_trait] -impl CheckHealth for BinMetadata { - fn name(&self) -> &'static str { - "metadata" - } - - async fn 
check_health(&self) -> Health { - self.into() - } -} diff --git a/core/lib/health_check/Cargo.toml b/core/lib/health_check/Cargo.toml index 6f1d863d8cec..0e823c848ce5 100644 --- a/core/lib/health_check/Cargo.toml +++ b/core/lib/health_check/Cargo.toml @@ -20,6 +20,7 @@ serde_json.workspace = true thiserror.workspace = true tokio = { workspace = true, features = ["sync", "time"] } tracing.workspace = true +zksync_bin_metadata.workspace = true [dev-dependencies] assert_matches.workspace = true diff --git a/core/lib/health_check/src/binary.rs b/core/lib/health_check/src/binary.rs new file mode 100644 index 000000000000..b14ed2ed9392 --- /dev/null +++ b/core/lib/health_check/src/binary.rs @@ -0,0 +1,21 @@ +use async_trait::async_trait; +use zksync_bin_metadata::BinMetadata; + +use crate::{CheckHealth, Health, HealthStatus}; + +impl From<&BinMetadata> for Health { + fn from(details: &BinMetadata) -> Self { + Self::from(HealthStatus::Ready).with_details(details) + } +} + +#[async_trait] +impl CheckHealth for BinMetadata { + fn name(&self) -> &'static str { + "metadata" + } + + async fn check_health(&self) -> Health { + self.into() + } +} diff --git a/core/lib/health_check/src/lib.rs b/core/lib/health_check/src/lib.rs index e4e8ba3c9a58..7f27ef2ce753 100644 --- a/core/lib/health_check/src/lib.rs +++ b/core/lib/health_check/src/lib.rs @@ -11,11 +11,14 @@ pub use async_trait::async_trait; use futures::future; use serde::Serialize; use tokio::sync::watch; +use zksync_bin_metadata::values::BIN_METADATA; use self::metrics::{CheckResult, METRICS}; use crate::metrics::AppHealthCheckConfig; +mod binary; mod metrics; + #[cfg(test)] mod tests; @@ -235,7 +238,7 @@ impl AppHealthCheck { .map(|health| health.status) .max_by_key(|status| status.priority_for_aggregation()) .unwrap_or(HealthStatus::Ready); - let inner = aggregated_status.into(); + let inner = Health::with_details(aggregated_status.into(), BIN_METADATA); let health = AppHealth { inner, components }; if 
!health.inner.status.is_healthy() { diff --git a/core/node/node_framework/Cargo.toml b/core/node/node_framework/Cargo.toml index eec9b8ef4b7a..6334495885f3 100644 --- a/core/node/node_framework/Cargo.toml +++ b/core/node/node_framework/Cargo.toml @@ -41,6 +41,7 @@ zksync_vm_executor.workspace = true zksync_state_keeper.workspace = true zksync_consistency_checker.workspace = true zksync_metadata_calculator.workspace = true +zksync_bin_metadata.workspace = true zksync_node_sync.workspace = true zksync_node_api_server.workspace = true zksync_node_consensus.workspace = true diff --git a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs index 8d522187e01e..83a74c63cb45 100644 --- a/core/node/node_framework/src/implementations/layers/healtcheck_server.rs +++ b/core/node/node_framework/src/implementations/layers/healtcheck_server.rs @@ -3,7 +3,6 @@ use std::sync::Arc; use zksync_config::configs::api::HealthCheckConfig; use zksync_health_check::AppHealthCheck; use zksync_node_api_server::healthcheck::HealthCheckHandle; -use zksync_shared_metrics::binary::values::BIN_METADATA; use crate::{ implementations::resources::healthcheck::AppHealthCheckResource, @@ -48,10 +47,6 @@ impl WiringLayer for HealthCheckLayer { let AppHealthCheckResource(app_health_check) = input.app_health_check; app_health_check.override_limits(self.0.slow_time_limit(), self.0.hard_time_limit()); - app_health_check - .insert_custom_component(Arc::new(BIN_METADATA)) - .map_err(WiringError::internal)?; - let health_check_task = HealthCheckTask { config: self.0, app_health_check, diff --git a/core/node/shared_metrics/Cargo.toml b/core/node/shared_metrics/Cargo.toml index 74eb5e925bf3..23c669b4f963 100644 --- a/core/node/shared_metrics/Cargo.toml +++ b/core/node/shared_metrics/Cargo.toml @@ -11,13 +11,9 @@ keywords.workspace = true categories.workspace = true [dependencies] -async-trait.workspace = true 
serde.workspace = true vise.workspace = true tracing.workspace = true zksync_types.workspace = true zksync_dal.workspace = true -zksync_health_check.workspace = true - -[build-dependencies] -rustc_version.workspace = true +zksync_bin_metadata.workspace = true diff --git a/core/node/shared_metrics/src/lib.rs b/core/node/shared_metrics/src/lib.rs index 9941eabcb18f..1615233cebd5 100644 --- a/core/node/shared_metrics/src/lib.rs +++ b/core/node/shared_metrics/src/lib.rs @@ -3,13 +3,13 @@ use std::{fmt, time::Duration}; use vise::{ - Buckets, Counter, EncodeLabelSet, EncodeLabelValue, Family, Gauge, Histogram, Metrics, Unit, + Buckets, Counter, EncodeLabelSet, EncodeLabelValue, Family, Gauge, Histogram, Info, Metrics, + Unit, }; +use zksync_bin_metadata::{values::BIN_METADATA, BinMetadata}; use zksync_dal::transactions_dal::L2TxSubmissionResult; use zksync_types::aggregated_operations::AggregatedActionType; -pub mod binary; - #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, EncodeLabelValue, EncodeLabelSet)] #[metrics(label = "stage", rename_all = "snake_case")] pub enum SnapshotRecoveryStage { @@ -196,3 +196,20 @@ pub struct ExternalNodeMetrics { #[vise::register] pub static EN_METRICS: vise::Global = vise::Global::new(); + +#[derive(Debug, Metrics)] +#[metrics(prefix = "rust")] +pub struct BinMetrics { + /// General information about the compiled binary. 
+ info: Info, +} + +impl BinMetrics { + pub fn initialize(&self) { + tracing::info!("Metadata for this binary: {BIN_METADATA:?}"); + self.info.set(BIN_METADATA).ok(); + } +} + +#[vise::register] +pub static BIN_METRICS: vise::Global = vise::Global::new(); From b6bdda43ed6164bccc3025e95f5f95c469501306 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 21 Nov 2024 11:05:02 +0100 Subject: [PATCH 58/60] feat: split git metrics from rust metrics --- .../external_node/src/metrics/framework.rs | 5 +- core/lib/bin_metadata/build.rs | 17 +++--- core/lib/bin_metadata/src/lib.rs | 57 ++++++++++++++----- core/lib/health_check/src/lib.rs | 13 ++++- core/node/shared_metrics/src/lib.rs | 22 ++----- 5 files changed, 72 insertions(+), 42 deletions(-) diff --git a/core/bin/external_node/src/metrics/framework.rs b/core/bin/external_node/src/metrics/framework.rs index 20ebf52474ab..228af8aa0417 100644 --- a/core/bin/external_node/src/metrics/framework.rs +++ b/core/bin/external_node/src/metrics/framework.rs @@ -5,7 +5,7 @@ use zksync_node_framework::{ implementations::resources::pools::{MasterPool, PoolResource}, FromContext, IntoContext, StopReceiver, Task, TaskId, WiringError, WiringLayer, }; -use zksync_shared_metrics::BIN_METRICS; +use zksync_shared_metrics::{GIT_METRICS, RUST_METRICS}; use zksync_types::{L1ChainId, L2ChainId, SLChainId}; use super::EN_METRICS; @@ -39,7 +39,8 @@ impl WiringLayer for ExternalNodeMetricsLayer { } async fn wire(self, input: Self::Input) -> Result { - BIN_METRICS.initialize(); + RUST_METRICS.initialize(); + GIT_METRICS.initialize(); EN_METRICS.observe_config( self.l1_chain_id, self.sl_chain_id, diff --git a/core/lib/bin_metadata/build.rs b/core/lib/bin_metadata/build.rs index 6cf4ca4a32a0..ff339ae5589c 100644 --- a/core/lib/bin_metadata/build.rs +++ b/core/lib/bin_metadata/build.rs @@ -14,15 +14,18 @@ fn print_binary_meta(out: &mut impl Write) -> io::Result<()> { writeln!( out, - "pub const BIN_METADATA: BinMetadata = BinMetadata {{ \ - 
rustc_version: {semver:?}, \ - rustc_commit_hash: {commit_hash:?}, \ - rustc_commit_date: {commit_date:?}, \ - rustc_channel: {channel:?}, \ + "pub const RUST_METADATA: RustMetadata = RustMetadata {{ \ + version: {semver:?}, \ + commit_hash: {commit_hash:?}, \ + commit_date: {commit_date:?}, \ + channel: {channel:?}, \ host: {host:?}, \ llvm: {llvm:?}, \ - git_branch: {git_branch:?}, \ - git_revision: {git_revision:?} \ + }}; + + pub const GIT_METADATA: GitMetadata = GitMetadata {{ \ + branch: {git_branch:?}, \ + revision: {git_revision:?} \ }};", semver = rustc_meta.semver.to_string(), commit_hash = rustc_meta.commit_hash, diff --git a/core/lib/bin_metadata/src/lib.rs b/core/lib/bin_metadata/src/lib.rs index 8c332890329a..d6f8ca73ed83 100644 --- a/core/lib/bin_metadata/src/lib.rs +++ b/core/lib/bin_metadata/src/lib.rs @@ -1,36 +1,65 @@ use serde::Serialize; use vise::{EncodeLabelSet, Info, Metrics}; -use self::values::BIN_METADATA; +use self::values::GIT_METADATA; +use self::values::RUST_METADATA; pub mod values { - use super::BinMetadata; + use super::GitMetadata; + use super::RustMetadata; + include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); } /// Metadata of the compiled binary. -#[derive(Debug, EncodeLabelSet, Serialize)] +#[derive(Debug, Serialize)] pub struct BinMetadata { - pub rustc_version: &'static str, - pub rustc_commit_hash: Option<&'static str>, - pub rustc_commit_date: Option<&'static str>, - pub rustc_channel: &'static str, + pub rust: RustMetadata, + pub git: GitMetadata, +} + +/// Rust metadata of the compiled binary. +#[derive(Debug, EncodeLabelSet, Serialize)] +pub struct RustMetadata { + pub version: &'static str, + pub commit_hash: Option<&'static str>, + pub commit_date: Option<&'static str>, + pub channel: &'static str, pub host: &'static str, pub llvm: Option<&'static str>, - pub git_branch: Option<&'static str>, - pub git_revision: Option<&'static str>, +} + +/// Git metadata of the compiled binary. 
+#[derive(Debug, EncodeLabelSet, Serialize)] +pub struct GitMetadata { + pub branch: Option<&'static str>, + pub revision: Option<&'static str>, } #[derive(Debug, Metrics)] #[metrics(prefix = "rust")] -pub struct BinMetrics { +pub struct RustMetrics { + /// General information about the compiled binary. + info: Info, +} + +impl RustMetrics { + pub fn initialize(&self) { + tracing::info!("Rust metadata for this binary: {RUST_METADATA:?}"); + self.info.set(RUST_METADATA).ok(); + } +} + +#[derive(Debug, Metrics)] +#[metrics(prefix = "git_info")] +pub struct GitMetrics { /// General information about the compiled binary. - info: Info, + info: Info, } -impl BinMetrics { +impl GitMetrics { pub fn initialize(&self) { - tracing::info!("Metadata for this binary: {BIN_METADATA:?}"); - self.info.set(BIN_METADATA).ok(); + tracing::info!("Git metadata for this binary: {GIT_METADATA:?}"); + self.info.set(GIT_METADATA).ok(); } } diff --git a/core/lib/health_check/src/lib.rs b/core/lib/health_check/src/lib.rs index 7f27ef2ce753..be54c5a2eec2 100644 --- a/core/lib/health_check/src/lib.rs +++ b/core/lib/health_check/src/lib.rs @@ -11,7 +11,10 @@ pub use async_trait::async_trait; use futures::future; use serde::Serialize; use tokio::sync::watch; -use zksync_bin_metadata::values::BIN_METADATA; +use zksync_bin_metadata::{ + values::{GIT_METADATA, RUST_METADATA}, + BinMetadata, +}; use self::metrics::{CheckResult, METRICS}; use crate::metrics::AppHealthCheckConfig; @@ -238,7 +241,13 @@ impl AppHealthCheck { .map(|health| health.status) .max_by_key(|status| status.priority_for_aggregation()) .unwrap_or(HealthStatus::Ready); - let inner = Health::with_details(aggregated_status.into(), BIN_METADATA); + let inner = Health::with_details( + aggregated_status.into(), + BinMetadata { + rust: RUST_METADATA, + git: GIT_METADATA, + }, + ); let health = AppHealth { inner, components }; if !health.inner.status.is_healthy() { diff --git a/core/node/shared_metrics/src/lib.rs 
b/core/node/shared_metrics/src/lib.rs index 1615233cebd5..e37764c5a6d7 100644 --- a/core/node/shared_metrics/src/lib.rs +++ b/core/node/shared_metrics/src/lib.rs @@ -3,10 +3,9 @@ use std::{fmt, time::Duration}; use vise::{ - Buckets, Counter, EncodeLabelSet, EncodeLabelValue, Family, Gauge, Histogram, Info, Metrics, - Unit, + Buckets, Counter, EncodeLabelSet, EncodeLabelValue, Family, Gauge, Histogram, Metrics, Unit, }; -use zksync_bin_metadata::{values::BIN_METADATA, BinMetadata}; +use zksync_bin_metadata::{GitMetrics, RustMetrics}; use zksync_dal::transactions_dal::L2TxSubmissionResult; use zksync_types::aggregated_operations::AggregatedActionType; @@ -197,19 +196,8 @@ pub struct ExternalNodeMetrics { #[vise::register] pub static EN_METRICS: vise::Global = vise::Global::new(); -#[derive(Debug, Metrics)] -#[metrics(prefix = "rust")] -pub struct BinMetrics { - /// General information about the compiled binary. - info: Info, -} - -impl BinMetrics { - pub fn initialize(&self) { - tracing::info!("Metadata for this binary: {BIN_METADATA:?}"); - self.info.set(BIN_METADATA).ok(); - } -} +#[vise::register] +pub static RUST_METRICS: vise::Global = vise::Global::new(); #[vise::register] -pub static BIN_METRICS: vise::Global = vise::Global::new(); +pub static GIT_METRICS: vise::Global = vise::Global::new(); From 785fe5bbe0fda20fdfaee37cf7dc3df990cebfaf Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 21 Nov 2024 11:05:56 +0100 Subject: [PATCH 59/60] style: format code --- core/lib/bin_metadata/src/lib.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/core/lib/bin_metadata/src/lib.rs b/core/lib/bin_metadata/src/lib.rs index d6f8ca73ed83..955198401cf0 100644 --- a/core/lib/bin_metadata/src/lib.rs +++ b/core/lib/bin_metadata/src/lib.rs @@ -1,12 +1,10 @@ use serde::Serialize; use vise::{EncodeLabelSet, Info, Metrics}; -use self::values::GIT_METADATA; -use self::values::RUST_METADATA; +use self::values::{GIT_METADATA, RUST_METADATA}; pub mod values 
{ - use super::GitMetadata; - use super::RustMetadata; + use super::{GitMetadata, RustMetadata}; include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); } From e30ebacfafa1456334ff8f24a857e925ab149916 Mon Sep 17 00:00:00 2001 From: Manuel Mauro Date: Thu, 21 Nov 2024 11:46:35 +0100 Subject: [PATCH 60/60] refactor: nit BinMetadata creation --- core/lib/bin_metadata/src/lib.rs | 5 +++++ core/lib/health_check/src/lib.rs | 13 ++----------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/core/lib/bin_metadata/src/lib.rs b/core/lib/bin_metadata/src/lib.rs index 955198401cf0..d8a5221e4775 100644 --- a/core/lib/bin_metadata/src/lib.rs +++ b/core/lib/bin_metadata/src/lib.rs @@ -9,6 +9,11 @@ pub mod values { include!(concat!(env!("OUT_DIR"), "/metadata_values.rs")); } +pub const BIN_METADATA: BinMetadata = BinMetadata { + rust: RUST_METADATA, + git: GIT_METADATA, +}; + /// Metadata of the compiled binary. #[derive(Debug, Serialize)] pub struct BinMetadata { diff --git a/core/lib/health_check/src/lib.rs b/core/lib/health_check/src/lib.rs index be54c5a2eec2..7dcdb47aa2f9 100644 --- a/core/lib/health_check/src/lib.rs +++ b/core/lib/health_check/src/lib.rs @@ -11,10 +11,7 @@ pub use async_trait::async_trait; use futures::future; use serde::Serialize; use tokio::sync::watch; -use zksync_bin_metadata::{ - values::{GIT_METADATA, RUST_METADATA}, - BinMetadata, -}; +use zksync_bin_metadata::BIN_METADATA; use self::metrics::{CheckResult, METRICS}; use crate::metrics::AppHealthCheckConfig; @@ -241,13 +238,7 @@ impl AppHealthCheck { .map(|health| health.status) .max_by_key(|status| status.priority_for_aggregation()) .unwrap_or(HealthStatus::Ready); - let inner = Health::with_details( - aggregated_status.into(), - BinMetadata { - rust: RUST_METADATA, - git: GIT_METADATA, - }, - ); + let inner = Health::with_details(aggregated_status.into(), BIN_METADATA); let health = AppHealth { inner, components }; if !health.inner.status.is_healthy() {