feat(prover): Add min_provers and dry_run features. Improve metrics and test. (#3129)

## What ❔

Improve metrics and test. Add min_provers config. Add dry_run config option for Agent.

## Why ❔

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog entries from PRs).
- [x] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev lint`.
matter-labs · Oct 21, 2024 · 7c28964 · 7c28964
1 parent 0757ecd
commit 7c28964
Show file tree

Hide file tree

Showing 13 changed files with 467 additions and 77 deletions.
diff --git a/core/lib/config/src/configs/prover_autoscaler.rs b/core/lib/config/src/configs/prover_autoscaler.rs
@@ -30,6 +30,9 @@ pub struct ProverAutoscalerAgentConfig {
     pub namespaces: Vec<String>,
     /// Watched cluster name. Also can be set via flag.
     pub cluster_name: Option<String>,
+    /// If dry-run enabled don't do any k8s updates, just report success.
+    #[serde(default = "ProverAutoscalerAgentConfig::default_dry_run")]
+    pub dry_run: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Deserialize, Default)]
@@ -53,6 +56,8 @@ pub struct ProverAutoscalerScalerConfig {
     pub prover_speed: HashMap<Gpu, u32>,
     /// Maximum number of provers which can be run per cluster/GPU.
     pub max_provers: HashMap<String, HashMap<Gpu, u32>>,
+    /// Minimum number of provers per namespace.
+    pub min_provers: HashMap<String, u32>,
     /// Duration after which pending pod considered long pending.
     #[serde(default = "ProverAutoscalerScalerConfig::default_long_pending_duration")]
     pub long_pending_duration: Duration,
@@ -99,6 +104,10 @@ impl ProverAutoscalerAgentConfig {
     /// Returns the built-in namespace list: `prover-blue` and `prover-red`.
     pub fn default_namespaces() -> Vec<String> {
         ["prover-blue", "prover-red"]
             .iter()
             .map(|&ns| String::from(ns))
             .collect()
     }
+
+    pub fn default_dry_run() -> bool { // fallback for `dry_run` when the field is omitted (serde default / proto `unwrap_or`)
+        true // dry-run is on by default, so real k8s updates require setting `dry_run` to false
+    }
 }
 
 impl ProverAutoscalerScalerConfig {

diff --git a/core/lib/protobuf_config/src/proto/config/prover_autoscaler.proto b/core/lib/protobuf_config/src/proto/config/prover_autoscaler.proto
@@ -17,6 +17,7 @@ message ProverAutoscalerAgentConfig {
   optional uint32 http_port = 2; // required
   repeated string namespaces = 3; // optional
   optional string cluster_name = 4; // optional
+  optional bool dry_run = 5; // optional
 }
 
 message ProtocolVersion {
@@ -39,6 +40,11 @@ message MaxProver {
   optional uint32 max = 2; // required
 }
 
+message MinProver { // one (namespace, min) entry of the scaler's `min_provers` map
+  optional string namespace = 1; // required; namespace the minimum applies to
+  optional uint32 min = 2; // required; minimum number of provers for that namespace
+}
+
 message ProverAutoscalerScalerConfig {
   optional uint32 prometheus_port = 1; // required
   optional std.Duration scaler_run_interval = 2; // optional
@@ -49,4 +55,5 @@ message ProverAutoscalerScalerConfig {
   repeated ProverSpeed prover_speed = 7; // optional
   optional uint32 long_pending_duration_s = 8; // optional
   repeated MaxProver max_provers = 9; // optional
+  repeated MinProver min_provers = 10; // optional
 }
diff --git a/core/lib/protobuf_config/src/prover_autoscaler.rs b/core/lib/protobuf_config/src/prover_autoscaler.rs
@@ -1,6 +1,6 @@
 use std::collections::HashMap;
 
-use anyhow::Context as _;
+use anyhow::Context;
 use time::Duration;
 use zksync_config::configs::{self, prover_autoscaler::Gpu};
 use zksync_protobuf::{read_optional, repr::ProtoRepr, required, ProtoFmt};
@@ -42,6 +42,7 @@ impl ProtoRepr for proto::ProverAutoscalerAgentConfig {
                 .context("http_port")?,
             namespaces: self.namespaces.to_vec(),
             cluster_name: Some("".to_string()),
+            dry_run: self.dry_run.unwrap_or(Self::Type::default_dry_run()),
         })
     }
 
@@ -51,6 +52,7 @@ impl ProtoRepr for proto::ProverAutoscalerAgentConfig {
             http_port: Some(this.http_port.into()),
             namespaces: this.namespaces.clone(),
             cluster_name: this.cluster_name.clone(),
+            dry_run: Some(this.dry_run),
         }
     }
 }
@@ -103,6 +105,13 @@ impl ProtoRepr for proto::ProverAutoscalerScalerConfig {
                 }
                 acc
             }),
+            min_provers: self
+                .min_provers
+                .iter()
+                .enumerate()
+                .map(|(i, e)| e.read().context(i))
+                .collect::<Result<_, _>>()
+                .context("min_provers")?,
         })
     }
 
@@ -137,6 +146,11 @@ impl ProtoRepr for proto::ProverAutoscalerScalerConfig {
                     })
                 })
                 .collect(),
+            min_provers: this
+                .min_provers
+                .iter()
+                .map(|(k, v)| proto::MinProver::build(&(k.clone(), *v)))
+                .collect(),
         }
     }
 }
@@ -208,3 +222,19 @@ impl ProtoRepr for proto::MaxProver {
         }
     }
 }
+
+/// Maps a `proto::MinProver` message to and from a `(namespace, min)` pair.
+impl ProtoRepr for proto::MinProver {
+    type Type = (String, u32);
+
+    /// Reads the `(namespace, min)` pair out of the message.
+    ///
+    /// Each field is passed through `required`; a failure is reported with the
+    /// field name (`namespace` or `min`) attached as error context.
+    fn read(&self) -> anyhow::Result<Self::Type> {
+        let namespace = required(&self.namespace).context("namespace")?;
+        let min = required(&self.min).context("min")?;
+        Ok((namespace.clone(), *min))
+    }
+
+    /// Builds a message with both fields populated from the `(namespace, min)` pair.
+    fn build(this: &Self::Type) -> Self {
+        let (namespace, min) = this;
+        Self {
+            namespace: Some(namespace.to_string()),
+            min: Some(*min),
+        }
+    }
+}
diff --git a/prover/Cargo.lock b/prover/Cargo.lock
diff --git a/prover/Cargo.toml b/prover/Cargo.toml
@@ -58,6 +58,7 @@ tokio-util = "0.7.11"
 toml_edit = "0.14.4"
 tracing = "0.1"
 tracing-subscriber = "0.3"
+tracing-test = "0.2.5"
 url = "2.5.2"
 vise = "0.2.0"
 

diff --git a/prover/crates/bin/prover_autoscaler/Cargo.toml b/prover/crates/bin/prover_autoscaler/Cargo.toml
@@ -43,3 +43,4 @@ tracing-subscriber = { workspace = true, features = ["env-filter"] }
 tracing.workspace = true
 url.workspace = true
 vise.workspace = true
+tracing-test.workspace = true
diff --git a/prover/crates/bin/prover_autoscaler/src/global/queuer.rs b/prover/crates/bin/prover_autoscaler/src/global/queuer.rs
@@ -5,6 +5,10 @@ use reqwest::Method;
 use zksync_prover_job_monitor::autoscaler_queue_reporter::VersionedQueueReport;
 use zksync_utils::http_with_retries::send_request_with_retries;
 
+use crate::metrics::{AUTOSCALER_METRICS, DEFAULT_ERROR_CODE};
+
+const MAX_RETRIES: usize = 5; // retry limit handed to `send_request_with_retries` when fetching the queue
+
 #[derive(Debug)]
 pub struct Queue {
     pub queue: HashMap<String, u64>,
@@ -24,15 +28,19 @@ impl Queuer {
 
     pub async fn get_queue(&self) -> anyhow::Result<Queue> {
         let url = &self.prover_job_monitor_url;
-        let response = send_request_with_retries(url, 5, Method::GET, None, None).await;
-        let res = response
-            .map_err(|err| anyhow::anyhow!("Failed fetching queue from url: {url}: {err:?}"))?
+        let response = send_request_with_retries(url, MAX_RETRIES, Method::GET, None, None).await;
+        let response = response.map_err(|err| {
+            AUTOSCALER_METRICS.calls[&(url.clone(), DEFAULT_ERROR_CODE)].inc();
+            anyhow::anyhow!("Failed fetching queue from url: {url}: {err:?}")
+        })?;
+
+        AUTOSCALER_METRICS.calls[&(url.clone(), response.status().as_u16())].inc();
+        let response = response
             .json::<Vec<VersionedQueueReport>>()
             .await
             .context("Failed to read response as json")?;
-
         Ok(Queue {
-            queue: res
+            queue: response
                 .iter()
                 .map(|x| (x.version.to_string(), x.report.prover_jobs.queued as u64))
                 .collect::<HashMap<_, _>>(),