Skip to content

Commit

Permalink
feat(prover): Add queue metric to report autoscaler view of the queue. (#3206)
Browse files Browse the repository at this point in the history

## What ❔
Add a `queue` metric to report the autoscaler's view of the queue.
Derive the `Copy` trait for `QueueReportFields` and remove the clones and
references that are no longer needed.

<!-- What are the changes this PR brings about? -->
<!-- Example: This PR adds a PR template to the repo. -->
<!-- (For bigger PRs adding more context is appreciated) -->

## Why ❔

The `queue` metric will be used in dashboards.

<!-- Why are these changes done? What goal do they contribute to? What
are the principles behind them? -->
<!-- Example: PR templates ensure PR reviewers, observers, and future
iterators are in context about the evolution of repos. -->

## Checklist

<!-- Check your PR fulfills the following items. -->
<!-- For draft PRs check the boxes as you complete them. -->

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [ ] Tests for the changes have been added / updated.
- [ ] Documentation comments have been added / updated.
- [x] Code has been formatted via `zkstack dev fmt` and `zkstack dev
lint`.

ref ZKD-1855
  • Loading branch information
yorik authored Oct 31, 2024
1 parent 15bb6b5 commit 2721396
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 8 deletions.
2 changes: 1 addition & 1 deletion core/lib/config/src/configs/prover_autoscaler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ pub enum Gpu {

// TODO: generate this enum from QueueReport in https://github.com/matter-labs/zksync-era/blob/main/prover/crates/bin/prover_job_monitor/src/autoscaler_queue_reporter.rs#L23
// and drop the `non_camel_case_types` allowance by generating a field-name parser.
#[derive(Debug, Display, PartialEq, Eq, Hash, Clone, Deserialize, EnumString, Default)]
#[derive(Debug, Display, PartialEq, Eq, Hash, Clone, Copy, Deserialize, EnumString, Default)]
#[allow(non_camel_case_types)]
pub enum QueueReportFields {
#[strum(ascii_case_insensitive)]
Expand Down
6 changes: 3 additions & 3 deletions prover/crates/bin/prover_autoscaler/src/global/queuer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ pub struct Queuer {
pub prover_job_monitor_url: String,
}

fn target_to_queue(target: &QueueReportFields, report: &QueueReport) -> u64 {
fn target_to_queue(target: QueueReportFields, report: &QueueReport) -> u64 {
let res = match target {
QueueReportFields::basic_witness_jobs => report.basic_witness_jobs.all(),
QueueReportFields::leaf_witness_jobs => report.leaf_witness_jobs.all(),
Expand Down Expand Up @@ -65,8 +65,8 @@ impl Queuer {
.flat_map(|versioned_report| {
jobs.iter().map(move |j| {
(
(versioned_report.version.to_string(), j.clone()),
target_to_queue(j, &versioned_report.report),
(versioned_report.version.to_string(), *j),
target_to_queue(*j, &versioned_report.report),
)
})
})
Expand Down
8 changes: 5 additions & 3 deletions prover/crates/bin/prover_autoscaler/src/global/scaler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ impl Scaler {
let mut simple_scalers = Vec::default();
let mut jobs = vec![QueueReportFields::prover_jobs];
for c in &config.scaler_targets {
jobs.push(c.queue_report_field.clone());
jobs.push(c.queue_report_field);
simple_scalers.push(SimpleScaler::new(
c,
config.cluster_priorities.clone(),
Expand Down Expand Up @@ -429,7 +429,7 @@ impl SimpleScaler {
long_pending_duration: chrono::Duration,
) -> Self {
Self {
queue_report_field: config.queue_report_field.clone(),
queue_report_field: config.queue_report_field,
deployment: config.deployment.clone(),
cluster_priorities,
max_replicas: config.max_replicas.clone(),
Expand Down Expand Up @@ -671,6 +671,7 @@ impl Task for Scaler {
.get(&(ppv.to_string(), QueueReportFields::prover_jobs))
.cloned()
.unwrap_or(0);
AUTOSCALER_METRICS.queue[&(ns.clone(), "prover".into())].set(q);
tracing::debug!("Running eval for namespace {ns} and PPV {ppv} found queue {q}");
if q > 0 || is_namespace_running(ns, &guard.clusters) {
let provers = self.prover_scaler.run(ns, q, &guard.clusters);
Expand All @@ -684,9 +685,10 @@ impl Task for Scaler {
// Simple Scalers.
for scaler in &self.simple_scalers {
let q = queue
.get(&(ppv.to_string(), scaler.queue_report_field.clone()))
.get(&(ppv.to_string(), scaler.queue_report_field))
.cloned()
.unwrap_or(0);
AUTOSCALER_METRICS.queue[&(ns.clone(), scaler.deployment.clone())].set(q);
tracing::debug!("Running eval for namespace {ns}, PPV {ppv}, simple scaler {} found queue {q}", scaler.deployment);
if q > 0 || is_namespace_running(ns, &guard.clusters) {
let replicas = scaler.run(ns, q, &guard.clusters);
Expand Down
4 changes: 3 additions & 1 deletion prover/crates/bin/prover_autoscaler/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,9 @@ pub(crate) struct AutoscalerMetrics {
#[metrics(labels = ["target", "status"])]
pub calls: LabeledFamily<(String, u16), Counter, 2>,
#[metrics(labels = ["target_cluster"])]
pub scale_errors: LabeledFamily<String, Gauge<u64>, 1>,
pub scale_errors: LabeledFamily<String, Gauge<u64>>,
#[metrics(labels = ["target_namespace", "job"])]
pub queue: LabeledFamily<(String, String), Gauge<u64>, 2>,
}

#[vise::register]
Expand Down

0 comments on commit 2721396

Please sign in to comment.