# Metric validations comparing two Kepler deployments scraped as the
# Prometheus jobs `latest` and `dev`.
#
# Each validation supplies one PromQL query per job. Optional keys read by
# the validator are `units`, `max_mape` and `max_mae`.
#
# NOTE(review): braces are doubled (`{{ ... }}`) and `{rate_interval}` is a
# placeholder — presumably the queries go through `str.format` before being
# sent to Prometheus; confirm against the query loader.
config:
  # Which query key plays the "actual" role and which the "predicted" role.
  mapping:
    actual: latest
    predicted: dev

validations:
  # absolute power comparison
  # rate() over a joules counter yields joules/second, i.e. Watts.
  - name: Total - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: platform - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_platform_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_platform_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: package - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_package_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_package_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: dram - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_dram_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_dram_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: core - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_core_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_core_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: other - absolute
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_other_joules_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_other_joules_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  # CPU time comparison
  - name: cpu-time
    units: Milliseconds
    latest: |
      sum(
        rate(
          kepler_process_bpf_cpu_time_ms_total{{
            job="latest",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_bpf_cpu_time_ms_total{{
            job="dev",
          }}[{rate_interval}]
        )
      )
    # No max_mae is set for this validation; the threshold below is
    # intentionally left disabled.
    # max_mae: 20.0

  # dynamic power comparison (mode="dynamic" series only)
  - name: package - dynamic
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_package_joules_total{{
            job="latest",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_package_joules_total{{
            job="dev",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: core - dynamic
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_core_joules_total{{
            job="latest",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_core_joules_total{{
            job="dev",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: dram - dynamic
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_dram_joules_total{{
            job="latest",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_dram_joules_total{{
            job="dev",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  - name: other - dynamic
    units: Watts
    latest: |
      sum(
        rate(
          kepler_process_other_joules_total{{
            job="latest",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    dev: |
      sum(
        rate(
          kepler_process_other_joules_total{{
            job="dev",
            mode="dynamic",
          }}[{rate_interval}]
        )
      )
    max_mae: 0.5

  # Node comparison
  # Unlike the process-level queries above, these are not wrapped in sum().
  - name: node platform - dynamic
    units: Watts
    latest: |
      rate(
        kepler_node_platform_joules_total{{
          job="latest",
          mode="dynamic",
        }}[{rate_interval}]
      )
    dev: |
      rate(
        kepler_node_platform_joules_total{{
          job="dev",
          mode="dynamic",
        }}[{rate_interval}]
      )
    max_mae: 0.5

  - name: node package - dynamic
    units: Watts
    latest: |
      rate(
        kepler_node_package_joules_total{{
          job="latest",
          mode="dynamic",
        }}[{rate_interval}]
      )
    dev: |
      rate(
        kepler_node_package_joules_total{{
          job="dev",
          mode="dynamic",
        }}[{rate_interval}]
      )
    max_mae: 0.5

  - name: node core - dynamic
    units: Watts
    latest: |
      rate(
        kepler_node_core_joules_total{{
          job="latest",
          mode="dynamic",
        }}[{rate_interval}]
      )
    dev: |
      rate(
        kepler_node_core_joules_total{{
          job="dev",
          mode="dynamic",
        }}[{rate_interval}]
      )
    max_mae: 0.5

  - name: node dram - dynamic
    units: Watts
    latest: |
      rate(
        kepler_node_dram_joules_total{{
          job="latest",
          mode="dynamic",
        }}[{rate_interval}]
      )
    dev: |
      rate(
        kepler_node_dram_joules_total{{
          job="dev",
          mode="dynamic",
        }}[{rate_interval}]
      )
    max_mae: 0.5

  - name: node other - dynamic
    units: Watts
    latest: |
      rate(
        kepler_node_other_joules_total{{
          job="latest",
          mode="dynamic",
        }}[{rate_interval}]
      )
    dev: |
      rate(
        kepler_node_other_joules_total{{
          job="dev",
          mode="dynamic",
        }}[{rate_interval}]
      )
    max_mae: 0.5
@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory)
@click.option(
    "--report-dir",
    "-o",
    default="/tmp",
    # file_okay=False: the value is handed to create_report_dir() as a
    # directory, so an existing regular file is rejected while parsing options
    # instead of failing later.
    type=click.Path(exists=True, file_okay=False, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    """Validate Kepler metrics over the last DURATION and write a report.

    The validation window ends now (UTC) and reaches back DURATION. The
    command exits with status 0 when the validations passed and 1 otherwise.
    """
    results_dir, tag = create_report_dir(report_dir)
    res = TestResult(tag)

    # Window under validation: [now - duration, now], timezone-aware UTC.
    res.end_time = datetime.datetime.now(tz=datetime.UTC)
    res.start_time = res.end_time - duration

    click.secho("  * Generating build and node info ...", fg="green")
    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)

    click.secho("  * Generating spec report ...", fg="green")
    res.host_spec = get_host_spec()

    # No stress script is run here; ScriptResult only carries the time window
    # that run_validations() should query.
    script_result = ScriptResult(res.start_time, res.end_time)
    res.validations = run_validations(cfg, script_result, results_dir)

    click.secho("  * Generating validate metrics report file and dir", fg="green")
    write_md_report(results_dir, res)

    # Surface the outcome as the process exit status so callers can gate on it.
    raise Exit(0 if res.validations.passed else 1)