Skip to content

Commit

Permalink
feat(validator): add support to validate kepler metrics
Browse files Browse the repository at this point in the history
This commit adds support to validate essential metrics produced by
Kepler

Signed-off-by: vprashar2929 <[email protected]>
  • Loading branch information
vprashar2929 committed Nov 6, 2024
1 parent 25bbaed commit 2eb2f41
Show file tree
Hide file tree
Showing 3 changed files with 349 additions and 0 deletions.
319 changes: 319 additions & 0 deletions e2e/tools/validator/metric_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,319 @@
config:
mapping:
actual: latest
predicted: dev

validations:
# absolute power comparison
- name: Total - absolute
  units: Watts  # rate() over a joules counter is joules/second
  latest: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="latest",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_joules_total{{
          job="dev",
        }}[{rate_interval}]
      )
    )
  max_mae: 0.5

- name: platform - absolute
  units: Watts  # rate() over a joules counter is joules/second
  latest: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="latest",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_platform_joules_total{{
          job="dev",
        }}[{rate_interval}]
      )
    )
  max_mae: 0.5

- name: package - absolute
  units: Watts  # rate() over a joules counter is joules/second
  latest: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="latest",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_package_joules_total{{
          job="dev",
        }}[{rate_interval}]
      )
    )
  max_mae: 0.5

- name: dram - absolute
  units: Watts  # rate() over a joules counter is joules/second
  latest: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="latest",
        }}[{rate_interval}]
      )
    )
  dev: |
    sum(
      rate(
        kepler_process_dram_joules_total{{
          job="dev",
        }}[{rate_interval}]
      )
    )
  max_mae: 0.5

- name: core - absolute
units: Watts
latest: |
sum(
rate(
kepler_process_core_joules_total{{
job="latest",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_core_joules_total{{
job="dev",
}}[{rate_interval}]
)
)
max_mae: 0.5

- name: other - absolute
units: Watts
latest: |
sum(
rate(
kepler_process_other_joules_total{{
job="latest",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_other_joules_total{{
job="dev",
}}[{rate_interval}]
)
)
max_mae: 0.5

# CPU time comparison
- name: cpu-time
units: Milliseconds
latest: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="latest"
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="dev",
}}[{rate_interval}]
)
)
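# NOTE(review): max_mae is left disabled for cpu-time, so this entry configures no error threshold; confirm this is intentional.
# max_mae: 20.0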

- name: package - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.5

- name: core - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.5

- name: dram - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.5

- name: other - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.5

# Node-level dynamic power comparison

Check warning on line 235 in e2e/tools/validator/metric_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

235:1 [comments-indentation] comment not indented like content
- name: node platform - dynamic
units: Watts
latest: |
rate(kepler_node_platform_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_platform_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.5

- name: node package - dynamic
units: Watts
latest: |
rate(kepler_node_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.5

- name: node core - dynamic
units: Watts
latest: |
rate(kepler_node_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.5

- name: node dram - dynamic
units: Watts
latest: |
rate(kepler_node_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.5

- name: node other - dynamic
units: Watts
latest: |
rate(kepler_node_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.5
29 changes: 29 additions & 0 deletions e2e/tools/validator/src/validator/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
from click.exceptions import Exit
from matplotlib import ticker
from matplotlib.dates import DateFormatter

Expand Down Expand Up @@ -616,6 +617,34 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
return int(res.validations.passed)


@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
@click.option(
    "--report-dir",
    "-o",
    # `/tmp` is intentionally used as the default reporting directory. The
    # suppression is line-scoped: a `# ruff: noqa: S108` comment would instead
    # disable the rule for the entire file.
    default="/tmp",  # noqa: S108
    type=click.Path(exists=True, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    """Run the configured metric validations over the trailing DURATION and write a report.

    Exits with status 0 when the validations passed and 1 otherwise.
    """
    results_dir, tag = create_report_dir(report_dir)
    res = TestResult(tag)

    # The validated window is the `duration` immediately preceding "now" (UTC).
    res.end_time = datetime.datetime.now(tz=datetime.UTC)
    res.start_time = res.end_time - duration

    click.secho(" * Generating build and node info ...", fg="green")
    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)

    click.secho(" * Generating spec report ...", fg="green")
    res.host_spec = get_host_spec()

    # No script is run by this command; the ScriptResult only carries the
    # start/end timestamps that are handed to run_validations.
    script_result = ScriptResult(res.start_time, res.end_time)
    res.validations = run_validations(cfg, script_result, results_dir)

    click.secho(" * Generating validate metrics report file and dir", fg="green")
    write_md_report(results_dir, res)

    # Propagate the validation outcome to the caller through the exit code.
    raise Exit(0 if res.validations.passed else 1)


def write_json_report(results_dir: str, res: TestResult):
pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')

Expand Down
1 change: 1 addition & 0 deletions e2e/tools/validator/src/validator/validations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
predicted_label=predicted_label,
units=v.get("units", ""),
max_mape=v.get("max_mape"),
max_mae=v.get("max_mae"),
)

return [validation_from_yaml(v) for v in yml["validations"]]
Expand Down

0 comments on commit 2eb2f41

Please sign in to comment.