Skip to content

Commit

Permalink
feat(validator): add support to validate kepler metrics
Browse files Browse the repository at this point in the history
This commit adds support to validate essential metrics produced by
Kepler

Signed-off-by: vprashar2929 <[email protected]>
  • Loading branch information
vprashar2929 committed Nov 7, 2024
1 parent 25bbaed commit 25ba3bc
Show file tree
Hide file tree
Showing 5 changed files with 311 additions and 2 deletions.
279 changes: 279 additions & 0 deletions e2e/tools/validator/metric_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,279 @@
config:
mapping:
actual: latest
predicted: dev

validations:
# Absolute power comparison
- name: Total - absolute
latest: |
sum(
rate(
kepler_process_joules_total{{
job="latest",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_joules_total{{
job="dev",
}}[{rate_interval}]
)
)
max_mae: 0.50

# CPU time comparison
- name: cpu-time
units: Milliseconds
latest: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="latest"
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="dev",
}}[{rate_interval}]
)
)
# max_mae: 20.0
- name: platform - dynamic
latest: |
sum(
rate(
kepler_process_platform_joules_total{{
job="latest", mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_platform_joules_total{{
job="dev", mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

- name: package - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

- name: core - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

- name: dram - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

- name: other - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

- name: uncore - dynamic
units: Watts
latest: |
sum(
rate(
kepler_process_uncore_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
)
dev: |
sum(
rate(
kepler_process_uncore_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
)
max_mae: 0.50

# Node comparison

Check warning on line 178 in e2e/tools/validator/metric_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

178:1 [comments-indentation] comment not indented like content
- name: node platform - dynamic
units: Watts
latest: |
rate(kepler_node_platform_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_platform_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50

- name: node package - dynamic
units: Watts
latest: |
rate(kepler_node_package_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_package_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50

- name: node core - dynamic
units: Watts
latest: |
rate(kepler_node_core_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_core_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50

- name: node dram - dynamic
units: Watts
latest: |
rate(kepler_node_dram_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_dram_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50

- name: node other - dynamic
units: Watts
latest: |
rate(kepler_node_other_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_other_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50

- name: node uncore - dynamic
units: Watts
latest: |
rate(kepler_node_uncore_joules_total{{
job="latest",
mode="dynamic",
}}[{rate_interval}]
)
dev: |
rate(kepler_node_uncore_joules_total{{
job="dev",
mode="dynamic",
}}[{rate_interval}]
)
max_mae: 0.50
29 changes: 29 additions & 0 deletions e2e/tools/validator/src/validator/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import matplotlib.pyplot as plt
import numpy as np
import numpy.typing as npt
from click.exceptions import Exit
from matplotlib import ticker
from matplotlib.dates import DateFormatter

Expand Down Expand Up @@ -616,6 +617,34 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
return int(res.validations.passed)


@validator.command()
@click.option("--duration", "-d", type=options.Duration(), required=True)
# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory)
@click.option(
    "--report-dir",
    "-o",
    default="/tmp",
    type=click.Path(exists=True, dir_okay=True, writable=True),
    show_default=True,
)
@pass_config
def validate_metrics(cfg: config.Validator, duration: datetime.timedelta, report_dir: str):
    # Run the configured validations over the window [now - duration, now],
    # hand the outcome to write_md_report, and terminate via click's Exit:
    # status 0 when the validations passed, status 1 otherwise.
    #
    # NOTE: documented with comments rather than a docstring on purpose --
    # click uses a command's docstring as its --help text, so adding one
    # would change the CLI output.
    out_dir, run_tag = create_report_dir(report_dir)
    result = TestResult(run_tag)

    # The window ends "now" (UTC) and reaches back by the requested duration.
    result.end_time = datetime.datetime.now(tz=datetime.UTC)
    result.start_time = result.end_time - duration

    click.secho(" * Generating build and node info ...", fg="green")
    build_info, node_info = get_build_and_node_info(cfg.prometheus)
    result.build_info = build_info
    result.node_info = node_info

    click.secho(" * Generating spec report ...", fg="green")
    result.host_spec = get_host_spec()

    # No script is executed here; ScriptResult only carries the time window
    # that run_validations receives.
    window = ScriptResult(result.start_time, result.end_time)
    result.validations = run_validations(cfg, window, out_dir)

    click.secho(" * Generating validate metrics report file and dir", fg="green")
    write_md_report(out_dir, result)

    # Always leave through Exit so the process status reflects the outcome.
    exit_code = 0 if result.validations.passed else 1
    raise Exit(exit_code)


def write_json_report(results_dir: str, res: TestResult):
pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')

Expand Down
1 change: 1 addition & 0 deletions e2e/tools/validator/src/validator/validations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
predicted_label=predicted_label,
units=v.get("units", ""),
max_mape=v.get("max_mape"),
max_mae=v.get("max_mae"),
)

return [validation_from_yaml(v) for v in yml["validations"]]
Expand Down
2 changes: 1 addition & 1 deletion e2e/tools/validator/validator.yaml.sample
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,6 @@ prometheus:

url: http://localhost:9090 # Prometheus server URL
rate_interval: 20s # Rate interval for PromQL, default is 20s, typically 4 x $scrape_interval
steps: 3s # Step duration for Prometheus range queries
step: 3s # Step duration for Prometheus range queries

validations_file: ./validations.yaml # Path to the validations file, default is ./validations.yaml
2 changes: 1 addition & 1 deletion manifests/compose/monitoring/prometheus/prometheus.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
global:
scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
scrape_interval: 3s # Set the scrape interval to every 3 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).

Expand Down

0 comments on commit 25ba3bc

Please sign in to comment.