From 28889fe9e786ab52b285bb9de1ae5e62ae832baf Mon Sep 17 00:00:00 2001 From: vprashar2929 Date: Mon, 16 Dec 2024 18:10:17 +0530 Subject: [PATCH] feat(validator): add support to validate essential metrics produced by Kepler This commit introduces functionality to validate essential metrics produced by Kepler. The following comparisons are included: - Node Exporter Comparison - Validates `node_rapl_` metrics against `kepler_node_{dev}` - Kepler Process Comparison - Compares `kepler_process_{latest}` metrics to `kepler_process_{dev}` - Kepler Node Comparison - Validates `kepler_node_{latest}` against `kepler_node_{dev}` Additionally, the following changes are made to existing functionality: - Adds a new `metric_validations.yaml` file which includes PromQL queries for comparisons along with threshold values - Updates the existing `stressor.sh` script to support a few more parameters to make it more flexible - warmup time: time to wait before starting the stressor - cooldown time: time to wait after the stressor is finished - repeats: number of times to repeat the stressor. 
Since for regression test we don't want to repeat the stressor multiple times - Adds a new `validator-regression.yaml` file which includes the configuration for the regression test Signed-off-by: vprashar2929 --- e2e/tools/validator/metric_validations.yaml | 354 ++++++++++++++++++ e2e/tools/validator/scripts/stressor.sh | 32 +- .../validator/src/validator/cli/__init__.py | 58 ++- .../validator/src/validator/cli/options.py | 2 +- .../src/validator/config/__init__.py | 10 +- .../src/validator/stresser/__init__.py | 48 +++ .../src/validator/validations/__init__.py | 1 + e2e/tools/validator/validator-regression.yaml | 30 ++ .../monitoring/prometheus/prometheus.yml | 2 +- 9 files changed, 524 insertions(+), 13 deletions(-) create mode 100644 e2e/tools/validator/metric_validations.yaml create mode 100644 e2e/tools/validator/validator-regression.yaml diff --git a/e2e/tools/validator/metric_validations.yaml b/e2e/tools/validator/metric_validations.yaml new file mode 100644 index 0000000000..6c1cae6e92 --- /dev/null +++ b/e2e/tools/validator/metric_validations.yaml @@ -0,0 +1,354 @@ +config: + mapping: + actual: latest + predicted: dev + +validations: + # node rapl comparison + - name: node-rapl - kepler-package + units: Watts + mapping: + actual: node-rapl + predicted: kepler-package + + node-rapl: | + sum( + rate( + node_rapl_package_joules_total[{rate_interval}] + ) + ) + + kepler-package: | + sum( + rate( + kepler_node_package_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 1.01 + + - name: node-rapl - kepler-core + units: Watts + mapping: + actual: node-rapl + predicted: kepler-core + + node-rapl: | + sum( + rate( + node_rapl_core_joules_total[{rate_interval}] + ) + ) + + kepler-core: | + sum( + rate( + kepler_node_core_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 1.01 + + - name: node-rapl - kepler-dram + units: Watts + mapping: + actual: node-rapl + predicted: kepler-dram + + node-rapl: 
| + sum( + rate( + node_rapl_dram_joules_total[{rate_interval}] + ) + ) + + kepler-dram: | + sum( + rate( + kepler_node_dram_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 1.01 + + # absolute power comparison + - name: Total - absolute + latest: | + sum( + rate( + kepler_process_joules_total{{ + job="latest", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_joules_total{{ + job="dev", + }}[{rate_interval}] + ) + ) + + max_mae: 2.01 + + # CPU time comparison + - name: cpu-time + units: Milliseconds + latest: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="latest" + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_bpf_cpu_time_ms_total{{ + job="dev", + }}[{rate_interval}] + ) + ) + # max_mae: 20.0 + + # process comparison + - name: platform - dynamic + latest: | + sum( + rate( + kepler_process_platform_joules_total{{ + job="latest", mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_platform_joules_total{{ + job="dev", mode="dynamic", + }}[{rate_interval}] + ) + ) + + max_mae: 2.01 + + - name: package - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_package_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_package_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.01 + + - name: core - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_core_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_core_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.01 + + - name: dram - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_dram_joules_total{{ + job="dev", 
+ mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.01 + + - name: other - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_other_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_other_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.01 + + - name: uncore - dynamic + units: Watts + latest: | + sum( + rate( + kepler_process_uncore_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + ) + dev: | + sum( + rate( + kepler_process_uncore_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + ) + max_mae: 2.01 + + # node comparison + - name: node platform - dynamic + units: Watts + latest: | + rate(kepler_node_platform_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_platform_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 + + - name: node package - dynamic + units: Watts + latest: | + rate(kepler_node_package_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_package_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 + + - name: node core - dynamic + units: Watts + latest: | + rate(kepler_node_core_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_core_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 + + - name: node dram - dynamic + units: Watts + latest: | + rate(kepler_node_dram_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_dram_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 + + - name: node other - dynamic + units: Watts + latest: | + rate(kepler_node_other_joules_total{{ + job="latest", + mode="dynamic", + 
}}[{rate_interval}] + ) + + dev: | + rate(kepler_node_other_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 + + - name: node uncore - dynamic + units: Watts + latest: | + rate(kepler_node_uncore_joules_total{{ + job="latest", + mode="dynamic", + }}[{rate_interval}] + ) + + dev: | + rate(kepler_node_uncore_joules_total{{ + job="dev", + mode="dynamic", + }}[{rate_interval}] + ) + max_mae: 2.01 diff --git a/e2e/tools/validator/scripts/stressor.sh b/e2e/tools/validator/scripts/stressor.sh index 2058caaf2b..fee475dacc 100755 --- a/e2e/tools/validator/scripts/stressor.sh +++ b/e2e/tools/validator/scripts/stressor.sh @@ -47,21 +47,32 @@ main() { local total_time=0 local repeats=5 local curve_type="default" + local cooldown_time=5 + local warmup_time=5 - while getopts "t:r:c:" opt; do + while getopts "t:r:c:d:w:" opt; do case $opt in - t) total_time=$OPTARG ;; - c) curve_type=$OPTARG ;; - *) echo "Usage: $0 [-t total_time_in_seconds] [-c curve_type(default|stepwise)]" >&2; exit 1 ;; + t) total_time=$OPTARG ;; + c) curve_type=$OPTARG ;; + r) repeats=$OPTARG ;; + w) warmup_time=$OPTARG ;; + d) cooldown_time=$OPTARG ;; + *) + echo "Usage: $0 [-t total_time_in_seconds] [-w warmup_time_in_seconds] [-c cooldown_time_in_seconds] [-r repeats] [-c curve_type]" + exit 1 + ;; esac done # Select load curve based on curve_type local -a load_curve case $curve_type in - "default") load_curve=("${load_curve_default[@]}") ;; - "stepwise") load_curve=("${load_curve_stepwise[@]}") ;; - *) echo "Invalid curve type. Use 'default' or 'stepwise'" >&2; exit 1 ;; + "default") load_curve=("${load_curve_default[@]}") ;; + "stepwise") load_curve=("${load_curve_stepwise[@]}") ;; + *) + echo "Invalid curve type. 
Use 'default' or 'stepwise'" >&2 + exit 1 + ;; esac local cpus @@ -81,9 +92,9 @@ main() { echo "Total time: $total_time seconds, Repeats: $repeats, Curve type: $curve_type" - # sleep 5 so that first run and the second run look the same + # sleep so that first run and the second run look the same echo "Warmup .." - run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout 5 + run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout "$warmup_time" for i in $(seq 1 "$repeats"); do echo "Running: $i/$repeats" @@ -92,6 +103,9 @@ main() { local time="${x##*:}s" run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time" done + # sleep so that the next run looks the same + echo "Cooldown .." + run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout "$cooldown_time" done } diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py index 9dd3cbc95e..8c78440a51 100644 --- a/e2e/tools/validator/src/validator/cli/__init__.py +++ b/e2e/tools/validator/src/validator/cli/__init__.py @@ -26,7 +26,7 @@ from validator.prometheus import Comparator, PrometheusClient, Series, ValueOrError from validator.report import CustomEncoder, JsonTemplate from validator.specs import MachineSpec, get_host_spec, get_vm_spec -from validator.stresser import Remote, ScriptResult +from validator.stresser import Local, Remote, ScriptResult from validator.validations import Loader, QueryTemplate, Validation logger = logging.getLogger(__name__) @@ -600,6 +600,9 @@ def run_validation( ) @pass_config def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_dir: str) -> None: + """ + Run Kepler ACPI validation test + """ results_dir, tag = create_report_dir(report_dir) res = TestResult(tag) @@ -621,6 +624,59 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di raise Exit(1) if not res.validations.passed else Exit(0) 
+@validator.command() +@click.option( + "--script-path", + "-s", + default="./scripts/stressor.sh", + type=click.Path(exists=True), + show_default=True, +) +# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory) +@click.option( + "--report-dir", + "-o", + default="/tmp", + type=click.Path(exists=True, dir_okay=True, writable=True), + show_default=True, +) +@pass_config +def regression( + cfg: config.Validator, + script_path: str, + report_dir: str, +): + """ + Run Kepler regression test + """ + results_dir, tag = create_report_dir(report_dir) + res = TestResult(tag) + click.secho(" * Generating build and node info ...", fg="green") + res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus) + click.secho(" * Generating spec report ...", fg="green") + res.host_spec = get_host_spec() + local = Local() + warmup_seconds = cfg.stressor.warmup_seconds + cooldown_seconds = cfg.stressor.cooldown_seconds + curve_type = cfg.stressor.curve_type + repeats = cfg.stressor.repeats + stress_test = local.run_script( + script_path=script_path, c=curve_type, w=warmup_seconds, d=cooldown_seconds, r=repeats + ) + res.start_time = stress_test.start_time + res.end_time = stress_test.end_time + + # sleep a bit for prometheus to finish scraping + click.secho(" * Sleeping for 10 seconds ...", fg="green") + time.sleep(10) + + res.validations = run_validations(cfg, stress_test, results_dir) + click.secho(" * Generating validate metrics report file and dir", fg="green") + write_md_report(results_dir, res) + + raise Exit(1) if not res.validations.passed else Exit(0) + + def write_json_report(results_dir: str, res: TestResult): pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?') diff --git a/e2e/tools/validator/src/validator/cli/options.py b/e2e/tools/validator/src/validator/cli/options.py index 27068afa20..1091d7d6d3 100644 --- a/e2e/tools/validator/src/validator/cli/options.py +++ b/e2e/tools/validator/src/validator/cli/options.py @@ 
-32,7 +32,7 @@ class Duration(click.ParamType): def convert(self, value, param, ctx): td = parse_timedelta("now", value) if not td: - self.self.fail( + self.fail( "Expected duration format got " f"{value:r}", param, ctx, diff --git a/e2e/tools/validator/src/validator/config/__init__.py b/e2e/tools/validator/src/validator/config/__init__.py index 31fed7b14b..7c62c5d6c4 100644 --- a/e2e/tools/validator/src/validator/config/__init__.py +++ b/e2e/tools/validator/src/validator/config/__init__.py @@ -43,6 +43,9 @@ class Prometheus(NamedTuple): class Stressor(NamedTuple): total_runtime_seconds: int curve_type: str + repeats: int + warmup_seconds: int + cooldown_seconds: int class Validator(NamedTuple): @@ -113,11 +116,16 @@ def load(config_file: str) -> Validator: stressor_config = config["stressor"] if not stressor_config: - stressor = Stressor(total_runtime_seconds=1200, curve_type="default") + stressor = Stressor( + total_runtime_seconds=1200, curve_type="default", repeats=5, warmup_seconds=5, cooldown_seconds=5 + ) else: stressor = Stressor( total_runtime_seconds=stressor_config.get("total_runtime_seconds", 1200), curve_type=stressor_config.get("curve_type", "default"), + repeats=stressor_config.get("repeats", 5), + warmup_seconds=stressor_config.get("warmup_seconds", 5), + cooldown_seconds=stressor_config.get("cooldown_seconds", 5), ) validations_file = config.get("validations_file", "validations.yaml") diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py index 01afec1181..5cdf09a2de 100644 --- a/e2e/tools/validator/src/validator/stresser/__init__.py +++ b/e2e/tools/validator/src/validator/stresser/__init__.py @@ -1,4 +1,7 @@ import logging +import os +import shutil +import subprocess from datetime import datetime from typing import NamedTuple @@ -20,6 +23,51 @@ class RunResult(NamedTuple): exit_code: int +class Local: + def copy(self, script_path, target_script): + logger.info("copying script %s 
- %s", script_path, target_script) + shutil.copy(script_path, target_script) + os.chmod(target_script, 0o700) + logger.info("copying script %s - %s - successful", script_path, target_script) + + def run_script(self, script_path: str, **kwargs) -> ScriptResult: + logger.info("Running script %s ...", script_path) + # Prepare CLI options + cli_options = " ".join([f"-{k} {v}" for k, v in kwargs.items()]) if kwargs else "" + # ruff: noqa: S108 (Suppressed hard-coded path because we want to intentionally copy stress.sh inside `/tmp` dir) + target_script = "/tmp/regression-stress.sh" + self.copy(script_path, target_script) + + command = [target_script, *cli_options.split()] + logger.info("Running command %s ...", command) + # ruff: noqa: DTZ005 (Suppressed non-time-zone aware object creation as it is not necessary for this use case) + start_time = datetime.now() + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + end_time = datetime.now() + + # Output stdout + print("stdout output:") + for line in stdout.decode().splitlines(): + print(" ┊ ", line) + + # Output stderr + print("\nstderr output:") + for line in stderr.decode().splitlines(): + print(" ┊ ", line) + print("‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾\n\n") + + if process.returncode != 0: + logger.warning("script execution failed") + else: + logger.info("script execution successful") + + return ScriptResult( + start_time=start_time, + end_time=end_time, + ) + + class Remote: def __init__(self, config: config.Remote): self.host = config.host diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py index 21ffe8de01..d5d0d309e4 100644 --- a/e2e/tools/validator/src/validator/validations/__init__.py +++ b/e2e/tools/validator/src/validator/validations/__init__.py @@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation: 
predicted_label=predicted_label, units=v.get("units", ""), max_mape=v.get("max_mape"), + max_mae=v.get("max_mae"), ) return [validation_from_yaml(v) for v in yml["validations"]] diff --git a/e2e/tools/validator/validator-regression.yaml b/e2e/tools/validator/validator-regression.yaml new file mode 100644 index 0000000000..c49af28c40 --- /dev/null +++ b/e2e/tools/validator/validator-regression.yaml @@ -0,0 +1,30 @@ +log_level: info # Logging level, default is warn + +remote: + host: 192.168.1.1 # IP address or hostname of the VM + port: 22 # SSH port, default is 22 + username: user # SSH username + password: yourpassword # SSH password + pkey: ~/.ssh/id_rsa # Path to SSH private key + +metal: + vm: + pid: 123456 # Process ID for the KVM process running on metal + +prometheus: + job: + vm: vm # Job name for virtual machine metrics, default is vm + metal: metal # Job name for metal metrics, default is metal + + url: http://localhost:9090 # Prometheus server URL + rate_interval: 60s # Rate interval for PromQL, default is 20s, typically 4 x $scrape_interval + step: 3s # Step duration for Prometheus range queries + +stressor: + total_runtime_seconds: 1200 + curve_type: default + warmup_seconds: 5 + repeats: 1 + cooldown_seconds: 60 + +validations_file: ./metric_validations.yaml # Path to the validations file, default is ./validations.yaml diff --git a/manifests/compose/monitoring/prometheus/prometheus.yml b/manifests/compose/monitoring/prometheus/prometheus.yml index 93456a0e36..ad7df712f1 100644 --- a/manifests/compose/monitoring/prometheus/prometheus.yml +++ b/manifests/compose/monitoring/prometheus/prometheus.yml @@ -1,5 +1,5 @@ global: - scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute. + scrape_interval: 3s # Set the scrape interval to every 3 seconds. Default is every 1 minute. evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s).