From 28889fe9e786ab52b285bb9de1ae5e62ae832baf Mon Sep 17 00:00:00 2001
From: vprashar2929 <vibhu.sharma2929@gmail.com>
Date: Mon, 16 Dec 2024 18:10:17 +0530
Subject: [PATCH] feat(validator): add support to validate essential metrics
 produced by Kepler

This commit introduces functionality to validate essential metrics produced by Kepler.
The following comparisons are included:

- Node Exporter Comparison
   - Validates `node_rapl_<package|core|dram>` metrics against `kepler_node_<package|core|dram>{dev}`

- Kepler Process Comparison
   - Compares `kepler_process_<package|core|dram|platform|other|uncore>{latest}` metrics to
      `kepler_process_<package|core|dram|platform|other|uncore>{dev}`

- Kepler Node Comparison
   - Validates `kepler_node_<package|core|dram|platform|other|uncore>{latest}` against
      `kepler_node_<package|core|dram|platform|other|uncore>{dev}`

Additionally, the following changes are made to existing functionality:

- Adds a new `metric_validations.yaml` file which includes promql queries for comparisons along with threshold values
- Updates the existing `stressor.sh` script to support a few more parameters, making it more flexible:
  - warmup time: time to wait before starting the stressor
  - cooldown time: time to wait after each stressor run finishes
  - repeats: number of times to repeat the stressor, configurable because
    the regression test does not need to repeat the stressor multiple times
- Adds a new `validator-regression.yaml` file which includes the configuration for the regression test

Signed-off-by: vprashar2929 <vibhu.sharma2929@gmail.com>
---
 e2e/tools/validator/metric_validations.yaml   | 354 ++++++++++++++++++
 e2e/tools/validator/scripts/stressor.sh       |  32 +-
 .../validator/src/validator/cli/__init__.py   |  58 ++-
 .../validator/src/validator/cli/options.py    |   2 +-
 .../src/validator/config/__init__.py          |  10 +-
 .../src/validator/stresser/__init__.py        |  48 +++
 .../src/validator/validations/__init__.py     |   1 +
 e2e/tools/validator/validator-regression.yaml |  30 ++
 .../monitoring/prometheus/prometheus.yml      |   2 +-
 9 files changed, 524 insertions(+), 13 deletions(-)
 create mode 100644 e2e/tools/validator/metric_validations.yaml
 create mode 100644 e2e/tools/validator/validator-regression.yaml

diff --git a/e2e/tools/validator/metric_validations.yaml b/e2e/tools/validator/metric_validations.yaml
new file mode 100644
index 0000000000..6c1cae6e92
--- /dev/null
+++ b/e2e/tools/validator/metric_validations.yaml
@@ -0,0 +1,354 @@
+config:
+  mapping:
+    actual: latest
+    predicted: dev
+
+validations:
+  # node rapl comparison
+  - name: node-rapl - kepler-package
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-package
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_package_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-package: |
+      sum(
+        rate(
+          kepler_node_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.01
+
+  - name: node-rapl - kepler-core
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-core
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_core_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-core: |
+      sum(
+        rate(
+          kepler_node_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.01
+
+  - name: node-rapl - kepler-dram
+    units: Watts
+    mapping:
+      actual: node-rapl
+      predicted: kepler-dram
+
+    node-rapl: |
+      sum(
+        rate(
+          node_rapl_dram_joules_total[{rate_interval}]
+        )
+      )
+
+    kepler-dram: |
+      sum(
+        rate(
+          kepler_node_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 1.01
+
+  # absolute power comparison
+  - name: Total - absolute
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_joules_total{{
+            job="latest",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_joules_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  # CPU time comparison
+  - name: cpu-time
+    units: Milliseconds
+    latest: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="latest"
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_bpf_cpu_time_ms_total{{
+            job="dev",
+          }}[{rate_interval}]
+        )
+      )
+    # max_mae: 20.0
+
+  # process comparison
+  - name: platform - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_platform_joules_total{{
+            job="latest", mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_platform_joules_total{{
+            job="dev", mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  - name: package - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_package_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  - name: core - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_core_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  - name: dram - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_dram_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  - name: other - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_other_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  - name: uncore - dynamic
+    units: Watts
+    latest: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="latest",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    dev: |
+      sum(
+        rate(
+          kepler_process_uncore_joules_total{{
+            job="dev",
+            mode="dynamic",
+          }}[{rate_interval}]
+        )
+      )
+    max_mae: 2.01
+
+  # node comparison
+  - name: node platform - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_platform_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_platform_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
+
+  - name: node package - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_package_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_package_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
+
+  - name: node core - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_core_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_core_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
+
+  - name: node dram - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_dram_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_dram_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
+
+  - name: node other - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_other_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_other_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
+
+  - name: node uncore - dynamic
+    units: Watts
+    latest: |
+      rate(kepler_node_uncore_joules_total{{
+          job="latest",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+
+    dev: |
+      rate(kepler_node_uncore_joules_total{{
+          job="dev",
+          mode="dynamic",
+        }}[{rate_interval}]
+      )
+    max_mae: 2.01
diff --git a/e2e/tools/validator/scripts/stressor.sh b/e2e/tools/validator/scripts/stressor.sh
index 2058caaf2b..fee475dacc 100755
--- a/e2e/tools/validator/scripts/stressor.sh
+++ b/e2e/tools/validator/scripts/stressor.sh
@@ -47,21 +47,32 @@ main() {
 	local total_time=0
 	local repeats=5
 	local curve_type="default"
+	local cooldown_time=5
+	local warmup_time=5
 
-	while getopts "t:r:c:" opt; do
+	while getopts "t:r:c:d:w:" opt; do
 		case $opt in
-			t) total_time=$OPTARG ;;
-			c) curve_type=$OPTARG ;;
-			*) echo "Usage: $0 [-t total_time_in_seconds] [-c curve_type(default|stepwise)]" >&2; exit 1 ;;
+		t) total_time=$OPTARG ;;
+		c) curve_type=$OPTARG ;;
+		r) repeats=$OPTARG ;;
+		w) warmup_time=$OPTARG ;;
+		d) cooldown_time=$OPTARG ;;
+		*) # unknown flag or missing argument: print usage to stderr and fail
+			echo "Usage: $0 [-t total_time_in_seconds] [-w warmup_time_in_seconds] [-d cooldown_time_in_seconds] [-r repeats] [-c curve_type(default|stepwise)]" >&2
+			exit 1
+			;;
 		esac
 	done
 
 	# Select load curve based on curve_type
 	local -a load_curve
 	case $curve_type in
-		"default") load_curve=("${load_curve_default[@]}") ;;
-		"stepwise") load_curve=("${load_curve_stepwise[@]}") ;;
-		*) echo "Invalid curve type. Use 'default' or 'stepwise'" >&2; exit 1 ;;
+	"default") load_curve=("${load_curve_default[@]}") ;;
+	"stepwise") load_curve=("${load_curve_stepwise[@]}") ;;
+	*)
+		echo "Invalid curve type. Use 'default' or 'stepwise'" >&2
+		exit 1
+		;;
 	esac
 
 	local cpus
@@ -81,9 +92,9 @@ main() {
 
 	echo "Total time: $total_time seconds, Repeats: $repeats, Curve type: $curve_type"
 
-	# sleep 5 so that first run and the second run look the same
+	# sleep so that first run and the second run look the same
 	echo "Warmup .."
-	run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout 5
+	run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout "$warmup_time"
 
 	for i in $(seq 1 "$repeats"); do
 		echo "Running: $i/$repeats"
@@ -92,6 +103,9 @@ main() {
 			local time="${x##*:}s"
 			run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
 		done
+		# sleep so that the next run looks the same
+		echo "Cooldown .."
+		run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load 0 --timeout "$cooldown_time"
 	done
 }
 
diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py
index 9dd3cbc95e..8c78440a51 100644
--- a/e2e/tools/validator/src/validator/cli/__init__.py
+++ b/e2e/tools/validator/src/validator/cli/__init__.py
@@ -26,7 +26,7 @@
 from validator.prometheus import Comparator, PrometheusClient, Series, ValueOrError
 from validator.report import CustomEncoder, JsonTemplate
 from validator.specs import MachineSpec, get_host_spec, get_vm_spec
-from validator.stresser import Remote, ScriptResult
+from validator.stresser import Local, Remote, ScriptResult
 from validator.validations import Loader, QueryTemplate, Validation
 
 logger = logging.getLogger(__name__)
@@ -600,6 +600,9 @@ def run_validation(
 )
 @pass_config
 def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_dir: str) -> None:
+    """
+    Run Kepler ACPI validation test
+    """
     results_dir, tag = create_report_dir(report_dir)
     res = TestResult(tag)
 
@@ -621,6 +624,59 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
     raise Exit(1) if not res.validations.passed else Exit(0)
 
 
+@validator.command()
+@click.option(
+    "--script-path",
+    "-s",
+    default="./scripts/stressor.sh",
+    type=click.Path(exists=True),
+    show_default=True,
+)
+# ruff: noqa: S108 (Suppressed as we are intentionally using `/tmp` as reporting directory)
+@click.option(
+    "--report-dir",
+    "-o",
+    default="/tmp",
+    type=click.Path(exists=True, dir_okay=True, writable=True),
+    show_default=True,
+)
+@pass_config
+def regression(
+    cfg: config.Validator,
+    script_path: str,
+    report_dir: str,
+):
+    """
+    Run Kepler regression test
+    """
+    results_dir, tag = create_report_dir(report_dir)
+    res = TestResult(tag)
+    click.secho("  * Generating build and node info ...", fg="green")
+    res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)
+    click.secho("  * Generating spec report ...", fg="green")
+    res.host_spec = get_host_spec()
+    local = Local()
+    warmup_seconds = cfg.stressor.warmup_seconds
+    cooldown_seconds = cfg.stressor.cooldown_seconds
+    curve_type = cfg.stressor.curve_type
+    repeats = cfg.stressor.repeats
+    stress_test = local.run_script(
+        script_path=script_path, c=curve_type, w=warmup_seconds, d=cooldown_seconds, r=repeats
+    )
+    res.start_time = stress_test.start_time
+    res.end_time = stress_test.end_time
+
+    # sleep a bit for prometheus to finish scraping
+    click.secho("  * Sleeping for 10 seconds ...", fg="green")
+    time.sleep(10)
+
+    res.validations = run_validations(cfg, stress_test, results_dir)
+    click.secho("  * Generating validate metrics report file and dir", fg="green")
+    write_md_report(results_dir, res)
+
+    raise Exit(1) if not res.validations.passed else Exit(0)
+
+
 def write_json_report(results_dir: str, res: TestResult):
     pattern = re.compile(r'[{]?(\w+)=("[^"]*"|[^,]+)[},]?')
 
diff --git a/e2e/tools/validator/src/validator/cli/options.py b/e2e/tools/validator/src/validator/cli/options.py
index 27068afa20..1091d7d6d3 100644
--- a/e2e/tools/validator/src/validator/cli/options.py
+++ b/e2e/tools/validator/src/validator/cli/options.py
@@ -32,7 +32,7 @@ class Duration(click.ParamType):
     def convert(self, value, param, ctx):
         td = parse_timedelta("now", value)
         if not td:
-            self.self.fail(
+            self.fail(
                 "Expected duration format got " f"{value:r}",
                 param,
                 ctx,
diff --git a/e2e/tools/validator/src/validator/config/__init__.py b/e2e/tools/validator/src/validator/config/__init__.py
index 31fed7b14b..7c62c5d6c4 100644
--- a/e2e/tools/validator/src/validator/config/__init__.py
+++ b/e2e/tools/validator/src/validator/config/__init__.py
@@ -43,6 +43,9 @@ class Prometheus(NamedTuple):
 class Stressor(NamedTuple):
     total_runtime_seconds: int
     curve_type: str
+    repeats: int
+    warmup_seconds: int
+    cooldown_seconds: int
 
 
 class Validator(NamedTuple):
@@ -113,11 +116,16 @@ def load(config_file: str) -> Validator:
 
     stressor_config = config["stressor"]
     if not stressor_config:
-        stressor = Stressor(total_runtime_seconds=1200, curve_type="default")
+        stressor = Stressor(
+            total_runtime_seconds=1200, curve_type="default", repeats=5, warmup_seconds=5, cooldown_seconds=5
+        )
     else:
         stressor = Stressor(
             total_runtime_seconds=stressor_config.get("total_runtime_seconds", 1200),
             curve_type=stressor_config.get("curve_type", "default"),
+            repeats=stressor_config.get("repeats", 5),
+            warmup_seconds=stressor_config.get("warmup_seconds", 5),
+            cooldown_seconds=stressor_config.get("cooldown_seconds", 5),
         )
 
     validations_file = config.get("validations_file", "validations.yaml")
diff --git a/e2e/tools/validator/src/validator/stresser/__init__.py b/e2e/tools/validator/src/validator/stresser/__init__.py
index 01afec1181..5cdf09a2de 100644
--- a/e2e/tools/validator/src/validator/stresser/__init__.py
+++ b/e2e/tools/validator/src/validator/stresser/__init__.py
@@ -1,4 +1,7 @@
 import logging
+import os
+import shutil
+import subprocess
 from datetime import datetime
 from typing import NamedTuple
 
@@ -20,6 +23,51 @@ class RunResult(NamedTuple):
     exit_code: int
 
 
+class Local:
+    def copy(self, script_path, target_script):
+        logger.info("copying script %s - %s", script_path, target_script)
+        shutil.copy(script_path, target_script)
+        os.chmod(target_script, 0o700)
+        logger.info("copying script %s - %s - successful", script_path, target_script)
+
+    def run_script(self, script_path: str, **kwargs) -> ScriptResult:
+        """Copy `script_path` to /tmp/regression-stress.sh, run it locally and time the run.
+
+        Each keyword argument `k=v` is passed to the script as `-k v`.
+        A non-zero exit code is only logged; the timings are always returned.
+        """
+        logger.info("Running script %s ...", script_path)
+        # One argv entry per flag and per value, so a value containing
+        # whitespace reaches the script as a single argument.
+        cli_options = [arg for k, v in kwargs.items() for arg in (f"-{k}", str(v))]
+        # ruff: noqa: S108 (Suppressed hard-coded path because we want to intentionally copy stress.sh inside `/tmp` dir)
+        target_script = "/tmp/regression-stress.sh"
+        self.copy(script_path, target_script)
+
+        command = [target_script, *cli_options]
+        logger.info("Running command %s ...", command)
+        # ruff: noqa: DTZ005 (Suppressed non-time-zone aware object creation as it is not necessary for this use case)
+        start_time = datetime.now()
+        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        stdout, stderr = process.communicate()
+        end_time = datetime.now()
+
+        print("stdout output:")
+        for line in stdout.decode().splitlines():
+            print(" ┊ ", line)
+
+        print("\nstderr output:")
+        for line in stderr.decode().splitlines():
+            print(" ┊ ", line)
+        print("‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾\n\n")
+
+        if process.returncode != 0:
+            logger.warning("script execution failed")
+        else:
+            logger.info("script execution successful")
+        return ScriptResult(start_time=start_time, end_time=end_time)
+
+
 class Remote:
     def __init__(self, config: config.Remote):
         self.host = config.host
diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py
index 21ffe8de01..d5d0d309e4 100644
--- a/e2e/tools/validator/src/validator/validations/__init__.py
+++ b/e2e/tools/validator/src/validator/validations/__init__.py
@@ -88,6 +88,7 @@ def validation_from_yaml(v: dict[str, Any]) -> Validation:
                 predicted_label=predicted_label,
                 units=v.get("units", ""),
                 max_mape=v.get("max_mape"),
+                max_mae=v.get("max_mae"),
             )
 
         return [validation_from_yaml(v) for v in yml["validations"]]
diff --git a/e2e/tools/validator/validator-regression.yaml b/e2e/tools/validator/validator-regression.yaml
new file mode 100644
index 0000000000..c49af28c40
--- /dev/null
+++ b/e2e/tools/validator/validator-regression.yaml
@@ -0,0 +1,30 @@
+log_level: info # Logging level, default is warn
+
+remote:
+  host: 192.168.1.1 # IP address or hostname of the VM
+  port: 22 # SSH port, default is 22
+  username: user # SSH username
+  password: yourpassword # SSH password
+  pkey: ~/.ssh/id_rsa # Path to SSH private key
+
+metal:
+  vm:
+    pid: 123456 # Process ID for the KVM process running on metal
+
+prometheus:
+  job:
+    vm: vm  # Job name for virtual machine metrics, default is vm
+    metal: metal  # Job name for metal metrics, default is metal
+
+  url: http://localhost:9090 # Prometheus server URL
+  rate_interval: 60s  # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval
+  step: 3s  # Step duration for Prometheus range queries
+
+stressor:
+  total_runtime_seconds: 1200
+  curve_type: default
+  warmup_seconds: 5
+  repeats: 1
+  cooldown_seconds: 60
+
+validations_file: ./metric_validations.yaml  # Path to the validations file, default is ./validations.yaml
diff --git a/manifests/compose/monitoring/prometheus/prometheus.yml b/manifests/compose/monitoring/prometheus/prometheus.yml
index 93456a0e36..ad7df712f1 100644
--- a/manifests/compose/monitoring/prometheus/prometheus.yml
+++ b/manifests/compose/monitoring/prometheus/prometheus.yml
@@ -1,5 +1,5 @@
 global:
-  scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute.
+  scrape_interval: 3s # Set the scrape interval to every 3 seconds. Default is every 1 minute.
   evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
   # scrape_timeout is set to the global default (10s).