diff --git a/e2e/tools/validator/bm_validations.yaml b/e2e/tools/validator/bm_validations.yaml
new file mode 100644
index 0000000000..d35816f825
--- /dev/null
+++ b/e2e/tools/validator/bm_validations.yaml
@@ -0,0 +1,231 @@
+# metal_job_name: metal
+# scaphandre_job_name: scaphandre
+# node_exporter_job_name: node_exporter
+# remove path if possible
+
+validations:
+  process:
+    # validate process bpf cpu time with node exporter
+    - name: node-cpu-time - kepler-process-bpf-cpu-time
+      mapping:
+        actual: node-cpu-time
+        predicted: kepler-process-bpf-cpu-time
+      units: Milliseconds
+      node-cpu-time: |
+        sum(
+          rate(
+            node_cpu_seconds_total{{
+              cpu="{isolated_cpu}",
+              mode!="idle"
+            }}[{rate_interval}]
+          )
+        ) * 1000
+      kepler-process-bpf-cpu-time: |
+        sum(
+          rate(
+            kepler_process_bpf_cpu_time_ms_total{{
+              job="{metal_job_name}",
+              pid=~"{pids}"
+            }}[{rate_interval}]
+          )
+        )
+
+    # validate kepler bpf cpu time with process exporter (namedgroup must be pid)
+    # include system and user
+    - name: scaph-process-cpu-time - kepler-process-bpf-cpu-time
+      mapping:
+        actual: scaph-process-cpu-time
+        predicted: kepler-process-bpf-cpu-time
+      units: Milliseconds
+      scaph-process-cpu-time: |
+        sum(
+          rate(
+            namedprocess_namegroup_cpu_seconds_total{{
+              groupname=~"{pids}"
+            }}[{rate_interval}]
+          )
+        ) * 1000
+      kepler-process-bpf-cpu-time: |
+        sum(
+          rate(
+            kepler_process_bpf_cpu_time_ms_total{{
+              job="{metal_job_name}",
+              pid=~"{pids}"
+            }}[{rate_interval}]
+          )
+        )
+
+    - name: kepler-process-bpf-cpu-time usage * node-package-power - kepler-process-package-power
+      mapping:
+        actual: kepler-process-cpu-ratio-node-package-power
+        predicted: kepler-process-package-power
+      units: Watts
+      kepler-process-cpu-ratio-node-package-power: |
+        (
+          sum(
+            rate(
+              kepler_process_bpf_cpu_time_ms_total{{
+                job="{metal_job_name}",
+                pid=~"{pids}"
+              }}[{rate_interval}]
+            )
+          ) /
+          sum(
+            rate(
+              kepler_process_bpf_cpu_time_ms_total{{
+                job="{metal_job_name}",
+              }}[{rate_interval}]
+            )
+          )
+        ) *
+        sum(
+          rate(
+            node_rapl_package_joules_total{{
+              path="/host/sys/class/powercap/intel-rapl:0"
+            }}[{rate_interval}]
+          )
+        )
+      kepler-process-package-power: |
+        sum(
+          rate(
+            kepler_process_package_joules_total{{
+              job="{metal_job_name}",
+              pid=~"{pids}"
+            }}[{rate_interval}]
+          )
+        )
+
+    - name: node-exporter-cpu usage * node-package-power - kepler-process-package-power
+      mapping:
+        actual: kepler-process-cpu-ratio-node-package-power
+        predicted: kepler-process-package-power
+      units: Watts
+      kepler-process-cpu-ratio-node-package-power: |
+        (
+          (
+            sum(
+              rate(
+                node_cpu_seconds_total{{
+                  cpu=~"{isolated_cpu}",
+                  mode!="idle"
+                }}[{rate_interval}]
+              )
+            ) * 1000
+          ) /
+          (
+            sum(
+              rate(
+                node_cpu_seconds_total{{
+                  mode!="idle",
+                }}[{rate_interval}]
+              )
+            ) * 1000
+          )
+        ) *
+        sum(
+          rate(
+            node_rapl_package_joules_total{{
+              path="/host/sys/class/powercap/intel-rapl:0"
+            }}[{rate_interval}]
+          )
+        )
+      kepler-process-package-power: |
+        sum(
+          rate(
+            kepler_process_package_joules_total{{
+              job="{metal_job_name}",
+              pid=~"{pids}"
+            }}[{rate_interval}]
+          )
+        )
+
+
+  container:
+    # validate container bpf cpu time with node exporter
+    - name: node-cpu-time - kepler-container-bpf-cpu-time
+      mapping:
+        actual: node-cpu-time
+        predicted: kepler-container-bpf-cpu-time
+      units: Milliseconds
+      node-cpu-time: |
+        sum(
+          rate(
+            node_cpu_seconds_total{{
+              cpu=~"{isolated_cpu}",
+              mode!="idle"
+            }}[{rate_interval}]
+          )
+        ) * 1000
+      kepler-container-bpf-cpu-time: |
+        sum(
+          rate(
+            kepler_container_bpf_cpu_time_ms_total{{
+              job="{metal_job_name}",
+              container_id="{container_id}"
+            }}[{rate_interval}]
+          )
+        )
+
+    - name: kepler-container-bpf-cpu-time usage * node-package-power - kepler-container-package-power
+      mapping:
+        actual: kepler-container-cpu-ratio-node-package-power
+        predicted: kepler-container-package-power
+      units: Watts
+      kepler-container-cpu-ratio-node-package-power: |
+        (
+          sum(
+            rate(
+              kepler_container_bpf_cpu_time_ms_total{{
+                job="{metal_job_name}",
+                container_id="{container_id}"
+              }}[{rate_interval}]
+            )
+          ) /
+          sum(
+            rate(
+              kepler_container_bpf_cpu_time_ms_total{{
+                job="{metal_job_name}",
+              }}[{rate_interval}]
+            )
+          )
+        ) *
+        sum(
+          rate(
+            node_rapl_package_joules_total{{
+              path="/host/sys/class/powercap/intel-rapl:0"
+            }}[{rate_interval}]
+          )
+        )
+      kepler-container-package-power: |
+        sum(
+          rate(
+            kepler_container_package_joules_total{{
+              job="{metal_job_name}",
+              container_id="{container_id}"
+            }}[{rate_interval}]
+          )
+        )
+
+  node:
+    # node level package power comparison
+    - name: node-rapl - kepler-node-package
+      mapping:
+        actual: node-rapl
+        predicted: kepler-node-package
+      units: Watts
+      node-rapl: |
+        sum(
+          rate(
+            node_rapl_package_joules_total{{
+              path="/host/sys/class/powercap/intel-rapl:0"
+            }}[{rate_interval}]
+          )
+        )
+      kepler-node-package: |
+        sum(
+          rate(
+            kepler_node_package_joules_total{{
+              job="{metal_job_name}",
+            }}[{rate_interval}]
+          )
+        )
\ No newline at end of file
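Note on the templates above: they are not raw PromQL. The doubled braces `{{ }}` survive Python's `str.format()` and become the literal selector braces, while the single-brace placeholders (`{metal_job_name}`, `{pids}`, `{isolated_cpu}`, `{container_id}`, `{rate_interval}`) are filled from the `promql_vars` dict built in `validations/__init__.py` further down in this diff. A minimal sketch of that rendering, with made-up values:

```python
# Sketch only: render one template from bm_validations.yaml the way the
# validations loader does, via str.format(). Values below are examples.
template = """sum(
  rate(
    kepler_process_bpf_cpu_time_ms_total{{
      job="{metal_job_name}",
      pid=~"{pids}"
    }}[{rate_interval}]
  )
)"""

promql_vars = {
    "metal_job_name": "metal",
    "pids": "1234|5678",       # PIDs joined with "|" for a regex match
    "rate_interval": "20s",
}

print(template.format(**promql_vars))  # prints a valid PromQL expression
```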
diff --git a/e2e/tools/validator/pyproject.toml b/e2e/tools/validator/pyproject.toml
index eac8d556f7..b7958f8424 100644
--- a/e2e/tools/validator/pyproject.toml
+++ b/e2e/tools/validator/pyproject.toml
@@ -32,10 +32,12 @@ dependencies = [
   "matplotlib",
   "scikit-learn",
   "docker",
+  "psutil",
 ]

 [project.scripts]
 validator = "validator.cli:validator"
+bm_validator = "validator.cli:bm_validator"

 [tool.hatch.version]
 path = "src/validator/__about__.py"
diff --git a/e2e/tools/validator/scripts/targeted_stresser.sh b/e2e/tools/validator/scripts/targeted_stresser.sh
old mode 100644
new mode 100755
index 3ba379f2c8..9093516d3f
--- a/e2e/tools/validator/scripts/targeted_stresser.sh
+++ b/e2e/tools/validator/scripts/targeted_stresser.sh
@@ -64,14 +64,16 @@ main() {

 	start_time=$(date +%s)
 	echo "Stress Start Time: $start_time" >> "$TIME_INTERVAL_LOG"
-
+	local all_cpus
+	all_cpus=$(nproc)
 	for i in $(seq 1 "$iterations"); do
 		echo "Running $i/$iterations"
 		for x in "${load_curve[@]}"; do
 			local load="${x%%:*}"
 			local time="${x##*:}s"
 			if $set_general_mode; then
-				run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
+				# in general mode, stress all available cpus (nproc) instead of "$cpus"
+				run stress-ng --cpu "$all_cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
 			else
 				run taskset -c "$cpu_range" stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
 			fi
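The new `psutil` dependency and the `bm_validator` entry point back the baremetal stress flow below, which needs the PIDs of the stress workers (`relevant_pids` in the CLI). The PR's own collection logic is not shown in this hunk; purely as an illustration, such PIDs could be gathered with psutil roughly like this (hypothetical helper, not the PR's implementation):

```python
# Hypothetical sketch: collect stress-ng worker PIDs with psutil and join
# them into the regex form used by the pid=~"{pids}" matchers above.
import psutil

def stress_ng_pids() -> list[str]:
    pids = []
    for proc in psutil.process_iter(["pid", "name"]):
        if proc.info["name"] == "stress-ng":  # worker process name assumed
            pids.append(str(proc.info["pid"]))
    return pids

print("|".join(stress_ng_pids()))  # e.g. "1234|5678"
```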
diff --git a/e2e/tools/validator/src/validator/cli/__init__.py b/e2e/tools/validator/src/validator/cli/__init__.py
index a1f608ef34..e2fc0a109b 100644
--- a/e2e/tools/validator/src/validator/cli/__init__.py
+++ b/e2e/tools/validator/src/validator/cli/__init__.py
@@ -668,8 +668,8 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
     type=click.Path(exists=True, dir_okay=True, writable=True),
     show_default=True,
 )
-@pass_config
-def regression(
+@pass_bm_config
+def stress(
     cfg: config.BMValidator,
     report_dir: str,
 ):
@@ -681,7 +681,8 @@ def regression(
     click.secho(f"\tresults dir: {results_dir}, tag: {tag}", fg="bright_green")
     res = TestResult(tag)
     res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)
-    res.start_time = datetime.datetime.now()
+    test_start_time = datetime.datetime.now()
+    res.start_time = test_start_time
     click.secho(" * Generating spec report ...", fg="green")
     res.host_spec = get_host_spec()
     validation_results = []
@@ -694,7 +695,7 @@ def regression(
         local_stress_test = local_stress.stress()
         start_time = local_stress_test.start_time
         end_time = local_stress_test.end_time
-
+        print(f"node: start time: {start_time}, end time: {end_time}")
         # sleep a bit for prometheus to finish scrapping
         click.secho(" * Sleeping for 10 seconds ...", fg="green")
         time.sleep(10)
@@ -714,7 +715,7 @@ def regression(
         start_time = process_stress_test.script_result.start_time
         end_time = process_stress_test.script_result.end_time
         relevant_pids = process_stress_test.relevant_pids
-
+        print(f"process: start time: {start_time}, end time: {end_time}")
         # sleep a bit for prometheus to finish scrapping
         click.secho(" * Sleeping for 10 seconds ...", fg="green")
         time.sleep(10)
@@ -722,7 +723,6 @@ def regression(
         prom = PrometheusClient(cfg.prometheus)
         comparator = Comparator(prom)
         validations = BLoader(cfg).load_process_validations(relevant_pids)
-
         validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations])

     if cfg.container:
@@ -734,7 +734,7 @@ def regression(
         start_time = container_stress_test.script_result.start_time
         end_time = container_stress_test.script_result.end_time
         container_id = container_stress_test.container_id
-
+        print(f"container: start time: {start_time}, end time: {end_time}")
         # sleep a bit for prometheus to finish scrapping
         click.secho(" * Sleeping for 10 seconds ...", fg="green")
         time.sleep(10)
@@ -745,9 +745,14 @@ def regression(
         validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations])

-    res.end_time = datetime.datetime.now()
+    test_end_time = datetime.datetime.now()
+    res.end_time = test_end_time

-    res.validations = validation_results
+    res.validations = ValidationResults(
+        started_at=test_start_time,
+        ended_at=test_end_time,
+        results=validation_results
+    )

     write_json_report(results_dir, res)
     write_md_report(results_dir, res)
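`res.validations` now carries a `ValidationResults` object instead of a bare list. Its definition lives elsewhere in the package; judging only from the keyword arguments at the call site above, its shape is presumably close to this sketch (field types assumed):

```python
# Assumed shape of ValidationResults, inferred from the call site above.
from dataclasses import dataclass, field
from datetime import datetime

@dataclass
class ValidationResults:
    started_at: datetime
    ended_at: datetime
    results: list = field(default_factory=list)
```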
diff --git a/e2e/tools/validator/src/validator/prometheus/__init__.py b/e2e/tools/validator/src/validator/prometheus/__init__.py
index df6c8b5f49..6cd87d94f7 100644
--- a/e2e/tools/validator/src/validator/prometheus/__init__.py
+++ b/e2e/tools/validator/src/validator/prometheus/__init__.py
@@ -133,7 +133,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]:
     """

     filtered_a = []
-    filterd_b = []
+    filtered_b = []

     idx_a, idx_b = 0, 0

@@ -144,7 +144,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]:
     while idx_a < len(a.samples) and idx_b < len(b.samples):
         if abs(b.samples[idx_b].timestamp - a.samples[idx_a].timestamp) < scrape_interval:
             filtered_a.append(a.samples[idx_a])
-            filterd_b.append(b.samples[idx_b])
+            filtered_b.append(b.samples[idx_b])
             idx_a += 1
             idx_b += 1
         elif a.samples[idx_a].timestamp < b.samples[idx_b].timestamp:
@@ -154,7 +154,7 @@ def filter_by_equal_timestamps(a: Series, b: Series) -> tuple[Series, Series]:

     return (
         Series.from_samples(a.query, filtered_a, a.labels),
-        Series.from_samples(b.query, filterd_b, b.labels),
+        Series.from_samples(b.query, filtered_b, b.labels),
     )

@@ -208,11 +208,31 @@ def kepler_node_info(self) -> list[str]:
         labels = [r["metric"] for r in resp]
         return [to_metric(b) for b in labels]

+# Add Interface for Comparator
+# class Comparator(ABC):
+#     def single_series(self, query: str, start: datetime, end: datetime) -> Series:
+#         series = self.prom_client.range_query(query, start, end)
+
+#         if len(series) != 1:
+#             raise SeriesError(query, 1, len(series))
+
+#         return series[0]
+
+#     @abstractmethod
+#     def compare(
+#         self,
+#         start: datetime,
+#         end: datetime,
+#         actual_query: str,
+#         predicted_query: str,
+#     ) -> Result:
+#         raise NotImplementedError
+
 class Comparator:
     def __init__(self, client: Queryable):
         self.prom_client = client
-
+
     def single_series(self, query: str, start: datetime, end: datetime) -> Series:
         series = self.prom_client.range_query(query, start, end)
@@ -234,7 +254,6 @@ def compare(
         actual, predicted = filter_by_equal_timestamps(actual_series, predicted_series)
         actual_dropped = len(actual_series.samples) - len(actual.samples)
         predicted_dropped = len(predicted_series.samples) - len(predicted.samples)
-
         return Result(
             mse=mse(actual.values, predicted.values),
             mape=mape(actual.values, predicted.values),
@@ -243,4 +262,4 @@ def compare(
             predicted_series=predicted_series,
             actual_dropped=actual_dropped,
             predicted_dropped=predicted_dropped,
-        )
+        )
\ No newline at end of file
diff --git a/e2e/tools/validator/src/validator/validations/__init__.py b/e2e/tools/validator/src/validator/validations/__init__.py
index 060068d109..bc04683af7 100644
--- a/e2e/tools/validator/src/validator/validations/__init__.py
+++ b/e2e/tools/validator/src/validator/validations/__init__.py
@@ -102,7 +102,7 @@ def _load_base_promql_vars(self) -> dict[str, str]:
         promql_vars = {}
         prom = self.cfg.prometheus
         promql_vars["rate_interval"] = prom.rate_interval
-        promql_vars["job"] = prom.job.metal
+        promql_vars["metal_job_name"] = prom.job.metal
         return promql_vars

     def load_node_validations(self) -> list[Validation]:
@@ -117,8 +117,9 @@ def load_node_validations(self) -> list[Validation]:
     def load_process_validations(self, process_pids: list[str]) -> list[Validation]:
         promql_vars = self._load_base_promql_vars()
         pids = "|".join(map(str, process_pids))
-        pid_label = f'pid=~"{pids}"'
-        promql_vars["target_pids"] = pid_label
+        #pid_label = f'pid=~"{pids}"'
+        promql_vars["pids"] = pids
+        promql_vars["isolated_cpu"] = self.cfg.process.isolated_cpu

         return read_validations(
             self.cfg.validations_file,
@@ -128,7 +129,8 @@ def load_process_validations(self, process_pids: list[str]) -> list[Validation]:

     def load_container_validations(self, container_id: str) -> list[Validation]:
         promql_vars = self._load_base_promql_vars()
-        promql_vars["target_container_id"] = container_id
+        promql_vars["container_id"] = container_id
+        promql_vars["isolated_cpu"] = self.cfg.container.isolated_cpu

         return read_validations(
             self.cfg.validations_file,
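For context on the `filtered_b` rename: `filter_by_equal_timestamps` walks both series with two indices and keeps only sample pairs whose timestamps fall within one scrape interval of each other, dropping unmatched samples. A standalone sketch of the same alignment on plain `(timestamp, value)` tuples, with an assumed 3-second scrape interval:

```python
# Standalone sketch of the two-pointer alignment used by
# filter_by_equal_timestamps, on plain (timestamp, value) tuples.
def align(a, b, scrape_interval=3.0):
    out_a, out_b = [], []
    i = j = 0
    while i < len(a) and j < len(b):
        if abs(b[j][0] - a[i][0]) < scrape_interval:  # timestamps close enough
            out_a.append(a[i])
            out_b.append(b[j])
            i += 1
            j += 1
        elif a[i][0] < b[j][0]:  # a lags behind, skip its sample
            i += 1
        else:                    # b lags behind, skip its sample
            j += 1
    return out_a, out_b

# align([(0, 1.0), (3, 2.0), (9, 4.0)], [(0, 1.1), (3, 2.2), (6, 3.3)])
# keeps the pairs at t=0 and t=3 and drops the unmatched samples.
```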
diff --git a/e2e/tools/validator/validator.bm.yaml b/e2e/tools/validator/validator.bm.yaml
index 6d7b071921..41d61f4337 100644
--- a/e2e/tools/validator/validator.bm.yaml
+++ b/e2e/tools/validator/validator.bm.yaml
@@ -2,8 +2,8 @@ log_level: warn # Logging level, defaults is warn

 prometheus:
   job: metal # Job name for baremetal metrics
-  url: http://localhost:9090 # Prometheus server URL
-  rate_interval: 20s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval
+  url: http://localhost:9091 # Prometheus server URL
+  rate_interval: 21s # Rate interval for Promql, default is 20s, typically 4 x $scrape_interval
   step: 3s # Step duration for Prometheus range queries

 config: # default settings
@@ -15,11 +15,15 @@ config: # default settings
   #node: {} or node: or node: ~
   node:
-
+    load_curve: "0:20,50:20,100:20,50:20,0:20"

   process:
-    iterations: "2"
+    #iterations: "2"
+    isolated_cpu: "5"
+    load_curve: "0:20,50:20,100:20,50:20,0:20"

   container:
-    iterations: "2"
+    #iterations: "2"
+    isolated_cpu: "5"
+    load_curve: "0:20,50:20,100:20,50:20,0:20"

 validations_file: ./bm_validations.yaml # Path to the validations file, default is ./bm_validations.yaml
\ No newline at end of file
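Each `load_curve` value is a comma-separated list of `<cpu-load-%>:<seconds>` steps; `targeted_stresser.sh` splits each step with `${x%%:*}` / `${x##*:}` before invoking stress-ng. The same parsing, sketched in Python with the curve from the config above:

```python
# Sketch: expand the load_curve from validator.bm.yaml into the
# (cpu-load %, duration) steps that the stresser script iterates over.
load_curve = "0:20,50:20,100:20,50:20,0:20"
steps = [tuple(map(int, step.split(":"))) for step in load_curve.split(",")]
for load, seconds in steps:
    print(f"stress-ng --cpu-load {load} --timeout {seconds}s")
```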