Skip to content

Commit

Permalink
feat(baremetal-validation): Add relevant config files
Browse files Browse the repository at this point in the history
Added relevant config files for baremetal validation including
formatted prom validation metrics and baremetal configuration.

Signed-off-by: Kaiyi <[email protected]>
  • Loading branch information
KaiyiLiu1234 committed Dec 11, 2024
1 parent 15106d7 commit 0e2bf49
Show file tree
Hide file tree
Showing 7 changed files with 291 additions and 26 deletions.
231 changes: 231 additions & 0 deletions e2e/tools/validator/bm_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
# metal_job_name: metal
# scaphandre_job_name: scaphandre
# node_exporter_job_name: node_exporter
# remove path if possible

validations:
process:
# validate process bpf cpu time with node exporter
- name: node-cpu-time - kepler-process-bpf-cpu-time
mapping:
actual: node-cpu-time
predicted: kepler-process-bpf-cpu-time
units: Milliseconds
node-cpu-time: |
sum(
rate(
node_cpu_seconds_total{{
cpu="{isolated_cpu}",

Check failure on line 18 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

18:36 [trailing-spaces] trailing spaces
mode!="idle"
}}[{rate_interval}]
)
) * 1000
kepler-process-bpf-cpu-time: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)

Check failure on line 32 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

32:1 [trailing-spaces] trailing spaces
# validate kepler bpf cpu time with process exporter (the process-exporter groupname must be the pid)
# include system and user
- name: scaph-process-cpu-time - kepler-process-bpf-cpu-time
mapping:
actual: scaph-process-cpu-time
predicted: kepler-process-bpf-cpu-time
units: Milliseconds
scaph-process-cpu-time: |
sum(
rate(
namedprocess_namegroup_cpu_seconds_total{{
groupname=~"{pids}"
}}[{rate_interval}]
)
) * 1000
kepler-process-bpf-cpu-time: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)
- name: kepler-process-bpf-cpu-time usage * node-package-power - kepler-process-package-power
mapping:
actual: kepler-process-cpu-ratio-node-package-power
predicted: kepler-process-package-power
units: Watts
kepler-process-cpu-ratio-node-package-power: |
(
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
) /

Check failure on line 72 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

72:14 [trailing-spaces] trailing spaces
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
) *

Check failure on line 80 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

80:12 [trailing-spaces] trailing spaces
sum(
rate(
node_rapl_package_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-process-package-power: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)
- name: node-exporter-cpu usage * node-package-power - kepler-process-package-power
mapping:
actual: kepler-process-cpu-ratio-node-package-power
predicted: kepler-process-package-power
units: Watts
kepler-process-cpu-ratio-node-package-power: |
(
(
sum(
rate(
node_cpu_seconds_total{{
cpu=~"{isolated_cpu}",

Check failure on line 109 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

109:41 [trailing-spaces] trailing spaces
mode!="idle"
}}[{rate_interval}]
)
) * 1000
) /

Check failure on line 114 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

114:14 [trailing-spaces] trailing spaces
(
sum(
rate(
node_cpu_seconds_total{{
mode!="idle",
}}[{rate_interval}]
)
) * 1000
)
) *

Check failure on line 124 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

124:12 [trailing-spaces] trailing spaces
sum(
rate(
node_rapl_package_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-process-package-power: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)
container:
# validate container bpf cpu time with node exporter
- name: node-cpu-time - kepler-container-bpf-cpu-time
mapping:
actual: node-cpu-time
predicted: kepler-container-bpf-cpu-time
units: Milliseconds
node-cpu-time: |
sum(
rate(
node_cpu_seconds_total{{
cpu=~"{isolated_cpu}",

Check failure on line 154 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

154:37 [trailing-spaces] trailing spaces
mode!="idle"
}}[{rate_interval}]
)
) * 1000
kepler-container-bpf-cpu-time: |
sum(
rate(
kepler_container_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
)

Check failure on line 168 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

168:1 [trailing-spaces] trailing spaces
- name: kepler-container-bpf-cpu-time usage * node-package-power - kepler-container-package-power
mapping:
actual: kepler-container-cpu-ratio-node-package-power
predicted: kepler-container-package-power
units: Watts
kepler-container-cpu-ratio-node-package-power: |
(
sum(
rate(
kepler_container_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
) /

Check failure on line 183 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

183:14 [trailing-spaces] trailing spaces
sum(
rate(
kepler_container_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
) *
sum(
rate(
node_rapl_package_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-container-package-power: |
sum(
rate(
kepler_container_package_joules_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
)
node:
# node level package power comparison
- name: node-rapl - kepler-node-package
mapping:
actual: node-rapl
predicted: kepler-node-package
units: Watts
node-rapl: |
sum(
rate(
node_rapl_package_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-node-package: |
sum(
rate(
kepler_node_package_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
2 changes: 2 additions & 0 deletions e2e/tools/validator/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,12 @@ dependencies = [
"matplotlib",
"scikit-learn",
"docker",
"psutil",
]

[project.scripts]
validator = "validator.cli:validator"
bm_validator = "validator.cli:bm_validator"

[tool.hatch.version]
path = "src/validator/__about__.py"
Expand Down
6 changes: 4 additions & 2 deletions e2e/tools/validator/scripts/targeted_stresser.sh
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,16 @@ main() {

start_time=$(date +%s)
echo "Stress Start Time: $start_time" >> "$TIME_INTERVAL_LOG"

local all_cpus
all_cpus=$(nproc)
for i in $(seq 1 "$iterations"); do
echo "Running $i/$iterations"
for x in "${load_curve[@]}"; do
local load="${x%%:*}"
local time="${x##*:}s"
if $set_general_mode; then
run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
# general mode: use all available cpus (from nproc) instead of $cpus
run stress-ng --cpu "$all_cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
else
run taskset -c "$cpu_range" stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
fi
Expand Down
23 changes: 14 additions & 9 deletions e2e/tools/validator/src/validator/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,8 +668,8 @@ def validate_acpi(cfg: config.Validator, duration: datetime.timedelta, report_di
type=click.Path(exists=True, dir_okay=True, writable=True),
show_default=True,
)
@pass_config
def regression(
@pass_bm_config
def stress(
cfg: config.BMValidator,
report_dir: str,
):
Expand All @@ -681,7 +681,8 @@ def regression(
click.secho(f"\tresults dir: {results_dir}, tag: {tag}", fg="bright_green")
res = TestResult(tag)
res.build_info, res.node_info = get_build_and_node_info(cfg.prometheus)
res.start_time = datetime.datetime.now()
test_start_time = datetime.datetime.now()
res.start_time = test_start_time
click.secho(" * Generating spec report ...", fg="green")
res.host_spec = get_host_spec()
validation_results = []
Expand All @@ -694,7 +695,7 @@ def regression(
local_stress_test = local_stress.stress()
start_time = local_stress_test.start_time
end_time = local_stress_test.end_time

print(f"node: start time: {start_time}, end time: {end_time}")
# sleep a bit for prometheus to finish scraping
click.secho(" * Sleeping for 10 seconds ...", fg="green")
time.sleep(10)
Expand All @@ -714,15 +715,14 @@ def regression(
start_time = process_stress_test.script_result.start_time
end_time = process_stress_test.script_result.end_time
relevant_pids = process_stress_test.relevant_pids

print(f"process: start time: {start_time}, end time: {end_time}")
# sleep a bit for prometheus to finish scraping
click.secho(" * Sleeping for 10 seconds ...", fg="green")
time.sleep(10)
click.secho(" * Acquiring process stress validations ...", fg="green")
prom = PrometheusClient(cfg.prometheus)
comparator = Comparator(prom)
validations = BLoader(cfg).load_process_validations(relevant_pids)

validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations])

if cfg.container:
Expand All @@ -734,7 +734,7 @@ def regression(
start_time = container_stress_test.script_result.start_time
end_time = container_stress_test.script_result.end_time
container_id = container_stress_test.container_id

print(f"container: start time: {start_time}, end time: {end_time}")
# sleep a bit for prometheus to finish scraping
click.secho(" * Sleeping for 10 seconds ...", fg="green")
time.sleep(10)
Expand All @@ -745,9 +745,14 @@ def regression(

validation_results.extend([run_validation(v, comparator, start_time, end_time, results_dir) for v in validations])

res.end_time = datetime.datetime.now()
test_end_time = datetime.datetime.now()
res.end_time = test_end_time

res.validations = validation_results
res.validations = ValidationResults(
started_at=test_start_time,
ended_at=test_end_time,
results=validation_results
)
write_json_report(results_dir, res)
write_md_report(results_dir, res)

Expand Down
Loading

0 comments on commit 0e2bf49

Please sign in to comment.