Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(baremetal_validator): Add Validator for Process, Container Metrics #1878

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 333 additions & 0 deletions e2e/tools/validator/bm_validations.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,333 @@
# metal_job_name: metal
# scaphandre_job_name: scaphandre
# node_exporter_job_name: node_exporter
# TODO: remove the hard-coded `path` label matcher from the node_rapl_* queries if possible

validations:
node:

Check failure on line 7 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

7:8 [trailing-spaces] trailing spaces
- name: node-rapl-package - kepler-node-package
mapping:
actual: node-rapl-package
predicted: kepler-node-package
units: Watts
node-rapl-package: |
sum(
rate(
node_rapl_package_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-node-package: |
sum(
rate(
kepler_node_package_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

- name: node-rapl-package - sum-kepler-process-package
mapping:
actual: node-rapl-package
predicted: sum-kepler-process-package
units: Watts
node-rapl-package: |
sum(
rate(
node_rapl_package_joules_total[{rate_interval}]
)
)
sum-kepler-process-package: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

Check failure on line 49 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

49:1 [trailing-spaces] trailing spaces
- name: sum-kepler-process-package - kepler-node-package
mapping:
actual: sum-kepler-process-package
predicted: kepler-node-package
units: Watts
sum-kepler-process-package: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
kepler-node-package: |
sum(
rate(
kepler_node_package_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

Check failure on line 71 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

71:1 [trailing-spaces] trailing spaces
- name: node-rapl-core - kepler-node-core
mapping:
actual: node-rapl-core
predicted: kepler-node-core
units: Watts
node-rapl-core: |
sum(
rate(
node_rapl_core_joules_total{{
path="/host/sys/class/powercap/intel-rapl:0"
}}[{rate_interval}]
)
)
kepler-node-core: |
sum(
rate(
kepler_node_core_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

- name: node-rapl-core - sum-kepler-process-core
mapping:
actual: node-rapl-core
predicted: sum-kepler-process-core
units: Watts
node-rapl-core: |
sum(
rate(
node_rapl_core_joules_total[{rate_interval}]
)
)
sum-kepler-process-core: |
sum(
rate(
kepler_process_core_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

Check failure on line 113 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

113:1 [trailing-spaces] trailing spaces
- name: sum-kepler-process-core - kepler-node-core
mapping:
actual: sum-kepler-process-core
predicted: kepler-node-core
units: Watts
sum-kepler-process-core: |
sum(
rate(
kepler_process_core_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
kepler-node-core: |
sum(
rate(
kepler_node_core_joules_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)

process:
# validate kepler process bpf cpu time against node exporter cpu time
- name: node-cpu-time - kepler-process-bpf-cpu-time
mapping:
actual: node-cpu-time
predicted: kepler-process-bpf-cpu-time
units: Milliseconds
node-cpu-time: |
sum(
rate(
node_cpu_seconds_total{{
cpu="{isolated_cpu}",

Check failure on line 147 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

147:36 [trailing-spaces] trailing spaces
mode!="idle"
}}[{rate_interval}]
)
) * 1000
kepler-process-bpf-cpu-time: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)

Check failure on line 161 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

161:1 [trailing-spaces] trailing spaces
# validate kepler bpf cpu time against process-exporter (the process-exporter groupname must be the pid)
# the process-exporter query has no mode filter, so it includes both system and user cpu time
- name: process-exporter-process-cpu-time - kepler-process-bpf-cpu-time
mapping:
actual: process-exporter-process-cpu-time
predicted: kepler-process-bpf-cpu-time
units: Milliseconds
process-exporter-process-cpu-time: |
sum(
rate(
namedprocess_namegroup_cpu_seconds_total{{
groupname=~"{pids}"
}}[{rate_interval}]
)
) * 1000
kepler-process-bpf-cpu-time: |
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)

- name: kepler-process-bpf-cpu-time usage * node-package-power - kepler-process-package-power
mapping:
actual: kepler-process-cpu-ratio-node-package-power
predicted: kepler-process-package-power
units: Watts
kepler-process-cpu-ratio-node-package-power: |
(
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
) /

Check failure on line 201 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

201:14 [trailing-spaces] trailing spaces
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
) *

Check failure on line 209 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

209:12 [trailing-spaces] trailing spaces
sum(
rate(
node_rapl_package_joules_total[{rate_interval}]
)
)
kepler-process-package-power: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)

- name: node-exporter-cpu usage * node-package-power - kepler-process-package-power
mapping:
actual: kepler-process-cpu-ratio-node-package-power
predicted: kepler-process-package-power
units: Watts
kepler-process-cpu-ratio-node-package-power: |
(
(
sum(
rate(
node_cpu_seconds_total{{
cpu=~"{isolated_cpu}",

Check failure on line 236 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

236:41 [trailing-spaces] trailing spaces
mode!="idle"
}}[{rate_interval}]
)
) * 1000
) /

Check failure on line 241 in e2e/tools/validator/bm_validations.yaml

View workflow job for this annotation

GitHub Actions / yamllint / yamllint

241:14 [trailing-spaces] trailing spaces
(
sum(
rate(
node_cpu_seconds_total{{
mode!="idle",
}}[{rate_interval}]
)
) * 1000
)
) *
sum(
rate(
node_rapl_package_joules_total[{rate_interval}]
)
)
kepler-process-package-power: |
sum(
rate(
kepler_process_package_joules_total{{
job="{metal_job_name}",
pid=~"{pids}"
}}[{rate_interval}]
)
)


container:
# validate kepler container bpf cpu time against node exporter cpu time
- name: node-cpu-time - kepler-container-bpf-cpu-time
mapping:
actual: node-cpu-time
predicted: kepler-container-bpf-cpu-time
units: Milliseconds
node-cpu-time: |
sum(
rate(
node_cpu_seconds_total{{
cpu=~"{isolated_cpu}",
mode!="idle"
}}[{rate_interval}]
)
) * 1000
kepler-container-bpf-cpu-time: |
sum(
rate(
kepler_container_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
)

- name: kepler-container-bpf-cpu-time usage * node-package-power - kepler-container-package-power
mapping:
actual: kepler-container-cpu-ratio-node-package-power
predicted: kepler-container-package-power
units: Watts
kepler-container-cpu-ratio-node-package-power: |
(
sum(
rate(
kepler_container_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
) /
sum(
rate(
kepler_process_bpf_cpu_time_ms_total{{
job="{metal_job_name}",
}}[{rate_interval}]
)
)
) *
sum(
rate(
node_rapl_package_joules_total[{rate_interval}]
)
)
kepler-container-package-power: |
sum(
rate(
kepler_container_package_joules_total{{
job="{metal_job_name}",
container_id="{container_id}"
}}[{rate_interval}]
)
)

# node rapl {package/core} = kepler node {package/core}
# node
3 changes: 3 additions & 0 deletions e2e/tools/validator/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,13 @@ dependencies = [
"pandas",
"matplotlib",
"scikit-learn",
"docker",
"psutil",
]

[project.scripts]
validator = "validator.cli:validator"
bm_validator = "validator.cli:bm_validator"

[tool.hatch.version]
path = "src/validator/__about__.py"
Expand Down
Loading
Loading