feat(baremetal_validator): Add Validator for Process, Container Metrics #1878
@@ -31,6 +31,7 @@ dependencies = [
     "pandas",
     "matplotlib",
     "scikit-learn",
+    "docker",
 ]

 [project.scripts]
@@ -0,0 +1,85 @@
#!/usr/bin/env bash

set -eu -o pipefail

trap exit_all INT
exit_all() {
    pkill -P $$
}

run() {
    echo "❯ $*"
    "$@"
    echo " ‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾‾"
}

usage() {
    echo "Usage: $0 -g <general_mode> -r <cpu_range> -c <cpus> -d <mount_dir> -t <time_interval_log_name> -l <load_curve> -n <iterations>"
    echo " -g <general_mode> If set, <cpu_range> and <cpus> are ignored."
    echo " -r <cpu_range> CPU range for stress-ng taskset (Default: '15')"
    echo " -c <cpus> Number of CPUs to use for stress-ng (Default: '1')"
    echo " -d <mount_dir> Directory to mount for logging (Default: '/tmp')"
    echo " -t <time_interval_log_name> Filename for start and end time file log (Default: 'time_interval.log')"
    echo " -l <load_curve> Load curve as a comma-separated list (Default: '0:5,50:20,75:20,100:20,75:20,50:20')"
    echo " -n <iterations> Number of times to iterate the load curve (Default: '1')"
    exit 1
}

main() {

    set_general_mode=false
    DEFAULT_CPU_RANGE="15"
    DEFAULT_CPUS="1"
    DEFAULT_MOUNT_DIR="/tmp"
    # Each load-curve entry is "<cpu-load-percent>:<duration-seconds>"
    DEFAULT_LOAD_CURVE_STR="0:5,50:20,75:20,100:20,75:20,50:20"
    DEFAULT_TIME_INTERVAL_LOG_NAME="time_interval.log"
    DEFAULT_ITERATIONS="1"

    # Parse command-line options
    while getopts "g:r:c:d:t:l:n:" opt; do
        case "$opt" in
            g) set_general_mode=true ;;
            r) cpu_range="$OPTARG" ;;
            c) cpus="$OPTARG" ;;
            d) mount_dir="$OPTARG" ;;
            t) time_interval_log_name="$OPTARG" ;;
            l) load_curve_str="$OPTARG" ;;
            n) iterations="$OPTARG" ;;
            *) usage ;;
        esac
    done

    cpu_range="${cpu_range:-$DEFAULT_CPU_RANGE}"
    cpus="${cpus:-$DEFAULT_CPUS}"
    mount_dir="${mount_dir:-$DEFAULT_MOUNT_DIR}"
    time_interval_log_name="${time_interval_log_name:-$DEFAULT_TIME_INTERVAL_LOG_NAME}"
    load_curve_str="${load_curve_str:-$DEFAULT_LOAD_CURVE_STR}"
    iterations="${iterations:-$DEFAULT_ITERATIONS}"

    IFS=',' read -r -a load_curve <<< "$load_curve_str"

    TIME_INTERVAL_LOG="${mount_dir}/${time_interval_log_name}"

    > "$TIME_INTERVAL_LOG"

    start_time=$(date +%s)
    echo "Stress Start Time: $start_time" >> "$TIME_INTERVAL_LOG"
Review discussion on this line:

Reviewer: AFAIK we do something similar in our existing validator source by checking the start and end time of the …

Author: The reason this is necessary is primarily for container validation, and in the future if we attempt pods (which should still fall under container anyway). Container setup requires an image installation (in this case, fedora and stress-ng need to be installed). If we take the start and end time outside the script, we will include the installation time, which will skew the results. The easiest solution I can think of was to compute it in the bash script so we do not include the installation time.

    for i in $(seq 1 "$iterations"); do
        echo "Running $i/$iterations"
        for x in "${load_curve[@]}"; do
            local load="${x%%:*}"
            local time="${x##*:}s"
            if $set_general_mode; then
                run stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
            else
                run taskset -c "$cpu_range" stress-ng --cpu "$cpus" --cpu-method ackermann --cpu-load "$load" --timeout "$time"
            fi
        done
    done

    end_time=$(date +%s)
    echo "Stress End Time: $end_time" >> "$TIME_INTERVAL_LOG"
}

main "$@"
@@ -57,6 +57,113 @@ def __repr__(self):
        return f"<Config {self.remote}@{self.prometheus}>"


# consider switching to dataclass to avoid repeated fields
class Local(NamedTuple):
    load_curve: str
    iterations: str
    mount_dir: str


class LocalProcess(NamedTuple):
    isolated_cpu: str
    load_curve: str
    iterations: str
    mount_dir: str


class LocalContainer(NamedTuple):
    isolated_cpu: str
    container_name: str
    load_curve: str
    iterations: str
    mount_dir: str


class LocalPrometheus(NamedTuple):
    url: str
    rate_interval: str
    step: str
    job: str


class BMValidator(NamedTuple):
    log_level: str
    prom: LocalPrometheus
    node: Local
    process: LocalProcess
    container: LocalContainer
    validations_file: str


def bload(config_file: str) -> BMValidator:
    """
    Reads the Baremetal YAML configuration file and returns a BMValidator object.

    Args:
        config_file (str): Path to the Baremetal YAML configuration file.

    Returns:
        BMValidator: A named tuple containing configuration values for Baremetal Validation.
    """
    with open(config_file) as file:
        config = yaml.safe_load(file)

    log_level = config.get("log_level", "warn")
    prom_config = config["prometheus"]
    if not prom_config:
        prom_config = {}
    prom = LocalPrometheus(
        url=prom_config.get("url", "http://localhost:9090"),
        rate_interval=prom_config.get("rate_interval", "20s"),
        step=prom_config.get("step", "3s"),
        job=prom_config.get("job", "metal")
    )
    print(prom)

    default_config = config["config"]
    node_config = config["node"]
    process_config = config["process"]
    container_config = config["container"]
    # node config
    if not node_config:
        node_config = {}
    node = Local(
        load_curve=node_config.get("load_curve", default_config["load_curve"]),
        iterations=node_config.get("iterations", default_config["iterations"]),
        mount_dir=os.path.expanduser(node_config.get("mount_dir", default_config["mount_dir"]))
    )
    print(node)

Review discussion on this line:

Reviewer: Why do we need to mount the directory? Also, how are you planning to deploy Kepler on BM?

Author: Currently, Kepler is deployed with docker compose for simplicity; however, this should still work the same way in a Kubernetes environment. The mount directory is specifically for getting the start and end times of the stressor script out of the container as a log file. In the non-container case, the log file is saved to the mount directory.

Reviewer: Shouldn't we target a single environment for now and focus on expanding it further later? Currently, we use docker compose for our development and validations. Let's stick with that for now and add support for k8s afterwards.

    if not process_config:
        process_config = {}
    process = LocalProcess(
        isolated_cpu=process_config.get("isolated_cpu", default_config["isolated_cpu"]),
        load_curve=process_config.get("load_curve", default_config["load_curve"]),
        iterations=process_config.get("iterations", default_config["iterations"]),
        mount_dir=os.path.expanduser(process_config.get("mount_dir", default_config["mount_dir"]))
    )
    print(process)

Review discussion on the load_curve line:

Reviewer: What is the difference in the load curve between the process and the container?

Author: No difference; the logic is the same.

Reviewer: Then why do we require separate configs for both?

    if not container_config:
        container_config = {}
    container = LocalContainer(
        isolated_cpu=container_config.get("isolated_cpu", default_config["isolated_cpu"]),
        container_name=container_config.get("container_name", default_config["container_name"]),
        load_curve=container_config.get("load_curve", default_config["load_curve"]),
        iterations=container_config.get("iterations", default_config["iterations"]),
        mount_dir=os.path.expanduser(container_config.get("mount_dir", default_config["mount_dir"]))
    )
    print(container)

    validations_file = config.get("validations_file", "bm_validations.yaml")

    return BMValidator(
        log_level=log_level,
        prom=prom,
        node=node,
        process=process,
        container=container,
        validations_file=validations_file
    )

def load(config_file: str) -> Validator:
    """
    Reads the YAML configuration file and returns a Config object.
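For orientation, here is a minimal configuration that `bload()` above would accept, reconstructed purely from the keys it reads; every value is an illustrative assumption, and the defaults under `config` simply mirror the stressor script defaults:

```python
# Hypothetical example config; keys follow bload(), values are made up.
EXAMPLE_BM_CONFIG = """
log_level: warn
prometheus:
  url: http://localhost:9090
  rate_interval: 20s
  step: 3s
  job: metal
config:                     # shared defaults used when a section omits a key
  isolated_cpu: "15"
  container_name: bm-stress
  load_curve: "0:5,50:20,75:20,100:20,75:20,50:20"
  iterations: "1"
  mount_dir: /tmp
node: {}                    # empty sections fall back to the defaults above
process: {}
container: {}
validations_file: bm_validations.yaml
"""

with open("/tmp/bm_config.yaml", "w") as f:
    f.write(EXAMPLE_BM_CONFIG)

cfg = bload("/tmp/bm_config.yaml")  # BMValidator(log_level='warn', prom=LocalPrometheus(...), ...)
```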
Review discussion:

Reviewer: Couldn't we modify the existing stressor script instead of adding new ones?

Author: I did not want to modify the other scripts because it might break the metal CI. I think this script is the best candidate to replace the other stressor scripts (as it enables a custom load curve). A future PR should modify the validator to use just one go-to stressor script.
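Putting the timing log and the `LocalPrometheus` settings together, the validation step presumably issues range queries restricted to the stress window. A hypothetical sketch against the standard Prometheus HTTP API (the metric and labels are placeholders; only the URL, step, rate-interval, and job defaults come from the config code above):

```python
import requests

def query_stress_window(prom_url: str, promql: str, start: int, end: int, step: str = "3s"):
    """Range-query Prometheus over the stress window logged by the stressor script."""
    resp = requests.get(
        f"{prom_url}/api/v1/query_range",
        params={"query": promql, "start": start, "end": end, "step": step},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["data"]["result"]

# Example (placeholder metric, not from this PR):
# series = query_stress_window(
#     "http://localhost:9090",
#     'rate(node_cpu_seconds_total{job="metal", mode="user"}[20s])',
#     start, end,
# )
```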