From 16b574724de45057b8f6407f146939415c392163 Mon Sep 17 00:00:00 2001 From: Judy Ng Date: Wed, 20 Mar 2024 12:17:00 -0400 Subject: [PATCH] Add static node stress test Signed-off-by: Judy Ng --- tests/integration-tests/clusters_factory.py | 2 + .../configs/scaling_stress_test.yaml | 10 +-- tests/integration-tests/conftest.py | 3 + .../tests/performance_tests/test_scaling.py | 77 +++++++++++++++++++ 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index d935a896b2..5e8d9e31e6 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -64,6 +64,8 @@ def __init__(self, name, ssh_key, config_file, region, custom_cli_credentials=No self.__cfn_resources = None self.__cfn_stack_arn = None self.custom_cli_credentials = custom_cli_credentials + self.create_start_time = None + self.create_end_time = None def __repr__(self): attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml index 48509191d8..d59ae53ae9 100644 --- a/tests/integration-tests/configs/scaling_stress_test.yaml +++ b/tests/integration-tests/configs/scaling_stress_test.yaml @@ -1,8 +1,8 @@ test-suites: performance_tests: - test_scaling.py::test_scaling_stress_test: + test_scaling.py::test_static_scaling_stress_test: dimensions: - - regions: ["us-east-1"] - instances: ["c5.large"] - oss: ["alinux2"] - schedulers: ["slurm"] + - regions: [ "eu-west-1" ] + instances: [ "c5.large" ] + oss: [ "alinux2" ] + schedulers: [ "slurm" ] diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 9b119cc6bc..54a1822138 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -14,6 +14,7 @@ # additional details. 
import copy +import datetime import json import logging import os @@ -423,7 +424,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c custom_cli_credentials=custom_cli_credentials, ) if not request.config.getoption("cluster"): + cluster.create_start_time = datetime.datetime.now(tz=datetime.timezone.utc) cluster.creation_response = factory.create_cluster(cluster, **kwargs) + cluster.create_end_time = datetime.datetime.now(tz=datetime.timezone.utc) return cluster yield _cluster_factory diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py index cade1b0e96..fe4e9a2a04 100644 --- a/tests/integration-tests/tests/performance_tests/test_scaling.py +++ b/tests/integration-tests/tests/performance_tests/test_scaling.py @@ -83,6 +83,83 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ raise e +@pytest.mark.usefixtures("scheduler") +@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"]) +def test_static_scaling_stress_test( + test_datadir, + instance, + os, + region, + request, + pcluster_config_reader, + scheduler_commands_factory, + clusters_factory, + scaling_strategy, +): + """ + The test scales up a cluster with a large number of static nodes, as opposed to scaling + up and down with dynamic nodes. + + This test records the amount of time it takes to create the cluster with the target number + of static nodes and then the amount of time it takes to start running a job once it has been + submitted. It compares the time to the baselines. + + This test doesn't upload metrics like the dynamic ones because the nodes start up in cluster creation, + so we can't monitor the number of nodes from here. So, we just do a check that the time it takes is within + the baseline. 
+ """ + # Get the scaling parameters + scaling_test_config = _validate_and_get_scaling_test_config(test_datadir, request) + max_monitoring_time_in_mins = scaling_test_config.get("MaxMonitoringTimeInMins") + shared_headnode_storage_type = scaling_test_config.get("SharedHeadNodeStorageType") + head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType") + scaling_targets = scaling_test_config.get("ScalingTargets") + + for scaling_target in scaling_targets: + # Creating cluster with intended head node instance type and scaling parameters + cluster_config = pcluster_config_reader( + # Prevent nodes from being scaled down before we start monitoring the scale down metrics + scaledown_idletime=max_monitoring_time_in_mins, + min_cluster_size=scaling_target, + max_cluster_size=scaling_target, + head_node_instance_type=head_node_instance_type, + shared_headnode_storage_type=shared_headnode_storage_type, + scaling_strategy=scaling_strategy, + ) + + # Create cluster and get creation start/end time + cluster = clusters_factory(cluster_config) + cluster_start_time = _datetime_to_minute(cluster.create_start_time) + cluster_end_time = _datetime_to_minute(cluster.create_end_time) + cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds()) + + # Run a job and get the time it takes for the job to start running + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = scheduler_commands_factory(remote_command_executor) + + scaling_job = { + "command": "srun sleep 10", + "nodes": scaling_target, + } + job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job) + start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc)) + scheduler_commands.wait_job_running(job_id) + end_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc)) + scheduler_commands.cancel_job(job_id) + job_start_time = int((end_time - start_time).total_seconds()) + + scaling_results = { + "Region": 
region, + "OS": os, + "ComputeNode": instance, + "HeadNode": head_node_instance_type, + "ScalingTarget": scaling_target, + "ScalingStrategy": scaling_strategy, + "ClusterCreateTime": cluster_create_time, + "JobStartTime": job_start_time + } + logging.info(f"Scaling Results: {scaling_results}") + @pytest.mark.usefixtures("scheduler") @pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"]) def test_scaling_stress_test(