From 8b298b461eb608bc88c6eb33aeaf0e48b050df28 Mon Sep 17 00:00:00 2001
From: Judy Ng
Date: Wed, 20 Mar 2024 12:17:00 -0400
Subject: [PATCH] Add static node stress test

Signed-off-by: Judy Ng
---
 tests/integration-tests/clusters_factory.py   |  2 +
 .../configs/scaling_stress_test.yaml          | 10 +--
 tests/integration-tests/conftest.py           |  3 +
 .../tests/performance_tests/test_scaling.py   | 77 +++++++++++++++++++
 .../pcluster.config.yaml                      | 27 +++++++
 .../results/baseline.json                     | 52 +++++++++++++
 .../scaling_test_config.yaml                  |  4 +
 .../scaling_test_config_schema.yaml           | 16 ++++
 8 files changed, 186 insertions(+), 5 deletions(-)
 create mode 100644 tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
 create mode 100644 tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json
 create mode 100644 tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
 create mode 100644 tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml

diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py
index d935a896b2..5e8d9e31e6 100644
--- a/tests/integration-tests/clusters_factory.py
+++ b/tests/integration-tests/clusters_factory.py
@@ -64,6 +64,8 @@ def __init__(self, name, ssh_key, config_file, region, custom_cli_credentials=No
         self.__cfn_resources = None
         self.__cfn_stack_arn = None
         self.custom_cli_credentials = custom_cli_credentials
+        self.create_start_time = None
+        self.create_end_time = None
 
     def __repr__(self):
         attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])
diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
index 48509191d8..d59ae53ae9 100644
--- a/tests/integration-tests/configs/scaling_stress_test.yaml
+++ b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -1,8 +1,8 @@
 test-suites:
   performance_tests:
-    test_scaling.py::test_scaling_stress_test:
+    test_scaling.py::test_static_scaling_stress_test:
       dimensions:
-        - regions: ["us-east-1"]
-          instances: ["c5.large"]
-          oss: ["alinux2"]
-          schedulers: ["slurm"]
+        - regions: [ "eu-west-1" ]
+          instances: [ "c5.large" ]
+          oss: [ "alinux2" ]
+          schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
index 9b119cc6bc..54a1822138 100644
--- a/tests/integration-tests/conftest.py
+++ b/tests/integration-tests/conftest.py
@@ -14,6 +14,7 @@
 # additional details.
 
 import copy
+import datetime
 import json
 import logging
 import os
@@ -423,7 +424,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c
             custom_cli_credentials=custom_cli_credentials,
         )
         if not request.config.getoption("cluster"):
+            cluster.create_start_time = datetime.datetime.now(tz=datetime.timezone.utc)
             cluster.creation_response = factory.create_cluster(cluster, **kwargs)
+            cluster.create_end_time = datetime.datetime.now(tz=datetime.timezone.utc)
         return cluster
 
     yield _cluster_factory
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index cade1b0e96..fe4e9a2a04 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -83,6 +83,83 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ
         raise e
 
 
+@pytest.mark.usefixtures("scheduler")
+@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"])
+def test_static_scaling_stress_test(
+    test_datadir,
+    instance,
+    os,
+    region,
+    request,
+    pcluster_config_reader,
+    scheduler_commands_factory,
+    clusters_factory,
+    scaling_strategy,
+):
+    """
+    The test scales up a cluster with a large number of static nodes, as opposed to scaling
+    up and down with dynamic nodes.
+
+    This test records the amount of time it takes to create the cluster with the target number
+    of static nodes and then the amount of time it takes to start running a job once it has been
+    submitted. The measured times are logged for each scaling target.
+
+    This test doesn't upload metrics like the dynamic ones because the nodes start up in cluster creation,
+    so we can't monitor the number of nodes from here. NOTE: no comparison against results/baseline.json
+    is performed yet; the timings are only reported through the test log.
+    """
+    # Get the scaling parameters
+    scaling_test_config = _validate_and_get_scaling_test_config(test_datadir, request)
+    max_monitoring_time_in_mins = scaling_test_config.get("MaxMonitoringTimeInMins")
+    shared_headnode_storage_type = scaling_test_config.get("SharedHeadNodeStorageType")
+    head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
+    scaling_targets = scaling_test_config.get("ScalingTargets")
+
+    for scaling_target in scaling_targets:
+        # Creating cluster with intended head node instance type and scaling parameters
+        cluster_config = pcluster_config_reader(
+            # ScaledownIdletime is set to the configured MaxMonitoringTimeInMins value
+            scaledown_idletime=max_monitoring_time_in_mins,
+            min_cluster_size=scaling_target,
+            max_cluster_size=scaling_target,
+            head_node_instance_type=head_node_instance_type,
+            shared_headnode_storage_type=shared_headnode_storage_type,
+            scaling_strategy=scaling_strategy,
+        )
+
+        # Create cluster and get creation start/end time
+        cluster = clusters_factory(cluster_config)
+        cluster_start_time = _datetime_to_minute(cluster.create_start_time)
+        cluster_end_time = _datetime_to_minute(cluster.create_end_time)
+        cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())
+
+        # Run a job and get the time it takes for the job to start running
+        remote_command_executor = RemoteCommandExecutor(cluster)
+        scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+        scaling_job = {
+            "command": "srun sleep 10",
+            "nodes": scaling_target,
+        }
+        job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
+        start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        scheduler_commands.wait_job_running(job_id)
+        end_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        scheduler_commands.cancel_job(job_id)
+        job_start_time = int((end_time - start_time).total_seconds())
+
+        scaling_results = {
+            "Region": region,
+            "OS": os,
+            "ComputeNode": instance,
+            "HeadNode": head_node_instance_type,
+            "ScalingTarget": scaling_target,
+            "ScalingStrategy": scaling_strategy,
+            "ClusterCreateTime": cluster_create_time,
+            "JobStartTime": job_start_time,
+        }
+        logging.info(f"Scaling Results: {scaling_results}")
+
 @pytest.mark.usefixtures("scheduler")
 @pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"])
 def test_scaling_stress_test(
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
new file mode 100644
index 0000000000..a51c90dd92
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -0,0 +1,27 @@
+Image:
+  Os: {{ os }}
+HeadNode:
+  {% if shared_headnode_storage_type %}
+  SharedStorageType: {{ shared_headnode_storage_type }}
+  {% endif %}
+  InstanceType: {{ head_node_instance_type }}
+  Networking:
+    SubnetId: {{ public_subnet_id }}
+  Ssh:
+    KeyName: {{ key_name }}
+Scheduling:
+  Scheduler: {{ scheduler }}
+  ScalingStrategy: {{ scaling_strategy }}
+  SlurmSettings:
+    ScaledownIdletime: {{ scaledown_idletime }}
+  SlurmQueues:
+    - Name: queue-0
+      ComputeResources:
+        - Name: compute-resource-0
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: {{ min_cluster_size }}
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json
new file mode 100644
index 0000000000..fbbb924838
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json
@@ -0,0 +1,52 @@
+{
+  "c5.large": {
+    "1000": {
+      "best-effort": {
+        "scale_up_time_ec2": 180,
+        "scale_up_time_scheduler": 540,
+        "scale_down_time": 120
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 180,
+        "scale_up_time_scheduler": 540,
+        "scale_down_time": 120
+      }
+    },
+    "2000": {
+      "best-effort": {
+        "scale_up_time_ec2": 300,
+        "scale_up_time_scheduler": 600,
+        "scale_down_time": 180
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 300,
+        "scale_up_time_scheduler": 600,
+        "scale_down_time": 180
+      }
+    },
+    "3000": {
+      "best-effort": {
+        "scale_up_time_ec2": 420,
+        "scale_up_time_scheduler": 1020,
+        "scale_down_time": 240
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 420,
+        "scale_up_time_scheduler": 1020,
+        "scale_down_time": 240
+      }
+    },
+    "4000": {
+      "best-effort": {
+        "scale_up_time_ec2": 540,
+        "scale_up_time_scheduler": 1200,
+        "scale_down_time": 300
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 540,
+        "scale_up_time_scheduler": 1200,
+        "scale_down_time": 300
+      }
+    }
+  }
+}
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
new file mode 100644
index 0000000000..c5a8872274
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
@@ -0,0 +1,4 @@
+MaxMonitoringTimeInMins: 20
+ScalingTargets: [1000, 2000, 3000, 4000]
+SharedHeadNodeStorageType: 'Efs'
+HeadNodeInstanceType: 'c5.24xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml
new file mode 100644
index 0000000000..f40b1ad28d
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml
@@ -0,0 +1,16 @@
+type: map
+mapping:
+  MaxMonitoringTimeInMins:
+    type: int
+    required: true
+  ScalingTargets:
+    type: seq
+    required: true
+    sequence:
+      - type: int
+  SharedHeadNodeStorageType:
+    type: str
+    required: true
+  HeadNodeInstanceType:
+    type: str
+    required: true