Add static node stress test

Signed-off-by: Judy Ng <[email protected]>
judysng · Mar 20, 2024 · 89f69b6 · 89f69b6
1 parent 4e80c57
commit 89f69b6
Show file tree

Hide file tree

Showing 8 changed files with 189 additions and 5 deletions.
diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py
@@ -64,6 +64,8 @@ def __init__(self, name, ssh_key, config_file, region, custom_cli_credentials=No
         self.__cfn_resources = None
         self.__cfn_stack_arn = None
         self.custom_cli_credentials = custom_cli_credentials
+        self.create_start_time = None
+        self.create_end_time = None
 
     def __repr__(self):
         attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()])

diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -1,8 +1,8 @@
 test-suites:
   performance_tests:
-    test_scaling.py::test_scaling_stress_test:
+    test_scaling.py::test_static_scaling_stress_test:
       dimensions:
-        - regions: ["us-east-1"]
-          instances: ["c5.large"]
-          oss: ["alinux2"]
-          schedulers: ["slurm"]
+        - regions: [ "eu-west-1" ]
+          instances: [ "c5.large" ]
+          oss: [ "alinux2" ]
+          schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py
@@ -14,6 +14,7 @@
 # additional details.
 
 import copy
+import datetime
 import json
 import logging
 import os
@@ -423,7 +424,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c
             custom_cli_credentials=custom_cli_credentials,
         )
         if not request.config.getoption("cluster"):
+            cluster.create_start_time = datetime.datetime.now(tz=datetime.timezone.utc)
             cluster.creation_response = factory.create_cluster(cluster, **kwargs)
+            cluster.create_end_time = datetime.datetime.now(tz=datetime.timezone.utc)
         return cluster
 
     yield _cluster_factory

diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -83,6 +83,83 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ
         raise e
 
 
+@pytest.mark.usefixtures("scheduler")
+@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"])
+def test_static_scaling_stress_test(
+    test_datadir,
+    instance,
+    os,
+    region,
+    request,
+    pcluster_config_reader,
+    scheduler_commands_factory,
+    clusters_factory,
+    scaling_strategy,
+):
+    """
+    Create clusters made only of static nodes and log how long creation and job start take.
+
+    For each value in ScalingTargets, a cluster is created with min and max cluster size both set to that
+    value, as opposed to scaling up and down with dynamic nodes. The test records the time it takes to
+    create the cluster and the time between a job being accepted and that job being reported as running.
+
+    This test doesn't upload metrics like the dynamic ones because the nodes start up in cluster creation,
+    so we can't monitor the number of nodes from here. The measured times are only logged.
+    NOTE(review): no check against results/baseline.json is done here yet - add one or drop the baseline.
+    NOTE(review): timestamps go through _datetime_to_minute, so durations are presumably minute-granular.
+    """
+    # Get the scaling parameters
+    scaling_test_config = _validate_and_get_scaling_test_config(test_datadir, request)
+    max_monitoring_time_in_mins = scaling_test_config.get("MaxMonitoringTimeInMins")
+    shared_headnode_storage_type = scaling_test_config.get("SharedHeadNodeStorageType")
+    head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
+    scaling_targets = scaling_test_config.get("ScalingTargets")
+
+    for scaling_target in scaling_targets:
+        # Creating cluster with intended head node instance type and scaling parameters
+        cluster_config = pcluster_config_reader(
+            # Reuse MaxMonitoringTimeInMins as the scale-down idle time passed to the cluster config
+            scaledown_idletime=max_monitoring_time_in_mins,
+            min_cluster_size=scaling_target,
+            max_cluster_size=scaling_target,
+            head_node_instance_type=head_node_instance_type,
+            shared_headnode_storage_type=shared_headnode_storage_type,
+            scaling_strategy=scaling_strategy,
+        )
+
+        # Create cluster and get creation start/end time
+        cluster = clusters_factory(cluster_config)
+        cluster_start_time = _datetime_to_minute(cluster.create_start_time)
+        cluster_end_time = _datetime_to_minute(cluster.create_end_time)
+        cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())
+
+        # Run a job and get the time it takes for the job to start running
+        remote_command_executor = RemoteCommandExecutor(cluster)
+        scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
+        scaling_job = {
+            "command": "srun sleep 10",
+            "nodes": scaling_target,
+        }
+        job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
+        start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        scheduler_commands.wait_job_running(job_id)
+        end_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        scheduler_commands.cancel_job(job_id)
+        job_start_time = int((end_time - start_time).total_seconds())
+
+        scaling_results = {
+            "Region": region,
+            "OS": os,
+            "ComputeNode": instance,
+            "HeadNode": head_node_instance_type,
+            "ScalingTarget": scaling_target,
+            "ScalingStrategy": scaling_strategy,
+            "ClusterCreateTime": cluster_create_time,
+            "JobStartTime": job_start_time,
+        }
+        logging.info(f"Scaling Results: {scaling_results}")
+
 @pytest.mark.usefixtures("scheduler")
 @pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"])
 def test_scaling_stress_test(

diff --git a/...tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/...tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -0,0 +1,30 @@
+Image:
+  Os: {{ os }}
+HeadNode:
+  {% if shared_headnode_storage_type %}
+  SharedStorageType: {{ shared_headnode_storage_type }}
+  {% endif %}
+  InstanceType: {{ head_node_instance_type }}
+  Networking:
+    SubnetId: {{ public_subnet_id }}
+  Ssh:
+    KeyName: {{ key_name }}
+Scheduling:
+  Scheduler: {{ scheduler }}
+  ScalingStrategy: {{ scaling_strategy }}  # supplied by the test's scaling_strategy parametrization
+  SlurmSettings:
+    ScaledownIdletime: {{ scaledown_idletime }}  # the test passes MaxMonitoringTimeInMins from its scaling test config
+  SlurmQueues:
+    - Name: queue-0
+      ComputeResources:
+        - Name: compute-resource-0
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: {{ min_cluster_size }}  # the test passes the same scaling target for min and max (static nodes only)
+          MaxCount: {{ max_cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
+DevSettings:
+  Timeouts:
+    HeadNodeBootstrapTimeout: 3600  # NOTE(review): presumably seconds, raised for large static fleets - confirm
diff --git a/...ests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json b/...ests/performance_tests/test_scaling/test_static_scaling_stress_test/results/baseline.json
@@ -0,0 +1,52 @@
+{
+  "c5.large": {
+    "1000": {
+      "best-effort": {
+        "scale_up_time_ec2": 180,
+        "scale_up_time_scheduler": 540,
+        "scale_down_time": 120
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 180,
+        "scale_up_time_scheduler": 540,
+        "scale_down_time": 120
+      }
+    },
+    "2000": {
+      "best-effort": {
+        "scale_up_time_ec2": 300,
+        "scale_up_time_scheduler": 600,
+        "scale_down_time": 180
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 300,
+        "scale_up_time_scheduler": 600,
+        "scale_down_time": 180
+      }
+    },
+    "3000": {
+      "best-effort": {
+        "scale_up_time_ec2": 420,
+        "scale_up_time_scheduler": 1020,
+        "scale_down_time": 240
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 420,
+        "scale_up_time_scheduler": 1020,
+        "scale_down_time": 240
+      }
+    },
+    "4000": {
+      "best-effort": {
+        "scale_up_time_ec2": 540,
+        "scale_up_time_scheduler": 1200,
+        "scale_down_time": 300
+      },
+      "all-or-nothing": {
+        "scale_up_time_ec2": 540,
+        "scale_up_time_scheduler": 1200,
+        "scale_down_time": 300
+      }
+    }
+  }
+}
diff --git a/...s/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml b/...s/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
@@ -0,0 +1,4 @@
+MaxMonitoringTimeInMins: 20
+ScalingTargets: [1000, 2000, 3000, 4000]
+SharedHeadNodeStorageType: 'Efs'
+HeadNodeInstanceType: 'c5.24xlarge'
diff --git a/...rmance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml b/...rmance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config_schema.yaml
@@ -0,0 +1,16 @@
+type: map
+mapping:
+  MaxMonitoringTimeInMins:
+    type: int
+    required: true
+  ScalingTargets:
+    type: seq
+    required: true
+    sequence:
+      - type: int
+  SharedHeadNodeStorageType:
+    type: str
+    required: true
+  HeadNodeInstanceType:
+    type: str
+    required: true