From 16b574724de45057b8f6407f146939415c392163 Mon Sep 17 00:00:00 2001 From: Judy Ng Date: Wed, 20 Mar 2024 12:17:00 -0400 Subject: [PATCH] Add static node stress test Signed-off-by: Judy Ng --- tests/integration-tests/clusters_factory.py | 2 + .../configs/scaling_stress_test.yaml | 10 +-- tests/integration-tests/conftest.py | 3 + .../tests/performance_tests/test_scaling.py | 77 +++++++++++++++++++ 4 files changed, 87 insertions(+), 5 deletions(-) diff --git a/tests/integration-tests/clusters_factory.py b/tests/integration-tests/clusters_factory.py index d935a896b2..5e8d9e31e6 100644 --- a/tests/integration-tests/clusters_factory.py +++ b/tests/integration-tests/clusters_factory.py @@ -64,6 +64,8 @@ def __init__(self, name, ssh_key, config_file, region, custom_cli_credentials=No self.__cfn_resources = None self.__cfn_stack_arn = None self.custom_cli_credentials = custom_cli_credentials + self.create_start_time = None + self.create_end_time = None def __repr__(self): attrs = ", ".join(["{key}={value}".format(key=key, value=repr(value)) for key, value in self.__dict__.items()]) diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml index 48509191d8..d59ae53ae9 100644 --- a/tests/integration-tests/configs/scaling_stress_test.yaml +++ b/tests/integration-tests/configs/scaling_stress_test.yaml @@ -1,8 +1,8 @@ test-suites: performance_tests: - test_scaling.py::test_scaling_stress_test: + test_scaling.py::test_static_scaling_stress_test: dimensions: - - regions: ["us-east-1"] - instances: ["c5.large"] - oss: ["alinux2"] - schedulers: ["slurm"] + - regions: [ "eu-west-1" ] + instances: [ "c5.large" ] + oss: [ "alinux2" ] + schedulers: [ "slurm" ] diff --git a/tests/integration-tests/conftest.py b/tests/integration-tests/conftest.py index 9b119cc6bc..54a1822138 100644 --- a/tests/integration-tests/conftest.py +++ b/tests/integration-tests/conftest.py @@ -14,6 +14,7 @@ # additional details. 
import copy +import datetime import json import logging import os @@ -423,7 +424,9 @@ def _cluster_factory(cluster_config, upper_case_cluster_name=False, custom_cli_c custom_cli_credentials=custom_cli_credentials, ) if not request.config.getoption("cluster"): + cluster.create_start_time = datetime.datetime.now(tz=datetime.timezone.utc) cluster.creation_response = factory.create_cluster(cluster, **kwargs) + cluster.create_end_time = datetime.datetime.now(tz=datetime.timezone.utc) return cluster yield _cluster_factory diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py index cade1b0e96..fe4e9a2a04 100644 --- a/tests/integration-tests/tests/performance_tests/test_scaling.py +++ b/tests/integration-tests/tests/performance_tests/test_scaling.py @@ -83,6 +83,83 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ raise e +@pytest.mark.usefixtures("scheduler") +@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"]) +def test_static_scaling_stress_test( + test_datadir, + instance, + os, + region, + request, + pcluster_config_reader, + scheduler_commands_factory, + clusters_factory, + scaling_strategy, +): + """ + The test scales up a cluster with a large number of static nodes, as opposed to scaling + up and down with dynamic nodes. + + This test records the amount of time it takes to create the cluster with the target number + of static nodes and then the amount of time it takes to start running a job once it has been + submitted. It compares the time to the baselines. + + This test doesn't upload metrics like the dynamic ones because the nodes start up in cluster creation, + so we can't monitor the number of nodes from here. So, we just do a check that the time it takes is within + the baseline. 
+ """ + # Get the scaling parameters + scaling_test_config = _validate_and_get_scaling_test_config(test_datadir, request) + max_monitoring_time_in_mins = scaling_test_config.get("MaxMonitoringTimeInMins") + shared_headnode_storage_type = scaling_test_config.get("SharedHeadNodeStorageType") + head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType") + scaling_targets = scaling_test_config.get("ScalingTargets") + + for scaling_target in scaling_targets: + # Creating cluster with intended head node instance type and scaling parameters + cluster_config = pcluster_config_reader( + # Prevent nodes from being scaled down before we start monitoring the scale down metrics + scaledown_idletime=max_monitoring_time_in_mins, + min_cluster_size=scaling_target, + max_cluster_size=scaling_target, + head_node_instance_type=head_node_instance_type, + shared_headnode_storage_type=shared_headnode_storage_type, + scaling_strategy=scaling_strategy, + ) + + # Create cluster and get creation start/end time + cluster = clusters_factory(cluster_config) + cluster_start_time = _datetime_to_minute(cluster.create_start_time) + cluster_end_time = _datetime_to_minute(cluster.create_end_time) + cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds()) + + # Run a job and get the time it takes for the job to start running + remote_command_executor = RemoteCommandExecutor(cluster) + scheduler_commands = scheduler_commands_factory(remote_command_executor) + + scaling_job = { + "command": "srun sleep 10", + "nodes": scaling_target, + } + job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job) + start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc)) + scheduler_commands.wait_job_running(job_id) + end_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc)) + scheduler_commands.cancel_job(job_id) + job_start_time = int((end_time - start_time).total_seconds()) + + scaling_results = { + "Region": 
region, + "OS": os, + "ComputeNode": instance, + "HeadNode": head_node_instance_type, + "ScalingTarget": scaling_target, + "ScalingStrategy": scaling_strategy, + "ClusterCreateTime": cluster_create_time, + "JobStartTime": job_start_time + } + logging.info(f"Scaling Results: {scaling_results}") + @pytest.mark.usefixtures("scheduler") @pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"]) def test_scaling_stress_test(