From 772599bcf1e372c28b0c4af3f547daa55f3c6ff5 Mon Sep 17 00:00:00 2001
From: Judy Ng
Date: Thu, 21 Mar 2024 15:04:40 -0400
Subject: [PATCH] testing

Signed-off-by: Judy Ng
---
 .../tests/performance_tests/test_scaling.py   | 69 +++++++++++++++----
 .../pcluster.config.yaml                      |  7 +-
 .../scaling_test_config.yaml                  |  2 +-
 .../upscale-pcluster.config.yaml              | 27 ++++++++
 4 files changed, 84 insertions(+), 21 deletions(-)
 create mode 100644 tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml

diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index fe4e9a2a04..b05df5d5ff 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -115,30 +115,51 @@ def test_static_scaling_stress_test(
     head_node_instance_type = scaling_test_config.get("HeadNodeInstanceType")
     scaling_targets = scaling_test_config.get("ScalingTargets")
 
+    # Creating cluster with intended head node instance type and scaling parameters
+    cluster_config = pcluster_config_reader(
+        # Prevent nodes being set down before we start monitoring the scale down metrics
+        scaledown_idletime=max_monitoring_time_in_mins,
+        head_node_instance_type=head_node_instance_type,
+        shared_headnode_storage_type=shared_headnode_storage_type,
+        scaling_strategy=scaling_strategy,
+        cluster_size=max(scaling_targets),
+    )
+    cluster = clusters_factory(cluster_config)
+    remote_command_executor = RemoteCommandExecutor(cluster)
+    scheduler_commands = scheduler_commands_factory(remote_command_executor)
+
     for scaling_target in scaling_targets:
-        # Creating cluster with intended head node instance type and scaling parameters
-        cluster_config = pcluster_config_reader(
-            # Prevent nodes being set down before we start monitoring the scale down metrics
+        upscale_cluster_config = pcluster_config_reader(
+            config_file="upscale-pcluster.config.yaml",
             scaledown_idletime=max_monitoring_time_in_mins,
-            min_cluster_size=scaling_target,
-            max_cluster_size=scaling_target,
+            cluster_size=scaling_target,
             head_node_instance_type=head_node_instance_type,
             shared_headnode_storage_type=shared_headnode_storage_type,
             scaling_strategy=scaling_strategy,
         )
-        # Create cluster and get creation start/end time
-        cluster = clusters_factory(cluster_config)
-        cluster_start_time = _datetime_to_minute(cluster.create_start_time)
-        cluster_end_time = _datetime_to_minute(cluster.create_end_time)
-        cluster_create_time = int((cluster_end_time - cluster_start_time).total_seconds())
+        cluster.update(str(upscale_cluster_config), force_update="true", wait=False, raise_on_error=False)
 
-        # Run a job and get the time it takes for the job to start running
-        remote_command_executor = RemoteCommandExecutor(cluster)
-        scheduler_commands = scheduler_commands_factory(remote_command_executor)
+        # Set start time at minute granularity (to simplify calculation and visualising on CloudWatch)
+        start_time = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
 
+        # Monitor the cluster during scale up
+        ec2_capacity_time_series_up, compute_nodes_time_series_up, timestamps, end_time = get_scaling_metrics(
+            remote_command_executor,
+            max_monitoring_time=minutes(max_monitoring_time_in_mins),
+            region=region,
+            cluster_name=cluster.name,
+            publish_metrics=True,
+            target_cluster_size=scaling_target,
+        )
+        # Extract scale up duration and timestamp from the monitoring metrics collected above
+        _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
+        scaling_target_time, scale_up_time_scheduler = _get_scaling_time(
+            compute_nodes_time_series_up, timestamps, scaling_target, start_time
+        )
+
+        # Check that a simple job succeeds and retrieve the time it takes for the job to start
         scaling_job = {
-            "command": f"srun sleep 10",
+            "command": "srun sleep 10",
             "nodes": scaling_target,
         }
         job_id = scheduler_commands.submit_command_and_assert_job_accepted(scaling_job)
@@ -148,6 +169,22 @@ def test_static_scaling_stress_test(
         scheduler_commands.cancel_job(job_id)
         job_start_time = int((end_time - start_time).total_seconds())
 
+        # Scale down the cluster
+        cluster.update(str(cluster_config), force_update="true", wait=False, raise_on_error=False)
+
+        # Monitor the cluster during scale down
+        scale_down_start_timestamp = _datetime_to_minute(datetime.datetime.now(tz=datetime.timezone.utc))
+        ec2_capacity_time_series_down, compute_nodes_time_series_down, timestamps, end_time = get_scaling_metrics(
+            remote_command_executor,
+            max_monitoring_time=minutes(max_monitoring_time_in_mins),
+            region=region,
+            cluster_name=cluster.name,
+            publish_metrics=True,
+            target_cluster_size=0,
+        )
+        # Extract scale down duration and timestamp from the monitoring metrics collected above
+        _, scale_down_time = _get_scaling_time(ec2_capacity_time_series_down, timestamps, 0, scale_down_start_timestamp)
+
         scaling_results = {
             "Region": region,
             "OS": os,
@@ -155,7 +192,9 @@ def test_static_scaling_stress_test(
             "HeadNode": head_node_instance_type,
             "ScalingTarget": scaling_target,
             "ScalingStrategy": scaling_strategy,
-            "ClusterCreateTime": cluster_create_time,
+            "ScaleUpTimeEC2": scale_up_time_ec2,
+            "ScaleUpTimeScheduler": scale_up_time_scheduler,
+            "ScaleDownTime": scale_down_time,
             "JobStartTime": job_start_time
         }
         logging.info(f"Scaling Results: {scaling_results}")
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
index 7ca50781a2..e2a3fd4a2e 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/pcluster.config.yaml
@@ -20,11 +20,8 @@ Scheduling:
         - Name: compute-resource-0
           Instances:
             - InstanceType: {{ instance }}
-          MinCount: {{ min_cluster_size }}
-          MaxCount: {{ max_cluster_size }}
+          MinCount: 0
+          MaxCount: {{ cluster_size }}
       Networking:
         SubnetIds:
           - {{ private_subnet_id }}
-DevSettings:
-  Timeouts:
-    HeadNodeBootstrapTimeout: 3600
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
index c5a8872274..141803a666 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/scaling_test_config.yaml
@@ -1,4 +1,4 @@
 MaxMonitoringTimeInMins: 20
-ScalingTargets: [1000, 2000, 3000, 4000]
+ScalingTargets: [100, 100, 100]
 SharedHeadNodeStorageType: 'Efs'
 HeadNodeInstanceType: 'c5.24xlarge'
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml
new file mode 100644
index 0000000000..4b2a96c13a
--- /dev/null
+++ b/tests/integration-tests/tests/performance_tests/test_scaling/test_static_scaling_stress_test/upscale-pcluster.config.yaml
@@ -0,0 +1,27 @@
+Image:
+  Os: {{ os }}
+HeadNode:
+  {% if shared_headnode_storage_type %}
+  SharedStorageType: {{ shared_headnode_storage_type }}
+  {% endif %}
+  InstanceType: {{ head_node_instance_type }}
+  Networking:
+    SubnetId: {{ public_subnet_id }}
+  Ssh:
+    KeyName: {{ key_name }}
+Scheduling:
+  Scheduler: {{ scheduler }}
+  ScalingStrategy: {{ scaling_strategy }}
+  SlurmSettings:
+    ScaledownIdletime: {{ scaledown_idletime }}
+  SlurmQueues:
+    - Name: queue-0
+      ComputeResources:
+        - Name: compute-resource-0
+          Instances:
+            - InstanceType: {{ instance }}
+          MinCount: {{ cluster_size }}
+          MaxCount: {{ cluster_size }}
+      Networking:
+        SubnetIds:
+          - {{ private_subnet_id }}
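
Note for reviewers: get_scaling_metrics, _get_scaling_time and _datetime_to_minute are helpers defined elsewhere in the integration-test suite and are not touched by this patch. Purely as a hypothetical sketch of the measurement the new ScaleUpTime/ScaleDownTime fields record (not the repository's implementation; the function name, signature details and error handling below are assumptions), a _get_scaling_time-style helper could walk the sampled capacity series and report when the target size is first reached:

import datetime


def get_scaling_time_sketch(capacity_time_series, timestamps, scaling_target, start_time):
    """Return (timestamp_target_reached, seconds_from_start) for a monitored scale up or scale down."""
    # Walk the paired samples until the monitored capacity first equals the target size
    for capacity, timestamp in zip(capacity_time_series, timestamps):
        if capacity == scaling_target:
            return timestamp, int((timestamp - start_time).total_seconds())
    # Target never reached within the monitoring window
    raise ValueError(f"Cluster never reached the target size of {scaling_target} nodes")


if __name__ == "__main__":
    start = datetime.datetime(2024, 3, 21, 15, 0, tzinfo=datetime.timezone.utc)
    capacities = [0, 40, 100]
    samples = [start + datetime.timedelta(minutes=m) for m in (1, 3, 6)]
    # Prints the timestamp of the sample that hit 100 nodes and a scaling time of 360 seconds
    print(get_scaling_time_sketch(capacities, samples, 100, start))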