From 3565343f8770b76162c872e1f96eb5a028e005b0 Mon Sep 17 00:00:00 2001
From: Judy Ng
Date: Tue, 26 Mar 2024 22:31:59 -0400
Subject: [PATCH] try

Signed-off-by: Judy Ng
---
 .../configs/scaling_stress_test.yaml          |  4 +-
 .../tests/common/scaling_common.py            | 52 +++++++++++++------
 .../tests/performance_tests/test_scaling.py   |  4 --
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
index 5350b6bb9a..12ccfbad35 100644
--- a/tests/integration-tests/configs/scaling_stress_test.yaml
+++ b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -2,7 +2,7 @@ test-suites:
   performance_tests:
     test_scaling.py::test_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "euw1-az3" ]
+          instances: [ "p3.2xlarge" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
index 175e55b230..18681fdf0a 100644
--- a/tests/integration-tests/tests/common/scaling_common.py
+++ b/tests/integration-tests/tests/common/scaling_common.py
@@ -65,30 +65,50 @@ def retry_if_scaling_target_not_reached(
     )
 
 
+def _check_no_node_log_exists_for_ip_address(path, ip_address):
+    for file_name in os.listdir(path):
+        if file_name.startswith(ip_address):
+            return False
+    return True
+
+
+def _sort_instances_by_launch_time(describe_instance_response):
+    instances = []
+    for reservation in describe_instance_response["Reservations"]:
+        for instance in reservation["Instances"]:
+            instances.append(instance)
+    instances.sort(key=lambda inst: inst["LaunchTime"])
+    return instances
+
+
 def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, region):
     logging.info("Checking for bootstrap errors...")
     remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
     ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(
         command=f"cat $HOME/bootstrap_errors.txt").stdout
 
     os.makedirs("bootstrap_errors", exist_ok=True)
+    client = boto3.client("ec2", region_name=region)
     for ip_address in ip_addresses_with_bootstrap_errors.splitlines():
-        try:
-            instance_id = client.describe_instances(
-                Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
-            )["Reservations"][0]["Instances"][0]["InstanceId"]
-            logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
-            compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True)["Output"]
-            with open(f"bootstrap_errors/{region}-{cluster_name}-{instance_id}-bootstrap-error.txt", "w") as file:
-                file.write(compute_node_log)
-        except IndexError:
-            # The instance id might not be found if a bootstrap error occurred in a previous scale-up using the same
-            # cluster, since it was terminated already but the IP address is still present in the clustermgtd.
-            # Issue warning and continue since we want the compute logs for the errors of the most recent scale-up
-            logging.warning("Instance not found for IP address %s that was found to have a bootstrap error", ip_address)
-        except Exception:
-            logging.error("Error while retrieving the compute node logs for instance with ip address %s", ip_address)
-            raise
+        # Since the same cluster is re-used for multiple scale-up tests, the script may find the same bootstrap error
+        # multiple times and then fetch the wrong instance logs, since the IP may now belong to a new instance.
+        # Therefore, only write the compute node logs for an IP address if no log file exists for it yet.
+        if _check_no_node_log_exists_for_ip_address("bootstrap_errors", ip_address):
+            try:
+                # Get the most recently launched instance with this IP address, since it should have the error
+                instance_id = _sort_instances_by_launch_time(client.describe_instances(
+                    Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
+                ))[-1]["InstanceId"]
+                logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
+                compute_node_log = client.get_console_output(InstanceId=instance_id)["Output"]
+                with open(f"bootstrap_errors/{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt", "w") as file:
+                    file.write(compute_node_log)
+            except IndexError:
+                # If no instance with this IP address can be found, continue collecting the other bootstrap errors
+                logging.warning("Couldn't find an instance for IP address %s with a bootstrap error", ip_address)
+            except Exception:
+                logging.error("Error while retrieving the compute node logs for instance with ip address %s", ip_address)
+                raise
 
 
 def get_scaling_metrics(
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index 4c0710e93c..f10b1eeff9 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -284,11 +284,7 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
-    # Get the compute node logs for bootstrap errors if compute nodes did not scale up to scaling target within time
-    # if scaling_target not in compute_nodes_time_series_up:
     get_bootstrap_errors(remote_command_executor, cluster.name, region)
-    # raise Exception(f"Cluster did not scale up to {scaling_target} nodes. "
-    #                 f"Check the compute node logs for any bootstrap errors in the test artifacts.")
 
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
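
Illustrative sketch (not part of the patch): the snippet below shows the behavior get_bootstrap_errors relies on from the new _sort_instances_by_launch_time helper, namely that after sorting by LaunchTime the last element is the most recently launched instance that held the re-used private IP address, so [-1]["InstanceId"] targets the instance whose console output should contain the bootstrap error. The describe_instances response used here is made up for the example.

from datetime import datetime, timezone


def _sort_instances_by_launch_time(describe_instance_response):
    # Same logic as the helper added in the patch: flatten all reservations and sort by launch time.
    instances = []
    for reservation in describe_instance_response["Reservations"]:
        for instance in reservation["Instances"]:
            instances.append(instance)
    instances.sort(key=lambda inst: inst["LaunchTime"])
    return instances


# Hypothetical describe_instances response: two instances that held the same private IP at different times.
sample_response = {
    "Reservations": [
        {"Instances": [{"InstanceId": "i-older", "LaunchTime": datetime(2024, 3, 26, 1, 0, tzinfo=timezone.utc)}]},
        {"Instances": [{"InstanceId": "i-newer", "LaunchTime": datetime(2024, 3, 26, 2, 0, tzinfo=timezone.utc)}]},
    ]
}

# The most recently launched instance sorts last, so [-1]["InstanceId"] picks the one that should
# have produced the bootstrap error in the latest scale-up.
assert _sort_instances_by_launch_time(sample_response)[-1]["InstanceId"] == "i-newer"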