Output bootstrap errors in scaling test

Signed-off-by: Judy Ng <[email protected]>
judysng · Mar 27, 2024 · a2cadb1 · a2cadb1
1 parent a4a0389
commit a2cadb1
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 2 deletions.
diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+# http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+set -ex
+
+CLUSTERMGTD_LOG="/var/log/parallelcluster/clustermgtd"
+touch "bootstrap_errors.txt"
+
+# Find log messages like:
+# ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
+# and record each IP once. -x/-F compare whole lines literally, so 10.0.0.1 is not mistaken for 10.0.0.10.
+sudo grep -i "Node bootstrap error" "${CLUSTERMGTD_LOG}" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do
+  if [[ -n "${ip_address}" ]] && ! grep -qxF -- "${ip_address}" "bootstrap_errors.txt"; then
+    echo "${ip_address}" >> "bootstrap_errors.txt"
+  fi
+done
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
@@ -16,6 +16,7 @@
 
 import boto3
 import yaml
+import os
 from framework.metrics_publisher import Metric, MetricsPublisher
 from pykwalify.core import Core
 from remote_command_executor import RemoteCommandExecutor
@@ -64,6 +65,54 @@ def retry_if_scaling_target_not_reached(
     )
 
 
+def _check_no_node_log_exists_for_ip_address(path, ip_address):
+    """
+    Return True when no compute node log for the given IP address has been saved under path.
+
+    Saved logs are named "<ip_address>-<cluster_name>-<instance_id>-<region>-log.txt", so the lookup includes the
+    "-" separator that follows the IP. Matching on the bare IP would make 10.0.0.1 look already collected as soon
+    as a log for 10.0.0.10 exists.
+
+    :param path: directory holding the saved compute node logs
+    :param ip_address: private IP address of the compute node
+    :return: False if a file for exactly this IP address exists in path, True otherwise
+    """
+    prefix = f"{ip_address}-"
+    return not any(file_name.startswith(prefix) for file_name in os.listdir(path))
+
+
+def _sort_instances_by_launch_time(describe_instance_response):
+    """Return every instance in the describe-instances response, sorted by ascending LaunchTime."""
+    return sorted(
+        (
+            instance
+            for reservation in describe_instance_response["Reservations"]
+            for instance in reservation["Instances"]
+        ),
+        key=lambda entry: entry["LaunchTime"],
+    )
+
+
+def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir, region):
+    """
+    Save the EC2 console output of compute nodes that were reported with bootstrap errors.
+
+    Runs get_bootstrap_errors.sh through remote_command_executor to collect the affected IP addresses, then writes
+    one "<ip>-<cluster_name>-<instance_id>-<region>-log.txt" file per IP address under <output_dir>/bootstrap_errors.
+
+    :param remote_command_executor: executor used to run the script and read back bootstrap_errors.txt
+    :param cluster_name: cluster name, used in the log file name
+    :param output_dir: directory under which the "bootstrap_errors" folder is created
+    :param region: region of the EC2 client, also used in the log file name
+    :raises Exception: any error hit while retrieving or writing the console output is logged and re-raised
+    """
+    logging.info("Checking for bootstrap errors...")
+    remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
+    ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(
+        command="cat $HOME/bootstrap_errors.txt"
+    ).stdout
+
+    path = os.path.join(output_dir, "bootstrap_errors")
+    os.makedirs(path, exist_ok=True)
+
+    client = boto3.client("ec2", region_name=region)
+    for ip_address in ip_addresses_with_bootstrap_errors.splitlines():
+        ip_address = ip_address.strip()
+        if not ip_address:
+            # A blank line carries no IP address to look up
+            continue
+        # Since the same cluster is re-used for multiple scale up tests, the script may find the same bootstrap error
+        # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
+        # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
+        if not _check_no_node_log_exists_for_ip_address(path, ip_address):
+            continue
+        try:
+            logging.warning("Compute node with IP %s had bootstrap errors. Getting instance id...", ip_address)
+            instances = _sort_instances_by_launch_time(
+                client.describe_instances(Filters=[{"Name": "private-ip-address", "Values": [ip_address]}])
+            )
+            if not instances:
+                # If the instance with the IP address can't be found, continue to get any other bootstrap errors.
+                # Checked explicitly so that an unrelated IndexError is not misreported as a missing instance.
+                logging.warning("Couldn't find instance with IP %s but could have a bootstrap error.", ip_address)
+                continue
+            # Get the latest launched instance with the IP address since the most recent one should have the error
+            instance_id = instances[-1]["InstanceId"]
+            logging.warning("Instance %s had bootstrap errors. Check the test outputs for details.", instance_id)
+            # NOTE(review): "Output" is assumed to be absent when no console output is available yet - confirm
+            compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True).get("Output", "")
+            log_file = os.path.join(path, f"{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt")
+            # Explicit encoding so the write does not depend on the locale of the machine running the tests
+            with open(log_file, "w", encoding="utf-8") as f:
+                f.write(compute_node_log)
+        except Exception:
+            logging.error("Error when retrieving the compute node logs for instance with ip address %s", ip_address)
+            raise
+
+
 def get_scaling_metrics(
     remote_command_executor: RemoteCommandExecutor,
     region,

diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -11,7 +11,7 @@
 from utils import disable_protected_mode
 
 from tests.common.assertions import assert_no_msg_in_logs
-from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
+from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config, get_bootstrap_errors
 
 
 @pytest.mark.parametrize(
@@ -78,7 +78,8 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ
         return scaling_target_time, int((scaling_target_time - start_time).total_seconds())
     except ValueError as e:
         logging.error("Cluster did not scale up to %d nodes", scaling_target)
-        raise Exception("Cluster could not scale up to target nodes within the max monitoring time") from e
+        raise Exception(f"Cluster could not scale up to {scaling_target} nodes within the max monitoring time. "
+                        "Check the test outputs for any bootstrap failures.") from e
 
 
 @pytest.mark.usefixtures("scheduler")
@@ -284,6 +285,8 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
+    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"), region)
+
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
     scaling_target_time, scale_up_time_scheduler = _get_scaling_time(