From a2cadb16b9e8887a378755401c8410c64410e81c Mon Sep 17 00:00:00 2001
From: Judy Ng
Date: Sun, 24 Mar 2024 21:33:36 -0400
Subject: [PATCH] Output bootstrap errors in scaling test

Signed-off-by: Judy Ng
---
 .../common/scaling/get_bootstrap_errors.sh    | 31 ++++++++++
 .../tests/common/scaling_common.py            | 62 +++++++++++++++++++
 .../tests/performance_tests/test_scaling.py   |  7 ++-
 3 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh

diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
new file mode 100644
index 0000000000..ccde69b393
--- /dev/null
+++ b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License").
+# You may not use this file except in compliance with the License.
+# A copy of the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "LICENSE.txt" file accompanying this file.
+# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+set -ex
+
+CLUSTERMGTD_LOG="/var/log/parallelcluster/clustermgtd"
+# Absolute path, so the file is always the one that the test reads back ($HOME/bootstrap_errors.txt),
+# whatever the working directory the script is launched from.
+BOOTSTRAP_ERRORS_FILE="${HOME}/bootstrap_errors.txt"
+touch "${BOOTSTRAP_ERRORS_FILE}"
+
+# Find a log message like:
+# ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
+# and get the IP address
+sudo cat "${CLUSTERMGTD_LOG}" | grep -i "Node bootstrap error" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do
+  # Skip lines without a parenthesized IP, and compare whole lines as fixed strings (-x -F) so that
+  # 10.0.0.1 is not considered already recorded just because 10.0.0.12 is in the file.
+  if [[ -n "${ip_address}" ]] && ! grep -qxF -- "${ip_address}" "${BOOTSTRAP_ERRORS_FILE}"; then
+    echo "${ip_address}" >> "${BOOTSTRAP_ERRORS_FILE}"
+  fi
+done
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
index 234f636c2a..c1bb6ce5ec 100644
--- a/tests/integration-tests/tests/common/scaling_common.py
+++ b/tests/integration-tests/tests/common/scaling_common.py
@@ -16,6 +16,7 @@
 
 import boto3
 import yaml
+import os
 from framework.metrics_publisher import Metric, MetricsPublisher
 from pykwalify.core import Core
 from remote_command_executor import RemoteCommandExecutor
@@ -64,6 +65,67 @@ def retry_if_scaling_target_not_reached(
     )
 
 
+def _check_no_node_log_exists_for_ip_address(path, ip_address):
+    """Return True if no log file for the given IP address has been written to path yet."""
+    # Log files are named "<ip_address>-<cluster_name>-...", so match the IP together with its separator:
+    # a bare prefix match would mistake the log of 10.0.0.12 for the log of 10.0.0.1.
+    return not any(file_name.startswith(f"{ip_address}-") for file_name in os.listdir(path))
+
+
+def _sort_instances_by_launch_time(describe_instance_response):
+    """Return all the instances of a describe_instances response, sorted from oldest to newest launch time."""
+    instances = [
+        instance
+        for reservation in describe_instance_response["Reservations"]
+        for instance in reservation["Instances"]
+    ]
+    return sorted(instances, key=lambda inst: inst["LaunchTime"])
+
+
+def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir, region):
+    """
+    Write the console output of the compute nodes that had bootstrap errors to the test output directory.
+
+    The IP addresses of the failing nodes are collected on the head node by get_bootstrap_errors.sh.
+    One file per IP address is written under <output_dir>/bootstrap_errors.
+    """
+    logging.info("Checking for bootstrap errors...")
+    remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
+    ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(
+        command="cat $HOME/bootstrap_errors.txt"
+    ).stdout
+
+    path = os.path.join(output_dir, "bootstrap_errors")
+    os.makedirs(path, exist_ok=True)
+
+    client = boto3.client("ec2", region_name=region)
+    for ip_address in ip_addresses_with_bootstrap_errors.split():
+        # Since the same cluster is re-used for multiple scale up tests, the script may find the same bootstrap error
+        # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
+        # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
+        if not _check_no_node_log_exists_for_ip_address(path, ip_address):
+            continue
+        try:
+            logging.warning("Compute node with IP %s had bootstrap errors. Getting instance id...", ip_address)
+            # Get the latest launched instance with the IP address since the most recent one should have the error
+            describe_instances_response = client.describe_instances(
+                Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
+            )
+            instance_id = _sort_instances_by_launch_time(describe_instances_response)[-1]["InstanceId"]
+            logging.warning("Instance %s had bootstrap errors. Check the test outputs for details.", instance_id)
+            # Tolerate a response without "Output" (no console output available): write an empty log
+            # rather than failing the whole test with a KeyError.
+            compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True).get("Output", "")
+            log_file_name = f"{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt"
+            with open(os.path.join(path, log_file_name), "w", encoding="utf-8") as f:
+                f.write(compute_node_log)
+        except IndexError:
+            # If the instance with the IP address can't be found, continue to get any other bootstrap errors
+            logging.warning("Couldn't find instance with IP %s but could have a bootstrap error.", ip_address)
+        except Exception:
+            logging.error("Error when retrieving the compute node logs for instance with ip address %s", ip_address)
+            raise
+
+
 def get_scaling_metrics(
     remote_command_executor: RemoteCommandExecutor,
     region,
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index a12a0c8e49..b436d4f278 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -11,7 +11,7 @@
 from utils import disable_protected_mode
 
 from tests.common.assertions import assert_no_msg_in_logs
-from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
+from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config
 
 
 @pytest.mark.parametrize(
@@ -78,7 +78,8 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ
         return scaling_target_time, int((scaling_target_time - start_time).total_seconds())
     except ValueError as e:
         logging.error("Cluster did not scale up to %d nodes", scaling_target)
-        raise Exception("Cluster could not scale up to target nodes within the max monitoring time") from e
+        raise Exception(f"Cluster could not scale up to {scaling_target} nodes within the max monitoring time. "
+                        "Check the test outputs for any bootstrap failures.") from e
 
 
 @pytest.mark.usefixtures("scheduler")
@@ -284,6 +285,8 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
+    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"), region)
+
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
     scaling_target_time, scale_up_time_scheduler = _get_scaling_time(