Skip to content

Commit

Permalink
Output bootstrap errors in scaling test
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 26, 2024
1 parent a4a0389 commit 099f9c1
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Copyright 2024 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License").
# You may not use this file except in compliance with the License.
# A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file.
# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied.
# See the License for the specific language governing permissions and limitations under the License.

set -ex

CLUSTERMGTD_LOG="/var/log/parallelcluster/clustermgtd"
BOOTSTRAP_ERRORS_FILE="bootstrap_errors.txt"

# Select the clustermgtd log lines containing (case-insensitively)
#   "no corresponding instance in EC2 for node"
# The node is expected to be logged as <node-name>(<ip-address>), e.g.
#   ... Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
# NOTE(review): the example above comes from a "Node bootstrap error" message; confirm that the
# phrase matched by grep is logged in the same name(ip) format.
#
# awk splits each matching line on parentheses, so field 2 is the text inside the first pair.
# Lines without parentheses (empty field 2) are dropped and every address is written only once,
# in order of first appearance.
#
# The result file is overwritten (">"), not appended to, so addresses collected by a previous
# run are not reported again; it is created empty when nothing matches.
sudo grep -i "no corresponding instance in EC2 for node" "${CLUSTERMGTD_LOG}" \
  | awk -F"[()]" '$2 != "" && !seen[$2]++ {print $2}' > "${BOOTSTRAP_ERRORS_FILE}"
19 changes: 19 additions & 0 deletions tests/integration-tests/tests/common/scaling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

import boto3
import yaml
import os
from framework.metrics_publisher import Metric, MetricsPublisher
from pykwalify.core import Core
from remote_command_executor import RemoteCommandExecutor
Expand Down Expand Up @@ -64,6 +65,24 @@ def retry_if_scaling_target_not_reached(
)


def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, region):
    """Save the EC2 console output of the nodes listed by ``get_bootstrap_errors.sh``.

    Runs the ``get_bootstrap_errors.sh`` script through ``remote_command_executor``, reads the
    ``$HOME/bootstrap_errors.txt`` file it produces (one private IP address per line), resolves
    each address to an EC2 instance and writes that instance's console output to
    ``bootstrap_errors/<ip>-<cluster_name>-<instance_id>-bootstrap-error.txt`` in the current
    working directory.

    Addresses that no longer resolve to an instance are logged and skipped, so that a missing
    instance does not hide the scaling failure the caller is about to report.

    :param remote_command_executor: executor used to run the script and read its result file
        (presumably connected to the cluster head node -- confirm against the caller).
    :param cluster_name: cluster name, used only to build the output file names.
    :param region: AWS region used to create the EC2 client.
    """
    logging.info("Checking for bootstrap errors...")
    remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
    ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(
        command="cat $HOME/bootstrap_errors.txt"
    ).stdout

    output_dir = "bootstrap_errors"
    os.makedirs(output_dir, exist_ok=True)

    # Drop blank lines and duplicates (the same node can be logged more than once) while keeping
    # the order in which the addresses were reported.
    stripped_lines = (line.strip() for line in ip_addresses_with_bootstrap_errors.splitlines())
    ip_addresses = list(dict.fromkeys(line for line in stripped_lines if line))

    client = boto3.client("ec2", region_name=region)
    for ip_address in ip_addresses:
        reservations = client.describe_instances(
            Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
        )["Reservations"]
        instances = [instance for reservation in reservations for instance in reservation["Instances"]]
        if not instances:
            # Nothing matches this address any more (e.g. the node was already terminated).
            logging.warning(f"No EC2 instance found with private IP address {ip_address}. Skipping it.")
            continue

        instance_id = instances[0]["InstanceId"]
        logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
        # "Output" may be absent from the response when no console output is available.
        compute_node_log = client.get_console_output(InstanceId=instance_id).get("Output", "")
        log_file_path = os.path.join(
            output_dir, f"{ip_address}-{cluster_name}-{instance_id}-bootstrap-error.txt"
        )
        with open(log_file_path, "w", encoding="utf-8") as file:
            file.write(compute_node_log)


def get_scaling_metrics(
remote_command_executor: RemoteCommandExecutor,
region,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from utils import disable_protected_mode

from tests.common.assertions import assert_no_msg_in_logs
from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config, get_bootstrap_errors


@pytest.mark.parametrize(
Expand Down Expand Up @@ -284,6 +284,12 @@ def _scale_up_and_down(
target_cluster_size=scaling_target,
)

# Get the compute node logs for bootstrap errors if compute nodes did not scale up to scaling target within time
if scaling_target not in compute_nodes_time_series_up:
get_bootstrap_errors(remote_command_executor, cluster.name, region)
raise Exception(f"Cluster did not scale up to {scaling_target} nodes."
f"Check the compute node logs for any bootstrap errors.")

# Extract scale up duration and timestamp from the monitoring metrics collected above
_, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
scaling_target_time, scale_up_time_scheduler = _get_scaling_time(
Expand Down

0 comments on commit 099f9c1

Please sign in to comment.