Skip to content

Commit

Permalink
import
Browse files: browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 25, 2024
1 parent a660146 commit 198e779
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
set -ex

CLUSTERMGTD_LOG="/var/log/parallelcluster/clustermgtd"
touch "bootstrap_errors.txt"

# Find a log message like:
# ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
Expand All @@ -23,8 +24,8 @@ sudo cat ${CLUSTERMGTD_LOG} | grep -i "Node bootstrap error" | awk -F"[()]" '{pr
# multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
# Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
if [ ! -f "${ip_address}.txt" ]; then
INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output)
echo "${INSTANCE_ID} ${ip_address}"
INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output text)
echo "${INSTANCE_ID} ${ip_address}" >> "bootstrap_errors.txt"
aws ec2 get-console-output --output text --instance-id "${INSTANCE_ID}" > "${ip_address}.txt"
fi
done
7 changes: 4 additions & 3 deletions tests/integration-tests/tests/common/scaling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,16 @@ def retry_if_scaling_target_not_reached(


def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir):
instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_script(
script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh")
).stdout
logging.info("Checking for bootstrap errors...")
remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_command(command=f"cat $HOME/bootstrap_errors.txt").stdout

path = os.path.join(output_dir, "bootstrap_errors")
os.makedirs(path, exist_ok=True)

for instance_id_ip_address in instance_ids_with_bootstrap_errors.splitlines():
instance_id, ip_address = instance_id_ip_address.split(" ")
logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
compute_node_log = remote_command_executor.run_remote_command(command=f"cat $HOME/{ip_address}.txt").stdout
with open(os.path.join(path, f"{cluster_name}-{instance_id}-bootstrap-error.txt"), "w") as file:
file.write(compute_node_log)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from utils import disable_protected_mode

from tests.common.assertions import assert_no_msg_in_logs
from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config, get_bootstrap_errors


@pytest.mark.parametrize(
Expand Down Expand Up @@ -284,10 +284,8 @@ def _scale_up_and_down(
target_cluster_size=scaling_target,
)

# Check for bootstrap errors since the cluster was unable to scale up to target within the max monitoring time
if scaling_target not in compute_nodes_time_series_up:
get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
raise Exception(f"Cluster could not scale up to {scaling_target} nodes within the max monitoring time")
# Check for any bootstrap errors and get the compute node logs in the test artifacts
get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))

# Extract scale up duration and timestamp from the monitoring metrics collected above
_, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)
Expand Down

0 comments on commit 198e779

Please sign in to comment.