diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
index 88efb38a82..0364d89507 100644
--- a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
+++ b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
@@ -23,8 +23,8 @@ sudo cat ${CLUSTERMGTD_LOG} | grep -i "Node bootstrap error" | awk -F"[()]" '{pr
   # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
   # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
   if [ ! -f "${ip_address}.txt" ]; then
-    INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output)
-    echo "${INSTANCE_ID} ${ip_address}"
+    INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output text)
+    echo "${INSTANCE_ID} ${ip_address}" >> "bootstrap_errors.txt"
     aws ec2 get-console-output --output text --instance-id "${INSTANCE_ID}" > "${ip_address}.txt"
   fi
 done
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
index 72fba1cfd7..cef2b24cdd 100644
--- a/tests/integration-tests/tests/common/scaling_common.py
+++ b/tests/integration-tests/tests/common/scaling_common.py
@@ -66,15 +66,27 @@ def retry_if_scaling_target_not_reached(
 
 
 def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir):
-    instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_script(
-        script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh")
-    ).stdout
+    """
+    Save the console output of compute nodes that hit bootstrap errors into the test artifacts.
+
+    Runs get_bootstrap_errors.sh on the remote host, then reads the "<instance-id> <ip-address>" lines
+    from $HOME/bootstrap_errors.txt and copies each $HOME/<ip-address>.txt into
+    <output_dir>/bootstrap_errors/<cluster_name>-<instance-id>-bootstrap-error.txt.
+    """
+    logging.info("Checking for bootstrap errors...")
+    remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
+    # bootstrap_errors.txt may be absent (presumably when no node failed to bootstrap). Use if/fi rather
+    # than `[[ -f ... ]] && cat ...` so the remote command exits 0 with empty output in that case.
+    instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_command(
+        command="if [[ -f $HOME/bootstrap_errors.txt ]]; then cat $HOME/bootstrap_errors.txt; fi"
+    ).stdout
 
     path = os.path.join(output_dir, "bootstrap_errors")
     os.makedirs(path, exist_ok=True)
 
     for instance_id_ip_address in instance_ids_with_bootstrap_errors.splitlines():
         instance_id, ip_address = instance_id_ip_address.split(" ")
+        logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
         compute_node_log = remote_command_executor.run_remote_command(command=f"cat $HOME/{ip_address}.txt").stdout
         with open(os.path.join(path, f"{cluster_name}-{instance_id}-bootstrap-error.txt"), "w") as file:
             file.write(compute_node_log)
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index 4cf2c79fb1..6070f396d4 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -11,7 +11,7 @@ from utils import disable_protected_mode
 
 
 from tests.common.assertions import assert_no_msg_in_logs
-from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
+from tests.common.scaling_common import get_bootstrap_errors, get_scaling_metrics, validate_and_get_scaling_test_config
 
 
 @pytest.mark.parametrize(
@@ -284,10 +284,8 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
-    # Check for bootstrap errors since the cluster was unable to scale up to target within the max monitoring time
-    if scaling_target not in compute_nodes_time_series_up:
-        get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
-        raise Exception(f"Cluster could not scale up to {scaling_target} nodes within the max monitoring time")
+    # Check for bootstrap errors and save the logs of the affected compute nodes in the test artifacts
+    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
 
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)