import

Signed-off-by: Judy Ng <[email protected]>
judysng · Mar 25, 2024 · 2c5d389 · 2c5d389
1 parent a660146
commit 2c5d389
Show file tree

Hide file tree

Showing 3 changed files with 9 additions and 10 deletions.
diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
@@ -23,8 +23,8 @@ sudo cat ${CLUSTERMGTD_LOG} | grep -i "Node bootstrap error" | awk -F"[()]" '{pr
     # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
     # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
     if [ ! -f "${ip_address}.txt" ]; then
-        INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output)
-        echo "${INSTANCE_ID} ${ip_address}"
+        INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output text)
+        echo "${INSTANCE_ID} ${ip_address}" >> "bootstrap_errors.txt"
         aws ec2 get-console-output --output text --instance-id "${INSTANCE_ID}" > "${ip_address}.txt"
     fi
 done
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
@@ -66,15 +66,16 @@ def retry_if_scaling_target_not_reached(
 
 
 def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir):
-    instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_script(
-        script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh")
-    ).stdout
+    logging.info("Checking for bootstrap errors...")
+    remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
+    instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_command(command=f"[[ -f $HOME/bootstrap_errors.txt ]] && cat $HOME/bootstrap_errors.txt").stdout
 
     path = os.path.join(output_dir, "bootstrap_errors")
     os.makedirs(path, exist_ok=True)
 
     for instance_id_ip_address in instance_ids_with_bootstrap_errors.splitlines():
         instance_id, ip_address = instance_id_ip_address.split(" ")
+        logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
         compute_node_log = remote_command_executor.run_remote_command(command=f"cat $HOME/{ip_address}.txt").stdout
         with open(os.path.join(path, f"{cluster_name}-{instance_id}-bootstrap-error.txt"), "w") as file:
             file.write(compute_node_log)

diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -11,7 +11,7 @@
 from utils import disable_protected_mode
 
 from tests.common.assertions import assert_no_msg_in_logs
-from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config
+from tests.common.scaling_common import get_scaling_metrics, validate_and_get_scaling_test_config, get_bootstrap_errors
 
 
 @pytest.mark.parametrize(
@@ -284,10 +284,8 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
-    # Check for bootstrap errors since the cluster was unable to scale up to target within the max monitoring time
-    if scaling_target not in compute_nodes_time_series_up:
-        get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
-        raise Exception(f"Cluster could not scale up to {scaling_target} nodes within the max monitoring time")
+    # Check for any bootstrap errors and get the compute node logs in the test artifacts
+    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
 
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)