hi

Signed-off-by: Judy Ng <[email protected]>
judysng · Mar 26, 2024 · c19913c · c19913c
1 parent d1bc51f
commit c19913c
Show file tree

Hide file tree

Showing 3 changed files with 31 additions and 17 deletions.
diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh
@@ -18,14 +18,14 @@ touch "bootstrap_errors.txt"
 
 # Find a log message like:
 # ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
-# and get the IP address and instance id
+# and get the IP address
 sudo cat ${CLUSTERMGTD_LOG} | grep -i "no corresponding instance in EC2 for node" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do
-    # Since the same cluster is re-used for multiple scale up tests, this script may find the same bootstrap error
-    # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
-    # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
-    if [ ! -f "${ip_address}.txt" ]; then
-        INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output text)
-        echo "${INSTANCE_ID} ${ip_address}" >> "bootstrap_errors.txt"
-        aws ec2 get-console-output --output text --instance-id "${INSTANCE_ID}" > "${ip_address}.txt"
+    if ! grep -qxF -- "${ip_address}" "bootstrap_errors.txt"; then  # exact literal line match: 10.0.0.1 must not match 10.0.0.10
+      echo "${ip_address}" >> "bootstrap_errors.txt"
     fi
+#    if [ ! -f "${ip_address}.txt" ]; then
+#        INSTANCE_ID=$(aws ec2 describe-instances --filter Name=private-ip-address,Values="${ip_address}" --query 'Reservations[].Instances[].InstanceId' --output text)
+#        echo "${INSTANCE_ID} ${ip_address}" >> "bootstrap_errors.txt"
+#        aws ec2 get-console-output --output text --instance-id "${INSTANCE_ID}" > "${ip_address}.txt"
+#    fi
 done
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
@@ -65,20 +65,34 @@ def retry_if_scaling_target_not_reached(
     )
 
 
-def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir):
+def _check_if_node_log_exists_for_ip_address(path, ip_address):
+    """Return True if `path` already holds a node log whose file name starts with `<ip_address>-`."""
+    # Include the "-" separator so that 10.0.0.1 does not match a log saved for 10.0.0.10.
+    prefix = f"{ip_address}-"
+    return any(file_name.startswith(prefix) for file_name in os.listdir(path))
+
+
+def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, output_dir, region):
+    """Save the EC2 console output of nodes that hit bootstrap errors under <output_dir>/bootstrap_errors."""
     logging.info("Checking for bootstrap errors...")
     remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
-    instance_ids_with_bootstrap_errors = remote_command_executor.run_remote_command(command=f"cat $HOME/bootstrap_errors.txt").stdout
+    ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(command="cat $HOME/bootstrap_errors.txt").stdout
 
     path = os.path.join(output_dir, "bootstrap_errors")
     os.makedirs(path, exist_ok=True)
 
-    for instance_id_ip_address in instance_ids_with_bootstrap_errors.splitlines():
-        instance_id, ip_address = instance_id_ip_address.split(" ")
-        logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
-        compute_node_log = remote_command_executor.run_remote_command(command=f"cat $HOME/{ip_address}.txt").stdout
-        with open(os.path.join(path, f"{cluster_name}-{instance_id}-bootstrap-error.txt"), "w") as file:
-            file.write(compute_node_log)
+    client = boto3.client("ec2", region_name=region)
+    for ip_address in ip_addresses_with_bootstrap_errors.splitlines():
+        # The same cluster is re-used for multiple scale up tests, so an IP address may show up again after it has
+        # been attached to a new instance; only fetch the console output the first time an IP address is seen.
+        if not _check_if_node_log_exists_for_ip_address(path, ip_address):
+            instance_id = client.describe_instances(
+                Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]  # Filters must be a list of dicts
+            )["Reservations"][0]["Instances"][0]["InstanceId"]
+            logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
+            compute_node_log = client.get_console_output(InstanceId=instance_id)["Output"]
+            with open(os.path.join(path, f"{ip_address}-{cluster_name}-{instance_id}-bootstrap-error.txt"), "w") as f:
+                f.write(compute_node_log)
 
 
 def get_scaling_metrics(

diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -285,7 +285,7 @@ def _scale_up_and_down(
     )
 
     # Check for any bootstrap errors and get the compute node logs in the test artifacts
-    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"))
+    get_bootstrap_errors(remote_command_executor, cluster.name, request.config.getoption("output_dir"), region)
 
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)