diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml
index 5350b6bb9a..12ccfbad35 100644
--- a/tests/integration-tests/configs/scaling_stress_test.yaml
+++ b/tests/integration-tests/configs/scaling_stress_test.yaml
@@ -2,7 +2,7 @@ test-suites:
   performance_tests:
     test_scaling.py::test_scaling_stress_test:
       dimensions:
-        - regions: [ "use1-az6" ]
-          instances: [ "c5.large" ]
+        - regions: [ "euw1-az3" ]
+          instances: [ "p3.2xlarge" ]
           oss: [ "alinux2" ]
           schedulers: [ "slurm" ]
diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py
index 175e55b230..18681fdf0a 100644
--- a/tests/integration-tests/tests/common/scaling_common.py
+++ b/tests/integration-tests/tests/common/scaling_common.py
@@ -65,30 +65,53 @@ def retry_if_scaling_target_not_reached(
     )
 
 
+def _check_no_node_log_exists_for_ip_address(path, ip_address):
+    """Return True when no file in ``path`` is a saved compute node log for ``ip_address``."""
+    # Log files are named "<ip_address>-<cluster_name>-...", so match on the IP plus its trailing separator;
+    # a bare prefix match would let an existing log for 10.0.0.10 hide the missing log for 10.0.0.1.
+    prefix = f"{ip_address}-"
+    return not any(file_name.startswith(prefix) for file_name in os.listdir(path))
+
+
+def _sort_instances_by_launch_time(describe_instance_response):
+    """Flatten a describe_instances response into a list of instances ordered by ascending LaunchTime."""
+    instances = [
+        instance
+        for reservation in describe_instance_response["Reservations"]
+        for instance in reservation["Instances"]
+    ]
+    instances.sort(key=lambda inst: inst["LaunchTime"])
+    return instances
+
+
 def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster_name, region):
     logging.info("Checking for bootstrap errors...")
     remote_command_executor.run_remote_script(script_file=str(SCALING_COMMON_DATADIR / "get_bootstrap_errors.sh"))
     ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(command=f"cat $HOME/bootstrap_errors.txt").stdout
 
     os.makedirs("bootstrap_errors", exist_ok=True)
+
     client = boto3.client("ec2", region_name=region)
     for ip_address in ip_addresses_with_bootstrap_errors.splitlines():
-        try:
-            instance_id = client.describe_instances(
-                Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
-            )["Reservations"][0]["Instances"][0]["InstanceId"]
-            logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
-            compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True)["Output"]
-            with open(f"bootstrap_errors/{region}-{cluster_name}-{instance_id}-bootstrap-error.txt", "w") as file:
-                file.write(compute_node_log)
-        except IndexError:
-            # The instance id might not be found if a bootstrap error occurred in a previous scale-up using the same
-            # cluster, since it was terminated already but the IP address is still present in the clustermgtd.
-            # Issue warning and continue since we want the compute logs for the errors of the most recent scale-up
-            logging.warning("Instance not found for IP address %s that was found to have a bootstrap error", ip_address)
-        except Exception:
-            logging.error("Error while retrieving the compute node logs for instance with ip address %s", ip_address)
-            raise
+        # Since the same cluster is re-used for multiple scale up tests, the script may find the same bootstrap error
+        # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance.
+        # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet.
+        if _check_no_node_log_exists_for_ip_address("bootstrap_errors", ip_address):
+            try:
+                # Get the latest launched instance with the IP address since the most recent one should have the error
+                instance_id = _sort_instances_by_launch_time(
+                    client.describe_instances(Filters=[{"Name": "private-ip-address", "Values": [ip_address]}])
+                )[-1]["InstanceId"]
+                logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
+                compute_node_log = client.get_console_output(InstanceId=instance_id)["Output"]
+                with open(f"bootstrap_errors/{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt", "w") as file:
+                    file.write(compute_node_log)
+            except IndexError:
+                # If the instance with the IP address can't be found, continue to get other bootstrap errors
+                logging.warning("Couldn't find instance with IP %s but could have a bootstrap error.", ip_address)
+            except Exception:
+                logging.error("Error when retrieving the compute node logs for instance with ip address %s", ip_address)
+                raise
 
 
 def get_scaling_metrics(
diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py
index 4c0710e93c..f10b1eeff9 100644
--- a/tests/integration-tests/tests/performance_tests/test_scaling.py
+++ b/tests/integration-tests/tests/performance_tests/test_scaling.py
@@ -284,11 +284,8 @@ def _scale_up_and_down(
         target_cluster_size=scaling_target,
     )
 
-    # Get the compute node logs for bootstrap errors if compute nodes did not scale up to scaling target within time
-    # if scaling_target not in compute_nodes_time_series_up:
+    # Always collect the compute node logs for any bootstrap errors reported after the scale-up
     get_bootstrap_errors(remote_command_executor, cluster.name, region)
-    # raise Exception(f"Cluster did not scale up to {scaling_target} nodes. "
-    # f"Check the compute node logs for any bootstrap errors in the test artifacts.")
 
     # Extract scale up duration and timestamp from the monitoring metrics collected above
     _, scale_up_time_ec2 = _get_scaling_time(ec2_capacity_time_series_up, timestamps, scaling_target, start_time)