diff --git a/tests/integration-tests/configs/scaling_stress_test.yaml b/tests/integration-tests/configs/scaling_stress_test.yaml index 546dd2a84d..9edbaed33d 100644 --- a/tests/integration-tests/configs/scaling_stress_test.yaml +++ b/tests/integration-tests/configs/scaling_stress_test.yaml @@ -2,13 +2,7 @@ test-suites: performance_tests: test_scaling.py::test_scaling_stress_test: dimensions: - - regions: [ "us-east-1" ] - instances: [ "c5.large" ] - oss: [ "alinux2" ] - schedulers: [ "slurm" ] - test_scaling.py::test_static_scaling_stress_test: - dimensions: - - regions: [ "us-east-1" ] - instances: [ "c5.large" ] + - regions: [ "euw1-az2" ] + instances: [ "p3.2xlarge" ] oss: [ "alinux2" ] schedulers: [ "slurm" ] diff --git a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh index ccde69b393..212b31e485 100644 --- a/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh +++ b/tests/integration-tests/tests/common/scaling/get_bootstrap_errors.sh @@ -19,7 +19,7 @@ touch "bootstrap_errors.txt" # Find a log message like: # ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ... # and get the IP address -sudo cat ${CLUSTERMGTD_LOG} | grep -i "Node bootstrap error" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do +sudo cat ${CLUSTERMGTD_LOG} | grep -i "no corresponding instance in EC2 for node" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do if ! grep -q "${ip_address}" "bootstrap_errors.txt"; then echo "${ip_address}" >> "bootstrap_errors.txt" fi diff --git a/tests/integration-tests/tests/common/scaling_common.py b/tests/integration-tests/tests/common/scaling_common.py index e25dbd9b34..9e8120c323 100644 --- a/tests/integration-tests/tests/common/scaling_common.py +++ b/tests/integration-tests/tests/common/scaling_common.py @@ -94,13 +94,13 @@ def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster # Since the same cluster is re-used for multiple scale up tests, the script may find the same bootstrap error # multiple times and then get the wrong instance logs since the IP address would be attached to a new instance. # Therefore, only write the compute node logs for the IP address if the file doesn't exist yet. - if _check_no_node_log_exists_for_ip_address("bootstrap_errors", ip_address): + if _check_no_node_log_exists_for_ip_address(path, ip_address): try: logging.warning(f"Compute node with IP address {ip_address} had bootstrap errors. Getting instance id...") # Get the latest launched instance with the IP address since the most recent one should have the error instance_id = _sort_instances_by_launch_time(client.describe_instances( Filters=[{"Name": "private-ip-address", "Values": [ip_address]}] - ))[-1] + ))[-1]["InstanceId"] logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.") compute_node_log = client.get_console_output(InstanceId=instance_id)["Output"] with open(os.path.join(path, f"{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt", "w")) as f: diff --git a/tests/integration-tests/tests/performance_tests/test_scaling.py b/tests/integration-tests/tests/performance_tests/test_scaling.py index fcc2767937..277f4a0ede 100644 --- a/tests/integration-tests/tests/performance_tests/test_scaling.py +++ b/tests/integration-tests/tests/performance_tests/test_scaling.py @@ -82,7 +82,7 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ @pytest.mark.usefixtures("scheduler") -@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"]) +@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"]) def test_scaling_stress_test( test_datadir, instance,