Skip to content

Commit

Permalink
hi
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 27, 2024
1 parent 66a4346 commit 89629f5
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 10 deletions.
2 changes: 1 addition & 1 deletion tests/integration-tests/configs/scaling_stress_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ test-suites:
performance_tests:
test_scaling.py::test_scaling_stress_test:
dimensions:
- regions: [ "us-east-1" ]
- regions: [ "use1-az6" ]
instances: [ "c5.large" ]
oss: [ "alinux2" ]
schedulers: [ "slurm" ]
24 changes: 16 additions & 8 deletions tests/integration-tests/tests/common/scaling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,24 @@ def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster
ip_addresses_with_bootstrap_errors = remote_command_executor.run_remote_command(command=f"cat $HOME/bootstrap_errors.txt").stdout

os.makedirs("bootstrap_errors", exist_ok=True)

client = boto3.client("ec2", region_name=region)
for ip_address in ip_addresses_with_bootstrap_errors.splitlines():
instance_id = client.describe_instances(
Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
)["Reservations"][0]["Instances"][0]["InstanceId"]
logging.info(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True)["Output"]
with open(f"bootstrap_errors/{region}-{cluster_name}-{instance_id}-bootstrap-error.txt", "w") as file:
file.write(compute_node_log)
try:
instance_id = client.describe_instances(
Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
)["Reservations"][0]["Instances"][0]["InstanceId"]
logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
compute_node_log = client.get_console_output(InstanceId=instance_id, Latest=True)["Output"]
with open(f"bootstrap_errors/{region}-{cluster_name}-{instance_id}-bootstrap-error.txt", "w") as file:
file.write(compute_node_log)
except IndexError:
# The instance ID might not be found if a bootstrap error occurred in a previous scale-up using the same
# cluster: that instance was already terminated, but its IP address is still present in the clustermgtd.
# Issue a warning and continue, since we want the compute logs for the errors of the most recent scale-up.
logging.warning("Instance not found for IP address %s that was found to have a bootstrap error", ip_address)
except Exception:
logging.error("Error while retrieving the compute node logs for instance with ip address %s", ip_address)
raise


def get_scaling_metrics(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ


@pytest.mark.usefixtures("scheduler")
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"])
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"])
def test_scaling_stress_test(
test_datadir,
instance,
Expand Down

0 comments on commit 89629f5

Please sign in to comment.