Skip to content

Commit

Permalink
ah
Browse files Browse the repository at this point in the history
Signed-off-by: Judy Ng <[email protected]>
  • Loading branch information
judysng committed Mar 27, 2024
1 parent 933a16d commit 354eca5
Show file tree
Hide file tree
Showing 4 changed files with 6 additions and 12 deletions.
10 changes: 2 additions & 8 deletions tests/integration-tests/configs/scaling_stress_test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,7 @@ test-suites:
performance_tests:
test_scaling.py::test_scaling_stress_test:
dimensions:
- regions: [ "us-east-1" ]
instances: [ "c5.large" ]
oss: [ "alinux2" ]
schedulers: [ "slurm" ]
test_scaling.py::test_static_scaling_stress_test:
dimensions:
- regions: [ "us-east-1" ]
instances: [ "c5.large" ]
- regions: [ "euw1-az2" ]
instances: [ "p3.2xlarge" ]
oss: [ "alinux2" ]
schedulers: [ "slurm" ]
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ touch "bootstrap_errors.txt"
# Find a log message like:
# ... WARNING - Node bootstrap error: Node queue-0-dy-compute-resource-0-1690(192.168.90.197) ...
# and get the IP address
sudo cat ${CLUSTERMGTD_LOG} | grep -i "Node bootstrap error" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do
sudo cat ${CLUSTERMGTD_LOG} | grep -i "no corresponding instance in EC2 for node" | awk -F"[()]" '{print $2}' | while read -r ip_address ; do
if ! grep -q "${ip_address}" "bootstrap_errors.txt"; then
echo "${ip_address}" >> "bootstrap_errors.txt"
fi
Expand Down
4 changes: 2 additions & 2 deletions tests/integration-tests/tests/common/scaling_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,13 @@ def get_bootstrap_errors(remote_command_executor: RemoteCommandExecutor, cluster
# The same cluster is re-used for multiple scale-up tests, so the script may report the same bootstrap error
# more than once. By then the IP address may be attached to a new instance, and we would fetch the wrong logs.
# Therefore, only write the compute node logs for an IP address if no log file exists for it yet.
if _check_no_node_log_exists_for_ip_address("bootstrap_errors", ip_address):
if _check_no_node_log_exists_for_ip_address(path, ip_address):
try:
logging.warning(f"Compute node with IP address {ip_address} had bootstrap errors. Getting instance id...")
# Get the latest launched instance with the IP address since the most recent one should have the error
instance_id = _sort_instances_by_launch_time(client.describe_instances(
Filters=[{"Name": "private-ip-address", "Values": [ip_address]}]
))[-1]
))[-1]["InstanceId"]
logging.warning(f"Instance {instance_id} had bootstrap errors. Check the logs for details.")
compute_node_log = client.get_console_output(InstanceId=instance_id)["Output"]
with open(os.path.join(path, f"{ip_address}-{cluster_name}-{instance_id}-{region}-log.txt", "w")) as f:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ def _get_scaling_time(capacity_time_series: list, timestamps: list, scaling_targ


@pytest.mark.usefixtures("scheduler")
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing", "best-effort"])
@pytest.mark.parametrize("scaling_strategy", ["all-or-nothing"])
def test_scaling_stress_test(
test_datadir,
instance,
Expand Down

0 comments on commit 354eca5

Please sign in to comment.