Skip to content

Commit

Permalink
fix ip-worker mapping for k8s ssh
Browse files (browse the repository at this point in the history)
  • Loading branch information
asaiacai committed Aug 12, 2024
1 parent 278ba18 commit 5e382b9
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 6 deletions.
10 changes: 7 additions & 3 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2323,9 +2323,13 @@ def is_provided_ips_valid(
zip(cluster_internal_ips, cluster_feasible_ips))

# Ensure head node is the first element, then sort based on the
# external IPs for stableness
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
internal_external_ips[1:], key=lambda x: x[1])
# external IPs for stability. Skip for k8s nodes since pod
# worker IDs are already mapped.
if cluster_info is not None and cluster_info.provider_name == 'kubernetes':
stable_internal_external_ips = internal_external_ips
else:
stable_internal_external_ips = [internal_external_ips[0]] + sorted(
internal_external_ips[1:], key=lambda x: x[1])
self.stable_internal_external_ips = stable_internal_external_ips

@functools.lru_cache()
Expand Down
13 changes: 10 additions & 3 deletions tests/test_smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -3146,17 +3146,24 @@ def test_kubernetes_ssh_hostname():
test = Test(
'test-kubernetes-ssh-hostname',
[
f'sky launch -c {name} -y --num-nodes 3',
f'sky launch -c {name} -y --num-nodes 10 --cpus 1+',
f'ssh {name} -t "hostname" | grep head',
f'ssh {name}-worker1 -t "hostname" | grep worker1',
f'ssh {name}-worker2 -t "hostname" | grep worker2',
f'sky down -y {name}'
] * 10,
f'ssh {name}-worker3 -t "hostname" | grep worker3',
f'ssh {name}-worker4 -t "hostname" | grep worker4',
f'ssh {name}-worker5 -t "hostname" | grep worker5',
f'ssh {name}-worker6 -t "hostname" | grep worker6',
f'ssh {name}-worker7 -t "hostname" | grep worker7',
f'ssh {name}-worker8 -t "hostname" | grep worker8',
f'ssh {name}-worker9 -t "hostname" | grep worker9',
],
f'sky down -y {name}',
timeout=10 * 60,
)
run_one_test(test)


@pytest.mark.azure
def test_azure_start_stop_two_nodes():
name = _get_cluster_name()
Expand Down

0 comments on commit 5e382b9

Please sign in to comment.