Skip to content

Commit

Permalink
[Core] Skip worker ray start for multinode (skypilot-org#4390)
Browse files Browse the repository at this point in the history
* Optimize ray start

* add comments

* update logging
  • Loading branch information
Michaelvll authored Nov 22, 2024
1 parent aca090e commit 204d979
Showing 1 changed file with 14 additions and 6 deletions.
20 changes: 14 additions & 6 deletions sky/provision/provisioner.py
Original file line number Diff line number Diff line change
Expand Up @@ -537,14 +537,16 @@ def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
stream_logs=False,
require_outputs=True)
if returncode:
logger.debug('Ray cluster on head is not ready.')
else:
logger.debug('Ray cluster on head is ready.')
if not returncode:
ray_port = common_utils.decode_payload(stdout)['ray_port']
logger.debug(f'Ray cluster on head is up with port {ray_port}.')

head_ray_needs_restart = bool(returncode)
ray_cluster_healthy = is_ray_cluster_healthy(
stdout, cluster_info.num_instances)
# This is a best-effort check of whether the ray cluster has the
# expected number of nodes connected.
ray_cluster_healthy = (not head_ray_needs_restart and
is_ray_cluster_healthy(
stdout, cluster_info.num_instances))
return ray_port, ray_cluster_healthy, head_ray_needs_restart

status.update(
Expand Down Expand Up @@ -585,6 +587,9 @@ def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
custom_resource=custom_resource,
cluster_info=cluster_info,
ssh_credentials=ssh_credentials)
else:
logger.debug('Ray cluster on head is ready. Skip starting ray '
'cluster on head node.')

# NOTE: We have to check all worker nodes to make sure they are all
# healthy, otherwise we can only start Ray on newly started worker
Expand All @@ -610,6 +615,9 @@ def check_ray_port_and_cluster_healthy() -> Tuple[int, bool, bool]:
ray_port=ray_port,
cluster_info=cluster_info,
ssh_credentials=ssh_credentials)
elif ray_cluster_healthy:
logger.debug('Ray cluster is ready. Skip starting ray cluster on '
'worker nodes.')

instance_setup.start_skylet_on_head_node(cluster_name.name_on_cloud,
cluster_info, ssh_credentials)
Expand Down

0 comments on commit 204d979

Please sign in to comment.