Skip to content

Commit

Permalink
[Azure] Optimize autostopping speed for azure (#3519)
Browse files Browse the repository at this point in the history
Optimize autostopping speed for azure
  • Loading branch information
Michaelvll authored May 8, 2024
1 parent 40a0f57 commit d09827b
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 11 deletions.
8 changes: 7 additions & 1 deletion sky/skylet/autostop_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,10 +75,16 @@ def set_autostopping_started() -> None:
configs.set_config(_AUTOSTOP_INDICATOR, str(psutil.boot_time()))


def get_is_autostopping_payload() -> str:
def get_is_autostopping() -> bool:
    """Returns whether the cluster is in the process of autostopping.

    The autostop indicator is written as ``str(psutil.boot_time())`` when
    autostopping starts, so the cluster counts as autostopping only if the
    stored value matches the boot time of the currently running machine.
    """
    stored_boot_time = configs.get_config(_AUTOSTOP_INDICATOR)
    return stored_boot_time == str(psutil.boot_time())


def get_is_autostopping_payload() -> str:
    """Payload for whether the cluster is in the process of autostopping.

    Wraps the boolean from ``get_is_autostopping()`` with
    ``common_utils.encode_payload``.
    """
    return common_utils.encode_payload(get_is_autostopping())


Expand Down
2 changes: 2 additions & 0 deletions sky/skylet/providers/azure/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def _configure_resource_group(config):
create_or_update = get_azure_sdk_function(
client=resource_client.deployments, function_name="create_or_update"
)
# TODO (skypilot): this call takes a long time (> 40 seconds) when stopping
# an Azure VM, and it can be called twice during `ray down`.
outputs = (
create_or_update(
resource_group_name=resource_group,
Expand Down
28 changes: 18 additions & 10 deletions sky/skylet/providers/azure/node_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
bootstrap_azure,
get_azure_sdk_function,
)
from sky.skylet import autostop_lib
from sky.skylet.providers.command_runner import SkyDockerCommandRunner
from sky.provision import docker_utils

Expand Down Expand Up @@ -61,16 +62,23 @@ class AzureNodeProvider(NodeProvider):

def __init__(self, provider_config, cluster_name):
NodeProvider.__init__(self, provider_config, cluster_name)
# TODO(suquark): This is a temporary patch for the resource group.
# By default, the Ray autoscaler assumes the resource group still exists even
# after the whole cluster is destroyed. However, we now delete the resource
# group after tearing down the cluster. To satisfy the autoscaler, we need
# to create/update it here, so the resource group always exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
if not autostop_lib.get_is_autostopping():
# TODO(suquark): This is a temporary patch for the resource group.
# By default, the Ray autoscaler assumes the resource group still
# exists even after the whole cluster is destroyed. However, we now
# delete the resource group after tearing down the cluster. To
# satisfy the autoscaler, we need to create/update it here, so the
# resource group always exists.
#
# We skip re-configuring the resource group when this code is
# running on the remote VM and autostopping is in progress: the
# VM is running, which guarantees that the resource group exists.
from sky.skylet.providers.azure.config import _configure_resource_group

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
subscription_id = provider_config["subscription_id"]
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
# Sky only supports Azure CLI credential for now.
Expand Down

0 comments on commit d09827b

Please sign in to comment.