diff --git a/sky/skylet/autostop_lib.py b/sky/skylet/autostop_lib.py index 687c04f5211..130e39fb425 100644 --- a/sky/skylet/autostop_lib.py +++ b/sky/skylet/autostop_lib.py @@ -75,10 +75,16 @@ def set_autostopping_started() -> None: configs.set_config(_AUTOSTOP_INDICATOR, str(psutil.boot_time())) -def get_is_autostopping_payload() -> str: +def get_is_autostopping() -> bool: """Returns whether the cluster is in the process of autostopping.""" result = configs.get_config(_AUTOSTOP_INDICATOR) is_autostopping = (result == str(psutil.boot_time())) + return is_autostopping + + +def get_is_autostopping_payload() -> str: + """Payload for whether the cluster is in the process of autostopping.""" + is_autostopping = get_is_autostopping() return common_utils.encode_payload(is_autostopping) diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index 0c1827a1141..a19273761ba 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -120,6 +120,8 @@ def _configure_resource_group(config): create_or_update = get_azure_sdk_function( client=resource_client.deployments, function_name="create_or_update" ) + # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an + # azure VM, and this can be called twice during ray down. 
outputs = ( create_or_update( resource_group_name=resource_group, diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 4b315f23589..068930eb390 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -15,6 +15,7 @@ bootstrap_azure, get_azure_sdk_function, ) +from sky.skylet import autostop_lib from sky.skylet.providers.command_runner import SkyDockerCommandRunner from sky.provision import docker_utils @@ -61,16 +62,23 @@ class AzureNodeProvider(NodeProvider): def __init__(self, provider_config, cluster_name): NodeProvider.__init__(self, provider_config, cluster_name) - # TODO(suquark): This is a temporary patch for resource group. - # By default, Ray autoscaler assumes the resource group is still here even - # after the whole cluster is destroyed. However, now we deletes the resource - # group after tearing down the cluster. To comfort the autoscaler, we need - # to create/update it here, so the resource group always exists. - from sky.skylet.providers.azure.config import _configure_resource_group - - _configure_resource_group( - {"cluster_name": cluster_name, "provider": provider_config} - ) + if not autostop_lib.get_is_autostopping(): + # TODO(suquark): This is a temporary patch for resource group. + # By default, Ray autoscaler assumes the resource group is still + # here even after the whole cluster is destroyed. However, now we + # delete the resource group after tearing down the cluster. To + # comfort the autoscaler, we need to create/update it here, so the + # resource group always exists. + # + # We should not re-configure the resource group again, when it is + # running on the remote VM and the autostopping is in progress, + # because the VM is running which guarantees the resource group + # exists. 
+ from sky.skylet.providers.azure.config import _configure_resource_group + + _configure_resource_group( + {"cluster_name": cluster_name, "provider": provider_config} + ) subscription_id = provider_config["subscription_id"] self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) # Sky only supports Azure CLI credential for now.