From 4821f70b3f4998821dd68c2afcdc7ff61b54ec46 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 18:48:22 -0700 Subject: [PATCH] [Azure] Avoid azure reconfig everytime, speed up launch by up to 5.8x (#3697) * Avoid azure reconfig everytime * Add debug message * format * Fix error handling * format * skip deployment recreation when deployment exist * Add retry for subscription ID * fix logging * format * comment --- sky/adaptors/azure.py | 23 ++++++++++-- sky/skylet/providers/azure/config.py | 40 +++++++++++++++------ sky/skylet/providers/azure/node_provider.py | 37 +++++++++---------- sky/utils/common_utils.py | 2 +- 4 files changed, 68 insertions(+), 34 deletions(-) diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py index 44618a8f64f..6bd57bc6bec 100644 --- a/sky/adaptors/azure.py +++ b/sky/adaptors/azure.py @@ -3,8 +3,10 @@ # pylint: disable=import-outside-toplevel import functools import threading +import time from sky.adaptors import common +from sky.utils import common_utils azure = common.LazyImport( 'azure', @@ -13,13 +15,30 @@ _LAZY_MODULES = (azure,) _session_creation_lock = threading.RLock() +_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5 @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_subscription_id() -> str: """Get the default subscription id.""" from azure.common import credentials - return credentials.get_cli_profile().get_subscription_id() + retry = 0 + backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4) + while True: + try: + return credentials.get_cli_profile().get_subscription_id() + except Exception as e: + if ('Please run \'az login\' to setup account.' in str(e) and + retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID): + # When there are multiple processes trying to get the + # subscription id, it may fail with the above error message. + # Retry will fix the issue. + retry += 1 + + time.sleep(backoff.current_backoff()) + continue + raise @common.load_lazy_modules(modules=_LAZY_MODULES) @@ -36,8 +55,8 @@ def exceptions(): return azure_exceptions -@functools.lru_cache() @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_client(name: str, subscription_id: str): # Sky only supports Azure CLI credential for now. # Increase the timeout to fix the Azure get-access-token timeout issue. diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index a19273761ba..35008ef13d7 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -12,6 +12,7 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.adaptors import azure from sky.utils import common_utils UNIQUE_ID_LEN = 4 @@ -120,17 +121,36 @@ def _configure_resource_group(config): create_or_update = get_azure_sdk_function( client=resource_client.deployments, function_name="create_or_update" ) - # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an - # azure VM, and this can be called twice during ray down. - outputs = ( - create_or_update( - resource_group_name=resource_group, - deployment_name="ray-config", - parameters=parameters, - ) - .result() - .properties.outputs + # Skip creating or updating the deployment if the deployment already exists + # and the cluster name is the same. + get_deployment = get_azure_sdk_function( + client=resource_client.deployments, function_name="get" ) + deployment_exists = False + try: + deployment = get_deployment( + resource_group_name=resource_group, deployment_name="ray-config" + ) + logger.info("Deployment already exists. Skipping deployment creation.") + + outputs = deployment.properties.outputs + if outputs is not None: + deployment_exists = True + except azure.exceptions().ResourceNotFoundError: + deployment_exists = False + + if not deployment_exists: + # This takes a long time (> 40 seconds), we should be careful calling + # this function. + outputs = ( + create_or_update( + resource_group_name=resource_group, + deployment_name="ray-config", + parameters=parameters, + ) + .result() + .properties.outputs + ) # We should wait for the NSG to be created before opening any ports # to avoid overriding the newly-added NSG rules. diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 068930eb390..b4a1c656688 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -11,11 +11,11 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.adaptors import azure from sky.skylet.providers.azure.config import ( bootstrap_azure, get_azure_sdk_function, ) -from sky.skylet import autostop_lib from sky.skylet.providers.command_runner import SkyDockerCommandRunner from sky.provision import docker_utils @@ -62,23 +62,7 @@ class AzureNodeProvider(NodeProvider): def __init__(self, provider_config, cluster_name): NodeProvider.__init__(self, provider_config, cluster_name) - if not autostop_lib.get_is_autostopping(): - # TODO(suquark): This is a temporary patch for resource group. - # By default, Ray autoscaler assumes the resource group is still - # here even after the whole cluster is destroyed. However, now we - # deletes the resource group after tearing down the cluster. To - # comfort the autoscaler, we need to create/update it here, so the - # resource group always exists. - # - # We should not re-configure the resource group again, when it is - # running on the remote VM and the autostopping is in progress, - # because the VM is running which guarantees the resource group - # exists. - from sky.skylet.providers.azure.config import _configure_resource_group - - _configure_resource_group( - {"cluster_name": cluster_name, "provider": provider_config} - ) + subscription_id = provider_config["subscription_id"] self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) # Sky only supports Azure CLI credential for now. @@ -106,9 +90,20 @@ def match_tags(vm): return False return True - vms = self.compute_client.virtual_machines.list( - resource_group_name=self.provider_config["resource_group"] - ) + try: + vms = list( + self.compute_client.virtual_machines.list( + resource_group_name=self.provider_config["resource_group"] + ) + ) + except azure.exceptions().ResourceNotFoundError as e: + if "Code: ResourceGroupNotFound" in e.exc_msg: + logger.debug( + "Resource group not found. VMs should have been terminated." + ) + vms = [] + else: + raise nodes = [self._extract_metadata(vm) for vm in filter(match_tags, vms)] self.cached_nodes = {node["name"]: node for node in nodes} diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 103c834000c..a9227fb4c20 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -233,7 +233,7 @@ class Backoff: MULTIPLIER = 1.6 JITTER = 0.4 - def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5): + def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5): self._initial = True self._backoff = 0.0 self._initial_backoff = initial_backoff