From c3c73dfe756a79823f2bea42dd3026fefd536696 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 27 Jun 2024 07:26:39 +0000 Subject: [PATCH 01/10] Avoid azure reconfig everytime --- sky/skylet/providers/azure/node_provider.py | 32 +++++++-------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 068930eb390..4783dffa2c0 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -11,11 +11,11 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.adaptors import azure from sky.skylet.providers.azure.config import ( bootstrap_azure, get_azure_sdk_function, ) -from sky.skylet import autostop_lib from sky.skylet.providers.command_runner import SkyDockerCommandRunner from sky.provision import docker_utils @@ -62,23 +62,7 @@ class AzureNodeProvider(NodeProvider): def __init__(self, provider_config, cluster_name): NodeProvider.__init__(self, provider_config, cluster_name) - if not autostop_lib.get_is_autostopping(): - # TODO(suquark): This is a temporary patch for resource group. - # By default, Ray autoscaler assumes the resource group is still - # here even after the whole cluster is destroyed. However, now we - # deletes the resource group after tearing down the cluster. To - # comfort the autoscaler, we need to create/update it here, so the - # resource group always exists. - # - # We should not re-configure the resource group again, when it is - # running on the remote VM and the autostopping is in progress, - # because the VM is running which guarantees the resource group - # exists. - from sky.skylet.providers.azure.config import _configure_resource_group - - _configure_resource_group( - {"cluster_name": cluster_name, "provider": provider_config} - ) + subscription_id = provider_config["subscription_id"] self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True) # Sky only supports Azure CLI credential for now. @@ -106,9 +90,15 @@ def match_tags(vm): return False return True - vms = self.compute_client.virtual_machines.list( - resource_group_name=self.provider_config["resource_group"] - ) + try: + vms = self.compute_client.virtual_machines.list( + resource_group_name=self.provider_config["resource_group"] + ) + except azure.exceptions().HttpResponseError as e: + if e.reason == "ResourceGroupNotFound": + vms = {} + else: + raise nodes = [self._extract_metadata(vm) for vm in filter(match_tags, vms)] self.cached_nodes = {node["name"]: node for node in nodes} From 8b1586e1166639d2210b4224c1d2d6da31480753 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 27 Jun 2024 08:17:26 +0000 Subject: [PATCH 02/10] Add debug message --- sky/skylet/providers/azure/node_provider.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 4783dffa2c0..ee02e900092 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -96,6 +96,8 @@ def match_tags(vm): ) except azure.exceptions().HttpResponseError as e: if e.reason == "ResourceGroupNotFound": + logger.debug("Resource group not found. VMs should be " + "terminated.") vms = {} else: raise From b0186952eaca4446d15919e4fd9e193fbb783593 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 27 Jun 2024 08:20:57 +0000 Subject: [PATCH 03/10] format --- sky/skylet/providers/azure/node_provider.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index ee02e900092..e3ab534ef2b 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -96,8 +96,7 @@ def match_tags(vm): ) except azure.exceptions().HttpResponseError as e: if e.reason == "ResourceGroupNotFound": - logger.debug("Resource group not found. VMs should be " - "terminated.") + logger.debug("Resource group not found. VMs should be " "terminated.") vms = {} else: raise From 044cde023e0ad3efdc8eaa71bc0287fbaca1267d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 27 Jun 2024 08:35:19 +0000 Subject: [PATCH 04/10] Fix error handling --- sky/skylet/providers/azure/node_provider.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index e3ab534ef2b..c4a328970e8 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -83,7 +83,6 @@ def __init__(self, provider_config, cluster_name): def _get_filtered_nodes(self, tag_filters): # add cluster name filter to only get nodes from this cluster cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name} - def match_tags(vm): for k, v in cluster_tag_filters.items(): if vm.tags.get(k) != v: @@ -91,13 +90,13 @@ def match_tags(vm): return True try: - vms = self.compute_client.virtual_machines.list( + vms = list(self.compute_client.virtual_machines.list( resource_group_name=self.provider_config["resource_group"] - ) - except azure.exceptions().HttpResponseError as e: - if e.reason == "ResourceGroupNotFound": - logger.debug("Resource group not found. VMs should be " "terminated.") - vms = {} + )) + except azure.exceptions().ResourceNotFoundError as e: + if "Code: ResourceGroupNotFound" in e.exc_msg: + logger.debug("Resource group not found. VMs should be terminated.") + vms = [] else: raise From e576b2ee878402dc248519fe820c643e07a5c405 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Thu, 27 Jun 2024 08:37:01 +0000 Subject: [PATCH 05/10] format --- sky/skylet/providers/azure/node_provider.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index c4a328970e8..4fc91a62cee 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -83,6 +83,7 @@ def __init__(self, provider_config, cluster_name): def _get_filtered_nodes(self, tag_filters): # add cluster name filter to only get nodes from this cluster cluster_tag_filters = {**tag_filters, TAG_RAY_CLUSTER_NAME: self.cluster_name} + def match_tags(vm): for k, v in cluster_tag_filters.items(): if vm.tags.get(k) != v: @@ -90,9 +91,11 @@ def match_tags(vm): return True try: - vms = list(self.compute_client.virtual_machines.list( - resource_group_name=self.provider_config["resource_group"] - )) + vms = list( + self.compute_client.virtual_machines.list( + resource_group_name=self.provider_config["resource_group"] + ) + ) except azure.exceptions().ResourceNotFoundError as e: if "Code: ResourceGroupNotFound" in e.exc_msg: logger.debug("Resource group not found. VMs should be terminated.") From f46147a67d5f66ea805e68861e033525f8ccd423 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 07:33:59 +0000 Subject: [PATCH 06/10] skip deployment recreation when deployment exist --- sky/skylet/providers/azure/config.py | 40 +++++++++++++++++++++------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index a19273761ba..63a82c479a1 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -12,6 +12,7 @@ from azure.mgmt.resource import ResourceManagementClient from azure.mgmt.resource.resources.models import DeploymentMode +from sky.adaptors import azure from sky.utils import common_utils UNIQUE_ID_LEN = 4 @@ -120,17 +121,36 @@ def _configure_resource_group(config): create_or_update = get_azure_sdk_function( client=resource_client.deployments, function_name="create_or_update" ) - # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an - # azure VM, and this can be called twice during ray down. - outputs = ( - create_or_update( - resource_group_name=resource_group, - deployment_name="ray-config", - parameters=parameters, - ) - .result() - .properties.outputs + # Skip creating or updating the deployment if the deployment already exists + # and the cluster name is the same. + get_deployment = get_azure_sdk_function( + client=resource_client.deployments, function_name="get" ) + deployment_exists = False + try: + deployment = get_deployment( + resource_group_name=resource_group, deployment_name="ray-config" + ) + logger.info("Deployment already exists. Skipping deployment creation.") + + outputs = deployment.properties.outputs + if outputs is not None: + deployment_exists = True + except azure.exceptions().ResourceNotFoundError: + deployment_exists = False + + if not deployment_exists: + # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an + # azure VM, and this can be called twice during ray down. + outputs = ( + create_or_update( + resource_group_name=resource_group, + deployment_name="ray-config", + parameters=parameters, + ) + .result() + .properties.outputs + ) # We should wait for the NSG to be created before opening any ports # to avoid overriding the newly-added NSG rules. From b498a0697ff7b0a1627efd6a0d1f93d58541010d Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 14:54:26 +0000 Subject: [PATCH 07/10] Add retry for subscription ID --- sky/adaptors/azure.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/sky/adaptors/azure.py b/sky/adaptors/azure.py index 44618a8f64f..6bd57bc6bec 100644 --- a/sky/adaptors/azure.py +++ b/sky/adaptors/azure.py @@ -3,8 +3,10 @@ # pylint: disable=import-outside-toplevel import functools import threading +import time from sky.adaptors import common +from sky.utils import common_utils azure = common.LazyImport( 'azure', @@ -13,13 +15,30 @@ _LAZY_MODULES = (azure,) _session_creation_lock = threading.RLock() +_MAX_RETRY_FOR_GET_SUBSCRIPTION_ID = 5 @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_subscription_id() -> str: """Get the default subscription id.""" from azure.common import credentials - return credentials.get_cli_profile().get_subscription_id() + retry = 0 + backoff = common_utils.Backoff(initial_backoff=0.5, max_backoff_factor=4) + while True: + try: + return credentials.get_cli_profile().get_subscription_id() + except Exception as e: + if ('Please run \'az login\' to setup account.' in str(e) and + retry < _MAX_RETRY_FOR_GET_SUBSCRIPTION_ID): + # When there are multiple processes trying to get the + # subscription id, it may fail with the above error message. + # Retry will fix the issue. + retry += 1 + + time.sleep(backoff.current_backoff()) + continue + raise @common.load_lazy_modules(modules=_LAZY_MODULES) @@ -36,8 +55,8 @@ def exceptions(): return azure_exceptions -@functools.lru_cache() @common.load_lazy_modules(modules=_LAZY_MODULES) +@functools.lru_cache() def get_client(name: str, subscription_id: str): # Sky only supports Azure CLI credential for now. # Increase the timeout to fix the Azure get-access-token timeout issue. From 71c7aa2c5c3ac507eb48c563a40d54e7d4fe4a33 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 15:35:45 +0000 Subject: [PATCH 08/10] fix logging --- sky/skylet/providers/azure/node_provider.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 4fc91a62cee..1388f050e7f 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -98,7 +98,8 @@ def match_tags(vm): ) except azure.exceptions().ResourceNotFoundError as e: if "Code: ResourceGroupNotFound" in e.exc_msg: - logger.debug("Resource group not found. VMs should be terminated.") + logger.debug("Resource group not found. VMs should have been " + "terminated.") vms = [] else: raise From 59e9853a601a7eca167d1b63104fd6b2a0d5dcd7 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 15:37:27 +0000 Subject: [PATCH 09/10] format --- sky/skylet/providers/azure/node_provider.py | 5 +++-- sky/utils/common_utils.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sky/skylet/providers/azure/node_provider.py b/sky/skylet/providers/azure/node_provider.py index 1388f050e7f..b4a1c656688 100644 --- a/sky/skylet/providers/azure/node_provider.py +++ b/sky/skylet/providers/azure/node_provider.py @@ -98,8 +98,9 @@ def match_tags(vm): ) except azure.exceptions().ResourceNotFoundError as e: if "Code: ResourceGroupNotFound" in e.exc_msg: - logger.debug("Resource group not found. VMs should have been " - "terminated.") + logger.debug( + "Resource group not found. VMs should have been terminated." + ) vms = [] else: raise diff --git a/sky/utils/common_utils.py b/sky/utils/common_utils.py index 103c834000c..a9227fb4c20 100644 --- a/sky/utils/common_utils.py +++ b/sky/utils/common_utils.py @@ -233,7 +233,7 @@ class Backoff: MULTIPLIER = 1.6 JITTER = 0.4 - def __init__(self, initial_backoff: int = 5, max_backoff_factor: int = 5): + def __init__(self, initial_backoff: float = 5, max_backoff_factor: int = 5): self._initial = True self._backoff = 0.0 self._initial_backoff = initial_backoff From de75b4488425be2300d9059a884a3be9c59e5244 Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Fri, 28 Jun 2024 15:43:10 +0000 Subject: [PATCH 10/10] comment --- sky/skylet/providers/azure/config.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/skylet/providers/azure/config.py b/sky/skylet/providers/azure/config.py index 63a82c479a1..35008ef13d7 100644 --- a/sky/skylet/providers/azure/config.py +++ b/sky/skylet/providers/azure/config.py @@ -140,8 +140,8 @@ def _configure_resource_group(config): deployment_exists = False if not deployment_exists: - # TODO (skypilot): this takes a long time (> 40 seconds) for stopping an - # azure VM, and this can be called twice during ray down. + # This takes a long time (> 40 seconds), we should be careful calling + # this function. outputs = ( create_or_update( resource_group_name=resource_group,