From 7a54940c0022b6d28919bd61b8443b9b72654ca7 Mon Sep 17 00:00:00 2001 From: Zongheng Yang Date: Wed, 18 Oct 2023 10:06:09 -0700 Subject: [PATCH] Core: Fix AWS/GCP autostop with new provisioner. (#2719) * Core: Fix AWS/GCP autostop with new provisioner. * Bump skylet version to 4. * Hint version more. * debug -> info --- sky/skylet/attempt_skylet.py | 15 +++++++++++---- sky/skylet/constants.py | 8 +++++--- sky/skylet/events.py | 11 ++++++++--- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/sky/skylet/attempt_skylet.py b/sky/skylet/attempt_skylet.py index df260fe4867..1c28bc8a5ff 100644 --- a/sky/skylet/attempt_skylet.py +++ b/sky/skylet/attempt_skylet.py @@ -34,17 +34,24 @@ def restart_skylet(): running = (proc.returncode == 0) version_match = False +found_version = None if os.path.exists(VERSION_FILE): with open(VERSION_FILE) as f: - if f.read().strip() == constants.SKYLET_VERSION: + found_version = f.read().strip() + if found_version == constants.SKYLET_VERSION: version_match = True +version_string = (f' (found version {found_version}, new version ' + f'{constants.SKYLET_VERSION})') if not running: - print('Skylet is not running. Starting...') + print('Skylet is not running. Starting (version ' + f'{constants.SKYLET_VERSION})...') elif not version_match: - print('Skylet is staled. Restarting...') + print(f'Skylet is stale{version_string}. Restarting...') else: - print('Skylet is running with the latest version.') + print( + f'Skylet is running with the latest version {constants.SKYLET_VERSION}.' + ) if not running or not version_match: restart_skylet() diff --git a/sky/skylet/constants.py b/sky/skylet/constants.py index 2326d019325..a01195f1b95 100644 --- a/sky/skylet/constants.py +++ b/sky/skylet/constants.py @@ -39,10 +39,12 @@ # lifetime of the job. TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS' -# The version of skylet. We should bump this version whenever we need the skylet -# to be restarted on existing clusters updated with the new version of SkyPilot, +# The version of skylet. MUST bump this version whenever we need the skylet to +# be restarted on existing clusters updated with the new version of SkyPilot, # e.g., when we add new events to skylet, or we fix a bug in skylet. -SKYLET_VERSION = '3' +# +# TODO(zongheng,zhanghao): make the upgrading of skylet automatic? +SKYLET_VERSION = '4' SKYLET_VERSION_FILE = '~/.sky/skylet_version' # `sky spot dashboard`-related diff --git a/sky/skylet/events.py b/sky/skylet/events.py index bc97c7c2993..fd5940ea62d 100644 --- a/sky/skylet/events.py +++ b/sky/skylet/events.py @@ -129,15 +129,20 @@ def _stop_cluster(self, autostop_config): config = common_utils.read_yaml(self._ray_yaml_path) provider_module = config['provider']['module'] - provider_search = re.search(r'(?:providers|provision)\.(.*)(\.)?', + # Examples: + # 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws' + # 'sky.provision.aws' -> 'aws' + provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?', provider_module) assert provider_search is not None, config provider_name = provider_search.group(1).lower() - - if provider_name in ['aws', 'gcp']: + if provider_name in ('aws', 'gcp'): + logger.info('Using new provisioner to stop the cluster.') self._stop_cluster_with_new_provisioner(autostop_config, config, provider_name) return + logger.info('Not using new provisioner to stop the cluster. ' + f'Cloud of this cluster: {provider_name}') is_cluster_multinode = config['max_workers'] > 0