Skip to content

Commit

Permalink
Core: Fix AWS/GCP autostop with new provisioner. (#2719)
Browse files (browse the repository at this point in the history)
* Core: Fix AWS/GCP autostop with new provisioner.

* Bump skylet version to 4.

* Hint version more.

* debug -> info
  • Loading branch information
concretevitamin authored Oct 18, 2023
1 parent bf33602 commit 7a54940
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 10 deletions.
15 changes: 11 additions & 4 deletions sky/skylet/attempt_skylet.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,17 +34,24 @@ def restart_skylet():
running = (proc.returncode == 0)

version_match = False
found_version = None
if os.path.exists(VERSION_FILE):
with open(VERSION_FILE) as f:
if f.read().strip() == constants.SKYLET_VERSION:
found_version = f.read().strip()
if found_version == constants.SKYLET_VERSION:
version_match = True

version_string = (f' (found version {found_version}, new version '
f'{constants.SKYLET_VERSION})')
if not running:
print('Skylet is not running. Starting...')
print('Skylet is not running. Starting (version '
f'{constants.SKYLET_VERSION})...')
elif not version_match:
print('Skylet is staled. Restarting...')
print(f'Skylet is stale{version_string}. Restarting...')
else:
print('Skylet is running with the latest version.')
print(
f'Skylet is running with the latest version {constants.SKYLET_VERSION}.'
)

if not running or not version_match:
restart_skylet()
8 changes: 5 additions & 3 deletions sky/skylet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@
# lifetime of the job.
TASK_ID_LIST_ENV_VAR = 'SKYPILOT_TASK_IDS'

# The version of skylet. We should bump this version whenever we need the skylet
# to be restarted on existing clusters updated with the new version of SkyPilot,
# The version of skylet. MUST bump this version whenever we need the skylet to
# be restarted on existing clusters updated with the new version of SkyPilot,
# e.g., when we add new events to skylet, or we fix a bug in skylet.
SKYLET_VERSION = '3'
#
# TODO(zongheng,zhanghao): make the upgrading of skylet automatic?
SKYLET_VERSION = '4'
SKYLET_VERSION_FILE = '~/.sky/skylet_version'

# `sky spot dashboard`-related
Expand Down
11 changes: 8 additions & 3 deletions sky/skylet/events.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,15 +129,20 @@ def _stop_cluster(self, autostop_config):
config = common_utils.read_yaml(self._ray_yaml_path)

provider_module = config['provider']['module']
provider_search = re.search(r'(?:providers|provision)\.(.*)(\.)?',
# Examples:
# 'sky.skylet.providers.aws.AWSNodeProviderV2' -> 'aws'
# 'sky.provision.aws' -> 'aws'
provider_search = re.search(r'(?:providers|provision)\.(\w+)\.?',
provider_module)
assert provider_search is not None, config
provider_name = provider_search.group(1).lower()

if provider_name in ['aws', 'gcp']:
if provider_name in ('aws', 'gcp'):
logger.info('Using new provisioner to stop the cluster.')
self._stop_cluster_with_new_provisioner(autostop_config, config,
provider_name)
return
logger.info('Not using new provisioner to stop the cluster. '
f'Cloud of this cluster: {provider_name}')

is_cluster_multinode = config['max_workers'] > 0

Expand Down

0 comments on commit 7a54940

Please sign in to comment.