Skip to content

Commit

Permalink
delete template and registry after termination
Browse files Browse the repository at this point in the history
  • Loading branch information
cblmemo committed Dec 27, 2024
1 parent 8236882 commit 974239c
Show file tree
Hide file tree
Showing 4 changed files with 77 additions and 31 deletions.
11 changes: 11 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1570,6 +1570,17 @@ def _retry_zones(
config_dict['provision_record'] = provision_record
config_dict['resources_vars'] = resources_vars
config_dict['handle'] = handle
if provision_record.ephemeral_resources:
# Some ephemeral resources are created during the launch
# process. Add them to the provider config so that they
# can be cleaned up later.
original_config_content = common_utils.read_yaml(
cluster_config_file)
original_config_content['provider'][
'ephemeral_resources'] = (
provision_record.ephemeral_resources)
common_utils.dump_yaml(cluster_config_file,
original_config_content)
return config_dict
except provision_common.StopFailoverError:
with ux_utils.print_exception_no_traceback():
Expand Down
1 change: 1 addition & 0 deletions sky/provision/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ class ProvisionRecord:
resumed_instance_ids: List[InstanceId]
# The IDs of all just created instances.
created_instance_ids: List[InstanceId]
ephemeral_resources: List[Any] = dataclasses.field(default_factory=list)

def is_instance_just_booted(self, instance_id: InstanceId) -> bool:
"""Whether or not the instance is just booted.
Expand Down
18 changes: 15 additions & 3 deletions sky/provision/runpod/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,11 @@ def run_instances(region: str, cluster_name_on_cloud: str,
created_instance_ids=[])

created_instance_ids = []
ephemeral_resources = []
for _ in range(to_start_count):
node_type = 'head' if head_instance_id is None else 'worker'
try:
instance_id = utils.launch(
instance_id, ers = utils.launch(
cluster_name=cluster_name_on_cloud,
node_type=node_type,
instance_type=config.node_config['InstanceType'],
Expand All @@ -96,6 +97,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
docker_login_config=config.provider_config.get(
'docker_login_config'),
)
for er in ers:
if er is not None:
ephemeral_resources.append(er)
except Exception as e: # pylint: disable=broad-except
logger.warning(f'run_instances error: {e}')
raise
Expand Down Expand Up @@ -124,7 +128,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
zone=None,
head_instance_id=head_instance_id,
resumed_instance_ids=[],
created_instance_ids=created_instance_ids)
created_instance_ids=created_instance_ids,
ephemeral_resources=ephemeral_resources)


def wait_instances(region: str, cluster_name_on_cloud: str,
Expand All @@ -146,7 +151,8 @@ def terminate_instances(
worker_only: bool = False,
) -> None:
"""See sky/provision/__init__.py"""
del provider_config # unused
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
ephemeral_resources = provider_config.get('ephemeral_resources', [])
instances = _filter_instances(cluster_name_on_cloud, None)
for inst_id, inst in instances.items():
logger.debug(f'Terminating instance {inst_id}: {inst}')
Expand All @@ -160,6 +166,12 @@ def terminate_instances(
f'Failed to terminate instance {inst_id}: '
f'{common_utils.format_exception(e, use_bracket=False)}'
) from e
if ephemeral_resources:
# See sky/provision/runpod/utils.py::launch for details
assert len(ephemeral_resources) == 2, ephemeral_resources
template_name, registry_auth_id = ephemeral_resources
utils.delete_pod_template(template_name)
utils.delete_register_auth(registry_auth_id)


def get_cluster_info(
Expand Down
78 changes: 50 additions & 28 deletions sky/provision/runpod/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import base64
import time
from typing import Any, Dict, List, Optional, Tuple
import uuid

from sky import sky_logging
from sky.adaptors import runpod
Expand Down Expand Up @@ -102,19 +101,44 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
return instance_dict


def delete_pod_template(template_name: str) -> None:
    """Deletes a pod template by name on a best-effort basis.

    Issues a `deleteTemplate` GraphQL mutation through the RunPod adaptor.
    A `QueryError` raised by the query is not propagated; it is logged as a
    warning so the caller can continue, and the user is asked to remove the
    template manually.

    Args:
        template_name: Name of the template to delete.
    """
    try:
        runpod.runpod.api.graphql.run_graphql_query(
            f'mutation {{deleteTemplate(templateName: "{template_name}")}}')
    except runpod.runpod.error.QueryError as e:
        # Adjacent string literals are concatenated verbatim, so the
        # separating space must be written explicitly; without it the error
        # text runs straight into "Please delete it manually.".
        logger.warning(f'Failed to delete template {template_name}: {e} '
                       'Please delete it manually.')


def delete_register_auth(registry_auth_id: str) -> None:
    """Deletes a container registry auth by ID on a best-effort basis.

    Calls `delete_container_registry_auth` through the RunPod adaptor. A
    `QueryError` raised by the call is not propagated; it is logged as a
    warning so the caller can continue, and the user is asked to remove the
    registry auth manually.

    Args:
        registry_auth_id: ID of the registry auth to delete.
    """
    try:
        runpod.runpod.delete_container_registry_auth(registry_auth_id)
    except runpod.runpod.error.QueryError as e:
        # Adjacent string literals are concatenated verbatim, so the
        # separating space must be written explicitly; without it the error
        # text runs straight into "Please delete it manually.".
        logger.warning(
            f'Failed to delete registry auth {registry_auth_id}: {e} '
            'Please delete it manually.')


def _create_template_for_docker_login(
cluster_name: str, image_name: str,
docker_login_config: Optional[Dict[str,
str]]) -> Tuple[str, Optional[str]]:
cluster_name: str,
image_name: str,
docker_login_config: Optional[Dict[str, str]],
) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
"""Creates a template for the given image with the docker login config.
Returns:
formatted_image_name: The formatted image name.
# following fields are None for no docker login config.
template_id: The template ID.
template_name: The template name.
registry_auth_id: The registry auth ID.
"""
if docker_login_config is None:
return image_name, None
# We add a uuid here to avoid the name conflict for terminating and
# launching with the same cluster name. Please see the comments below
# for reason we cannot cleanup the old resources.
name = f'{cluster_name}-{str(uuid.uuid4())[:4]}'
return image_name, None, None, None
login_config = docker_utils.DockerLoginConfig(**docker_login_config)
container_registry_auth_name = f'{name}-registry-auth'
container_template_name = f'{name}-docker-login-template'
container_registry_auth_name = f'{cluster_name}-registry-auth'
container_template_name = f'{cluster_name}-docker-login-template'
# The `name` argument is only for display purposes, and the registry server
# will be split from the docker image name (Tested with AWS ECR).
# Here we only need the username and password to create the registry auth.
Expand All @@ -123,15 +147,6 @@ def _create_template_for_docker_login(
# we create it. So we use a separate auth and template per cluster. This
# also assumes that every cluster has only one node, so no extra worker
# nodes will reuse the same auth and template name.
# TODO(tian): RunPod python API does not provide a way to delete the
# template. So we skip the deletion of template for now. We should
# implement this once they provide the API.
# TODO(tian): We also skipped the deletion of the auth for now, as the
# RunPod python API does not provide a way to delete the auth with the
# name (nor to get the id by the name), which requires we store the id
# at creation somewhere, and returning this value to outer caller will
# increase the call chain complexity. We should implement this once they
# provide the API.
create_auth_resp = runpod.runpod.create_container_registry_auth(
name=container_registry_auth_name,
username=login_config.username,
Expand All @@ -143,17 +158,23 @@ def _create_template_for_docker_login(
image_name=None,
registry_auth_id=registry_auth_id,
)
return login_config.format_image(image_name), create_template_resp['id']
return (login_config.format_image(image_name), create_template_resp['id'],
container_template_name, registry_auth_id)


def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
disk_size: int, image_name: str, ports: Optional[List[int]],
public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
docker_login_config: Optional[Dict[str, str]]) -> str:
def launch(
cluster_name: str, node_type: str, instance_type: str, region: str,
disk_size: int, image_name: str, ports: Optional[List[int]],
public_key: str, preemptible: Optional[bool], bid_per_gpu: float,
docker_login_config: Optional[Dict[str, str]]) -> Tuple[str, List[Any]]:
"""Launches an instance with the given parameters.
Converts the instance_type to the RunPod GPU name, finds the specs for the
GPU, and launches the instance.
Returns:
instance_id: The instance ID.
ephemeral_resources: A list of ephemeral resources.
"""
name = f'{cluster_name}-{node_type}'
gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]]
Expand Down Expand Up @@ -199,8 +220,9 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,'
f'{constants.SKY_REMOTE_RAY_PORT}/http')

image_name_formatted, template_id = (_create_template_for_docker_login(
cluster_name, image_name, docker_login_config))
image_name_formatted, template_id, template_name, registry_auth_id = (
_create_template_for_docker_login(cluster_name, image_name,
docker_login_config))

params = {
'name': name,
Expand All @@ -226,7 +248,7 @@ def launch(cluster_name: str, node_type: str, instance_type: str, region: str,
**params,
)

return new_instance['id']
return new_instance['id'], [template_name, registry_auth_id]


def remove(instance_id: str) -> None:
Expand Down

0 comments on commit 974239c

Please sign in to comment.