diff --git a/docs/source/getting-started/installation.rst b/docs/source/getting-started/installation.rst index deb2307b67b..142edb01124 100644 --- a/docs/source/getting-started/installation.rst +++ b/docs/source/getting-started/installation.rst @@ -303,7 +303,7 @@ RunPod .. code-block:: shell - pip install "runpod>=1.5.1" + pip install "runpod>=1.6.1" runpod config diff --git a/sky/provision/docker_utils.py b/sky/provision/docker_utils.py index c55508ab41a..3f342b57ce7 100644 --- a/sky/provision/docker_utils.py +++ b/sky/provision/docker_utils.py @@ -38,6 +38,13 @@ class DockerLoginConfig: password: str server: str + def format_image(self, image: str) -> str: + """Format the image name with the server prefix.""" + server_prefix = f'{self.server}/' + if not image.startswith(server_prefix): + return f'{server_prefix}{image}' + return image + @classmethod def from_env_vars(cls, d: Dict[str, str]) -> 'DockerLoginConfig': return cls( @@ -220,9 +227,7 @@ def initialize(self) -> str: wait_for_docker_daemon=True) # We automatically add the server prefix to the image name if # the user did not add it. 
- server_prefix = f'{docker_login_config.server}/' - if not specific_image.startswith(server_prefix): - specific_image = f'{server_prefix}{specific_image}' + specific_image = docker_login_config.format_image(specific_image) if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' + diff --git a/sky/provision/runpod/instance.py b/sky/provision/runpod/instance.py index 8f992f569d9..9e57887c3f1 100644 --- a/sky/provision/runpod/instance.py +++ b/sky/provision/runpod/instance.py @@ -83,7 +83,8 @@ def run_instances(region: str, cluster_name_on_cloud: str, node_type = 'head' if head_instance_id is None else 'worker' try: instance_id = utils.launch( - name=f'{cluster_name_on_cloud}-{node_type}', + cluster_name=cluster_name_on_cloud, + node_type=node_type, instance_type=config.node_config['InstanceType'], region=region, disk_size=config.node_config['DiskSize'], @@ -92,6 +93,8 @@ def run_instances(region: str, cluster_name_on_cloud: str, public_key=config.node_config['PublicKey'], preemptible=config.node_config['Preemptible'], bid_per_gpu=config.node_config['BidPerGPU'], + docker_login_config=config.provider_config.get( + 'docker_login_config'), ) except Exception as e: # pylint: disable=broad-except logger.warning(f'run_instances error: {e}') @@ -145,6 +148,8 @@ def terminate_instances( """See sky/provision/__init__.py""" del provider_config # unused instances = _filter_instances(cluster_name_on_cloud, None) + template_name, registry_auth_id = utils.get_registry_auth_resources( + cluster_name_on_cloud) for inst_id, inst in instances.items(): logger.debug(f'Terminating instance {inst_id}: {inst}') if worker_only and inst['name'].endswith('-head'): @@ -157,6 +162,10 @@ def terminate_instances( f'Failed to terminate instance {inst_id}: ' f'{common_utils.format_exception(e, use_bracket=False)}' ) from e + if template_name is not None: + utils.delete_pod_template(template_name) + if registry_auth_id is not None: + 
utils.delete_register_auth(registry_auth_id) def get_cluster_info( diff --git a/sky/provision/runpod/utils.py b/sky/provision/runpod/utils.py index d0a06b026b3..6600cfd6198 100644 --- a/sky/provision/runpod/utils.py +++ b/sky/provision/runpod/utils.py @@ -2,10 +2,11 @@ import base64 import time -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Tuple from sky import sky_logging from sky.adaptors import runpod +from sky.provision import docker_utils import sky.provision.runpod.api.commands as runpod_commands from sky.skylet import constants from sky.utils import common_utils @@ -47,6 +48,11 @@ } +def _construct_docker_login_template_name(cluster_name: str) -> str: + """Constructs the registry auth template name.""" + return f'{cluster_name}-docker-login-template' + + def retry(func): """Decorator to retry a function.""" @@ -66,9 +72,83 @@ def wrapper(*args, **kwargs): return wrapper +# Adapted from runpod.api.queries.pods.py::QUERY_POD. +# Adding containerRegistryAuthId to the query. +_QUERY_POD = """ +query myPods { + myself { + pods { + id + containerDiskInGb + containerRegistryAuthId + costPerHr + desiredStatus + dockerArgs + dockerId + env + gpuCount + imageName + lastStatusChange + machineId + memoryInGb + name + podType + port + ports + uptimeSeconds + vcpuCount + volumeInGb + volumeMountPath + runtime { + ports{ + ip + isIpPublic + privatePort + publicPort + type + } + } + machine { + gpuDisplayName + } + } + } +} +""" + + +def _sky_get_pods() -> dict: + """List all pods with extra registry auth information. + + Adapted from runpod.get_pods() to include containerRegistryAuthId. 
+ """ + raw_return = runpod.runpod.api.graphql.run_graphql_query(_QUERY_POD) + cleaned_return = raw_return['data']['myself']['pods'] + return cleaned_return + + +_QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH = """ +query myself { + myself { + podTemplates { + name + containerRegistryAuthId + } + } +} +""" + + +def _list_pod_templates_with_container_registry() -> dict: + """List all pod templates.""" + raw_return = runpod.runpod.api.graphql.run_graphql_query( + _QUERY_POD_TEMPLATE_WITH_REGISTRY_AUTH) + return raw_return['data']['myself']['podTemplates'] + + def list_instances() -> Dict[str, Dict[str, Any]]: """Lists instances associated with API key.""" - instances = runpod.runpod.get_pods() + instances = _sky_get_pods() instance_dict: Dict[str, Dict[str, Any]] = {} for instance in instances: @@ -100,14 +180,75 @@ def list_instances() -> Dict[str, Dict[str, Any]]: return instance_dict -def launch(name: str, instance_type: str, region: str, disk_size: int, - image_name: str, ports: Optional[List[int]], public_key: str, - preemptible: Optional[bool], bid_per_gpu: float) -> str: +def delete_pod_template(template_name: str) -> None: + """Deletes a pod template.""" + try: + runpod.runpod.api.graphql.run_graphql_query( + f'mutation {{deleteTemplate(templateName: "{template_name}")}}') + except runpod.runpod.error.QueryError as e: + logger.warning(f'Failed to delete template {template_name}: {e}' + 'Please delete it manually.') + + +def delete_register_auth(registry_auth_id: str) -> None: + """Deletes a registry auth.""" + try: + runpod.runpod.delete_container_registry_auth(registry_auth_id) + except runpod.runpod.error.QueryError as e: + logger.warning(f'Failed to delete registry auth {registry_auth_id}: {e}' + 'Please delete it manually.') + + +def _create_template_for_docker_login( + cluster_name: str, + image_name: str, + docker_login_config: Optional[Dict[str, str]], +) -> Tuple[str, Optional[str]]: + """Creates a template for the given image with the docker login config. 
+ + Returns: + formatted_image_name: The formatted image name. + template_id: The template ID. None for no docker login config. + """ + if docker_login_config is None: + return image_name, None + login_config = docker_utils.DockerLoginConfig(**docker_login_config) + container_registry_auth_name = f'{cluster_name}-registry-auth' + container_template_name = _construct_docker_login_template_name( + cluster_name) + # The `name` argument is only for display purposes and the registry server + # will be split from the docker image name (Tested with AWS ECR). + # Here we only need the username and password to create the registry auth. + # TODO(tian): Now we create a template and a registry auth for each cluster. + # Consider creating one per server and reusing them. Challenges include + # calculating the reference count and deleting them when no longer needed. + create_auth_resp = runpod.runpod.create_container_registry_auth( + name=container_registry_auth_name, + username=login_config.username, + password=login_config.password, + ) + registry_auth_id = create_auth_resp['id'] + create_template_resp = runpod.runpod.create_template( + name=container_template_name, + image_name=None, + registry_auth_id=registry_auth_id, + ) + return login_config.format_image(image_name), create_template_resp['id'] + + +def launch(cluster_name: str, node_type: str, instance_type: str, region: str, + disk_size: int, image_name: str, ports: Optional[List[int]], + public_key: str, preemptible: Optional[bool], bid_per_gpu: float, + docker_login_config: Optional[Dict[str, str]]) -> str: """Launches an instance with the given parameters. Converts the instance_type to the RunPod GPU name, finds the specs for the GPU, and launches the instance. + + Returns: + instance_id: The instance ID. 
""" + name = f'{cluster_name}-{node_type}' gpu_type = GPU_NAME_MAP[instance_type.split('_')[1]] gpu_quantity = int(instance_type.split('_')[0].replace('x', '')) cloud_type = instance_type.split('_')[2] @@ -139,21 +280,24 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, # Use base64 to deal with the tricky quoting issues caused by runpod API. encoded = base64.b64encode(setup_cmd.encode('utf-8')).decode('utf-8') + docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; ' + f'bash init.sh\'') + # Port 8081 is occupied for nginx in the base image. custom_ports_str = '' if ports is not None: custom_ports_str = ''.join([f'{p}/tcp,' for p in ports]) + ports_str = (f'22/tcp,' + f'{custom_ports_str}' + f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,' + f'{constants.SKY_REMOTE_RAY_PORT}/http') - docker_args = (f'bash -c \'echo {encoded} | base64 --decode > init.sh; ' - f'bash init.sh\'') - ports = (f'22/tcp,' - f'{custom_ports_str}' - f'{constants.SKY_REMOTE_RAY_DASHBOARD_PORT}/http,' - f'{constants.SKY_REMOTE_RAY_PORT}/http') + image_name_formatted, template_id = _create_template_for_docker_login( + cluster_name, image_name, docker_login_config) params = { 'name': name, - 'image_name': image_name, + 'image_name': image_name_formatted, 'gpu_type_id': gpu_type, 'cloud_type': cloud_type, 'container_disk_in_gb': disk_size, @@ -161,9 +305,10 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, 'min_memory_in_gb': gpu_specs['memoryInGb'] * gpu_quantity, 'gpu_count': gpu_quantity, 'country_code': region, - 'ports': ports, + 'ports': ports_str, 'support_public_ip': True, 'docker_args': docker_args, + 'template_id': template_id, } if preemptible is None or not preemptible: @@ -177,6 +322,18 @@ def launch(name: str, instance_type: str, region: str, disk_size: int, return new_instance['id'] +def get_registry_auth_resources( + cluster_name: str) -> Tuple[Optional[str], Optional[str]]: + """Gets the registry auth 
resources.""" + container_registry_auth_name = _construct_docker_login_template_name( + cluster_name) + for template in _list_pod_templates_with_container_registry(): + if template['name'] == container_registry_auth_name: + return container_registry_auth_name, template[ + 'containerRegistryAuthId'] + return None, None + + def remove(instance_id: str) -> None: """Terminates the given instance.""" runpod.runpod.terminate_pod(instance_id) diff --git a/sky/setup_files/dependencies.py b/sky/setup_files/dependencies.py index 18d2f5cdc08..434d4beddae 100644 --- a/sky/setup_files/dependencies.py +++ b/sky/setup_files/dependencies.py @@ -123,7 +123,9 @@ 'oci': ['oci'] + local_ray, 'kubernetes': ['kubernetes>=20.0.0'], 'remote': remote, - 'runpod': ['runpod>=1.5.1'], + # For the container registry auth api. Reference: + # https://github.com/runpod/runpod-python/releases/tag/1.6.1 + 'runpod': ['runpod>=1.6.1'], 'fluidstack': [], # No dependencies needed for fluidstack 'cudo': ['cudo-compute>=0.1.10'], 'paperspace': [], # No dependencies needed for paperspace diff --git a/sky/skylet/providers/command_runner.py b/sky/skylet/providers/command_runner.py index 4f66ef54383..16dbc4d2668 100644 --- a/sky/skylet/providers/command_runner.py +++ b/sky/skylet/providers/command_runner.py @@ -25,7 +25,7 @@ def docker_start_cmds( docker_cmd, ): """Generating docker start command without --rm. - + The code is borrowed from `ray.autoscaler._private.docker`. Changes we made: @@ -159,19 +159,17 @@ def run_init(self, *, as_head: bool, file_mounts: Dict[str, str], return True # SkyPilot: Docker login if user specified a private docker registry. - if "docker_login_config" in self.docker_config: + if 'docker_login_config' in self.docker_config: # TODO(tian): Maybe support a command to get the login password? 
- docker_login_config: docker_utils.DockerLoginConfig = self.docker_config[ - "docker_login_config"] + docker_login_config: docker_utils.DockerLoginConfig = ( + self.docker_config['docker_login_config']) self._run_with_retry( f'{self.docker_cmd} login --username ' f'{docker_login_config.username} --password ' f'{docker_login_config.password} {docker_login_config.server}') # We automatically add the server prefix to the image name if # the user did not add it. - server_prefix = f'{docker_login_config.server}/' - if not specific_image.startswith(server_prefix): - specific_image = f'{server_prefix}{specific_image}' + specific_image = docker_login_config.format_image(specific_image) if self.docker_config.get('pull_before_run', True): assert specific_image, ('Image must be included in config if ' diff --git a/sky/templates/runpod-ray.yml.j2 b/sky/templates/runpod-ray.yml.j2 index 853b9142037..ea57c9ac808 100644 --- a/sky/templates/runpod-ray.yml.j2 +++ b/sky/templates/runpod-ray.yml.j2 @@ -10,6 +10,19 @@ provider: module: sky.provision.runpod region: "{{region}}" disable_launch_config_check: true + # For RunPod, we directly set the image id for the docker as runtime environment + # support, thus we need to prevent the DockerInitializer from detecting the docker + # field and performing the initialization. Therefore we put the docker login + # config in the provider config here. + {%- if docker_login_config is not none %} + docker_login_config: + username: |- + {{docker_login_config.username}} + password: |- + {{docker_login_config.password}} + server: |- + {{docker_login_config.server}} + {%- endif %} auth: ssh_user: root