From 2f0432bff3a70d264f4b2cfde91220c9670c98fa Mon Sep 17 00:00:00 2001 From: Zhanghao Wu Date: Mon, 12 Feb 2024 15:13:02 -0800 Subject: [PATCH] [Core] Add option to avoid uploading credentials and custom labels for GCP (#2904) * Avoid uploading crednetials to the clouds * Fix permission for service account * Add labels for GCP and config for authentication method * reduce the instance size for file mounts * address comments * Update docs/source/reference/config.rst Co-authored-by: Zongheng Yang * Update sky/backends/backend_utils.py Co-authored-by: Zongheng Yang * Update sky/backends/cloud_vm_ray_backend.py Co-authored-by: Zongheng Yang * remove provisioner * Refactor cloud in list * utility * update remote_identity for GCP * update config.rst --------- Co-authored-by: Zongheng Yang --- docs/source/reference/config.rst | 59 +++++++++ examples/using_file_mounts.yaml | 1 + sky/backends/backend_utils.py | 24 +++- sky/backends/cloud_vm_ray_backend.py | 11 ++ sky/check.py | 8 +- sky/clouds/__init__.py | 3 + sky/clouds/aws.py | 2 + sky/clouds/cloud.py | 26 +++- sky/clouds/gcp.py | 2 + sky/exceptions.py | 5 + sky/optimizer.py | 10 +- sky/skypilot_config.py | 7 +- sky/templates/gcp-ray.yml.j2 | 3 + sky/utils/schemas.py | 174 +++++++++++++++------------ 14 files changed, 240 insertions(+), 95 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 87dafef38db..9b27762761f 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -109,9 +109,43 @@ Available fields and semantics: # permission to create a security group. security_group_name: my-security-group + # Identity to use for all AWS instances (optional). + # + # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to + # AWS instances created by SkyPilot. They are used for accessing cloud + # resources (e.g., private buckets) or launching new instances (e.g., for + # spot/serve controllers). + # + # SERVICE_ACCOUNT: Local credential files are not uploaded to AWS + # instances. SkyPilot will auto-create and reuse a service account (IAM + # role) for AWS instances. + # + # Two caveats of SERVICE_ACCOUNT for multicloud users: + # + # - This only affects AWS instances. Local AWS credentials will still be + # uploaded to non-AWS instances (since those instances may need to access + # AWS resources). + # - If the SkyPilot spot/serve controller is on AWS, this setting will make + # non-AWS managed spot jobs / non-AWS service replicas fail to access any + # resources on AWS (since the controllers don't have AWS credential + # files to assign to these non-AWS instances). + # + # Default: 'LOCAL_CREDENTIALS'. + remote_identity: LOCAL_CREDENTIALS + # Advanced GCP configurations (optional). # Apply to all new instances but not existing ones. gcp: + # Labels to assign to all instances launched by SkyPilot (optional). + # + # Example use case: cost tracking by user/team/project. + # + # Users should guarantee that these key-values are valid GCP labels, otherwise + # errors from the cloud provider will be surfaced. + instance_tags: + Owner: user-unique-name + my-tag: my-value + # VPC to use (optional). # # Default: null, which implies the following behavior. First, all existing @@ -165,6 +199,31 @@ Available fields and semantics: - projects/my-project/reservations/my-reservation1 - projects/my-project/reservations/my-reservation2 + + # Identity to use for all GCP instances (optional). + # + # LOCAL_CREDENTIALS: The user's local credential files will be uploaded to + # GCP instances created by SkyPilot. They are used for accessing cloud + # resources (e.g., private buckets) or launching new instances (e.g., for + # spot/serve controllers). + # + # SERVICE_ACCOUNT: Local credential files are not uploaded to GCP + # instances. SkyPilot will auto-create and reuse a service account for GCP + # instances. + # + # Two caveats of SERVICE_ACCOUNT for multicloud users: + # + # - This only affects GCP instances. Local GCP credentials will still be + # uploaded to non-GCP instances (since those instances may need to access + # GCP resources). + # - If the SkyPilot spot/serve controller is on GCP, this setting will make + # non-GCP managed spot jobs / non-GCP service replicas fail to access any + # resources on GCP (since the controllers don't have GCP credential + # files to assign to these non-GCP instances). + # + # Default: 'LOCAL_CREDENTIALS'. + remote_identity: LOCAL_CREDENTIALS + # Advanced Kubernetes configurations (optional). kubernetes: # The networking mode for accessing SSH jump pod (optional). diff --git a/examples/using_file_mounts.yaml b/examples/using_file_mounts.yaml index bd77ba88430..efdf9791f61 100644 --- a/examples/using_file_mounts.yaml +++ b/examples/using_file_mounts.yaml @@ -11,6 +11,7 @@ resources: cloud: aws + cpus: 2+ workdir: . diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 06763d00415..3ea636f341b 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -756,6 +756,10 @@ def write_cluster_config( not appear in the catalog, or an ssh_proxy_command is specified but not for the given region, or GPUs are requested in a Kubernetes cluster but the cluster does not have nodes labeled with GPU types. + exceptions.InvalidCloudConfigs: if the user specifies some config for the + cloud that is not valid, e.g. remote_identity: SERVICE_ACCOUNT + for a cloud that does not support it, the caller should skip the + cloud in this case. """ # task.best_resources may not be equal to to_provision if the user # is running a job with less resources than the cluster has. @@ -786,7 +790,18 @@ def write_cluster_config( (str(to_provision.cloud).lower(), 'specific_reservations'), set())) assert cluster_name is not None - credentials = sky_check.get_cloud_credential_file_mounts() + excluded_clouds = [] + remote_identity = skypilot_config.get_nested( + (str(cloud).lower(), 'remote_identity'), 'LOCAL_CREDENTIALS') + if remote_identity == 'SERVICE_ACCOUNT': + if not cloud.supports_service_account_on_remote(): + raise exceptions.InvalidCloudConfigs( + 'remote_identity: SERVICE_ACCOUNT is specified in ' + f'{skypilot_config.loaded_config_path!r} for {cloud}, but it ' + 'is not supported by this cloud. Remove the config or set: ' + '`remote_identity: LOCAL_CREDENTIALS`.') + excluded_clouds = [cloud] + credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds) ip_list = None auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH} @@ -828,10 +843,9 @@ def write_cluster_config( instance_tags = {} instance_tags = skypilot_config.get_nested( (str(cloud).lower(), 'instance_tags'), {}) - if not isinstance(instance_tags, dict): - with ux_utils.print_exception_no_traceback(): - raise ValueError('Custom instance_tags in config.yaml should ' - f'be a dict, but received {type(instance_tags)}.') + # instance_tags is a dict, which is guaranteed by the type check in + # schemas.py + assert isinstance(instance_tags, dict), instance_tags # Dump the Ray ports to a file for Ray job submission dump_port_command = ( diff --git a/sky/backends/cloud_vm_ray_backend.py b/sky/backends/cloud_vm_ray_backend.py index 3c74e4c5441..cfbbb59e294 100644 --- a/sky/backends/cloud_vm_ray_backend.py +++ b/sky/backends/cloud_vm_ray_backend.py @@ -1507,6 +1507,17 @@ def _retry_zones( # does not have nodes labeled with GPU types. logger.info(f'{e}') continue + except exceptions.InvalidCloudConfigs as e: + # Failed due to invalid user configs in ~/.sky/config.yaml. + logger.warning(f'{common_utils.format_exception(e)}') + # We should block the entire cloud if the user config is + # invalid. + _add_to_blocked_resources( + self._blocked_resources, + to_provision.copy(region=None, zone=None)) + raise exceptions.ResourcesUnavailableError( + f'Failed to provision on cloud {to_provision.cloud} due to ' + f'invalid cloud config: {common_utils.format_exception(e)}') if dryrun: return config_dict cluster_config_file = config_dict['ray'] diff --git a/sky/check.py b/sky/check.py index 2ad2c891bab..b998bac4ce0 100644 --- a/sky/check.py +++ b/sky/check.py @@ -1,5 +1,5 @@ """Credential checks: check cloud credentials and enable clouds.""" -from typing import Dict +from typing import Dict, Iterable, Optional import click @@ -72,7 +72,8 @@ def check(quiet: bool = False, verbose: bool = False) -> None: global_user_state.set_enabled_clouds(enabled_clouds) -def get_cloud_credential_file_mounts() -> Dict[str, str]: +def get_cloud_credential_file_mounts( + excluded_clouds: Optional[Iterable[clouds.Cloud]]) -> Dict[str, str]: """Returns the files necessary to access all enabled clouds. Returns a dictionary that will be added to a task's file mounts @@ -81,6 +82,9 @@ def get_cloud_credential_file_mounts() -> Dict[str, str]: enabled_clouds = global_user_state.get_enabled_clouds() file_mounts = {} for cloud in enabled_clouds: + if (excluded_clouds is not None and + clouds.cloud_in_list(cloud, excluded_clouds)): + continue cloud_file_mounts = cloud.get_credential_file_mounts() file_mounts.update(cloud_file_mounts) # Currently, get_enabled_clouds() does not support r2 diff --git a/sky/clouds/__init__.py b/sky/clouds/__init__.py index 00406b63f85..93171a050ce 100644 --- a/sky/clouds/__init__.py +++ b/sky/clouds/__init__.py @@ -1,5 +1,6 @@ """Clouds in Sky.""" from sky.clouds.cloud import Cloud +from sky.clouds.cloud import cloud_in_list from sky.clouds.cloud import CloudImplementationFeatures from sky.clouds.cloud import ProvisionerVersion from sky.clouds.cloud import Region @@ -42,4 +43,6 @@ 'CLOUD_REGISTRY', 'ProvisionerVersion', 'StatusVersion', + # Utility functions + 'cloud_in_list', ] diff --git a/sky/clouds/aws.py b/sky/clouds/aws.py index fef065db071..4527a4acb88 100644 --- a/sky/clouds/aws.py +++ b/sky/clouds/aws.py @@ -102,6 +102,8 @@ class AWS(clouds.Cloud): # Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html # pylint: disable=line-too-long _MAX_CLUSTER_NAME_LEN_LIMIT = 248 + _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True + _regions: List[clouds.Region] = [] _INDENT_PREFIX = ' ' diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 331a7f9a728..65a38c266e7 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -10,7 +10,7 @@ import collections import enum import typing -from typing import Dict, Iterator, List, Optional, Set, Tuple +from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple from sky import exceptions from sky import skypilot_config @@ -97,6 +97,7 @@ class Cloud: _DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM _BEST_DISK_TIER = resources_utils.DiskTier.HIGH _SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST} + _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False # The version of provisioner and status query. This is used to determine # the code path to use for each cloud in the backend. @@ -115,6 +116,21 @@ def max_cluster_name_length(cls) -> Optional[int]: """ return None + @classmethod + def supports_service_account_on_remote(cls) -> bool: + """Returns whether the cloud supports service account on remote cluster. + + This method is used by backend_utils.write_cluster_config() to decide + whether to upload user's local cloud credential files to the remote + cluster. + + If a cloud supports service account on remote cluster, the user's local + cloud credential files are not needed to be uploaded to the remote + instance, as the remote instance can be assigned with a service account + that has the necessary permissions to access the cloud resources. + """ + return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE + #### Regions/Zones #### @classmethod @@ -228,7 +244,7 @@ def get_egress_cost(self, num_gigabytes: float): """ raise NotImplementedError - def is_same_cloud(self, other): + def is_same_cloud(self, other: 'Cloud'): raise NotImplementedError def make_deploy_resources_variables( @@ -725,3 +741,9 @@ def __getstate__(self): state.pop('PROVISIONER_VERSION', None) state.pop('STATUS_VERSION', None) return state + + +# === Helper functions === +def cloud_in_list(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool: + """Returns whether the cloud is in the given cloud list.""" + return any(cloud.is_same_cloud(c) for c in cloud_list) diff --git a/sky/clouds/gcp.py b/sky/clouds/gcp.py index 0573e98394d..4a056f01664 100644 --- a/sky/clouds/gcp.py +++ b/sky/clouds/gcp.py @@ -134,6 +134,8 @@ class GCP(clouds.Cloud): # lower limit. _MAX_CLUSTER_NAME_LEN_LIMIT = 35 + _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True + _INDENT_PREFIX = ' ' _DEPENDENCY_HINT = ( 'GCP tools are not installed. Run the following commands:\n' diff --git a/sky/exceptions.py b/sky/exceptions.py index ae717b23bfb..08206b3305e 100644 --- a/sky/exceptions.py +++ b/sky/exceptions.py @@ -46,6 +46,11 @@ def with_failover_history( return self +class InvalidCloudConfigs(Exception): + """Raised when invalid configurations are provided for a given cloud.""" + pass + + class ProvisionPrechecksError(Exception): """Raised when a spot job fails prechecks before provision. Developer note: For now this should only be used by managed diff --git a/sky/optimizer.py b/sky/optimizer.py index 77d0f8c854a..d58c183a537 100644 --- a/sky/optimizer.py +++ b/sky/optimizer.py @@ -335,7 +335,7 @@ def _estimate_nodes_cost_or_time( # mention "kubernetes cluster" and/instead of "catalog" # in the error message. enabled_clouds = global_user_state.get_enabled_clouds() - if _cloud_in_list(clouds.Kubernetes(), enabled_clouds): + if clouds.cloud_in_list(clouds.Kubernetes(), enabled_clouds): if any(orig_resources.cloud is None for orig_resources in node.resources): source_hint = 'catalog and kubernetes cluster' @@ -1083,10 +1083,6 @@ class DummyCloud(clouds.Cloud): pass -def _cloud_in_list(cloud: clouds.Cloud, lst: Iterable[clouds.Cloud]) -> bool: - return any(cloud.is_same_cloud(c) for c in lst) - - def _make_launchables_for_valid_region_zones( launchable_resources: resources_lib.Resources ) -> List[resources_lib.Resources]: @@ -1170,8 +1166,8 @@ def _fill_in_launchable_resources( if blocked_resources is None: blocked_resources = [] for resources in task.resources: - if resources.cloud is not None and not _cloud_in_list( - resources.cloud, enabled_clouds): + if (resources.cloud is not None and + not clouds.cloud_in_list(resources.cloud, enabled_clouds)): if try_fix_with_sky_check: # Explicitly check again to update the enabled cloud list. check.check(quiet=True) diff --git a/sky/skypilot_config.py b/sky/skypilot_config.py index 78fe4cf6ca5..267bf1897b6 100644 --- a/sky/skypilot_config.py +++ b/sky/skypilot_config.py @@ -44,7 +44,7 @@ import copy import os import pprint -from typing import Any, Dict, Iterable +from typing import Any, Dict, Iterable, Optional import yaml @@ -153,6 +153,11 @@ def _try_load_config() -> None: logger.debug('Config syntax check passed.') +def loaded_config_path() -> Optional[str]: + """Returns the path to the loaded config file.""" + return _loaded_config_path + + # Load on import. _try_load_config() diff --git a/sky/templates/gcp-ray.yml.j2 b/sky/templates/gcp-ray.yml.j2 index 5268b4cbf77..76c818b8aef 100644 --- a/sky/templates/gcp-ray.yml.j2 +++ b/sky/templates/gcp-ray.yml.j2 @@ -76,6 +76,9 @@ available_node_types: node_config: labels: skypilot-user: {{ user }} + {%- for tag_key, tag_value in instance_tags.items() %} + {{ tag_key }}: {{ tag_value }} + {%- endfor %} {%- if specific_reservations %} reservationAffinity: consumeReservationType: SPECIFIC_RESERVATION diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 7d435d32a50..1d91e77c4d2 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -507,6 +507,23 @@ def get_cluster_schema(): }, } +_INSTANCE_TAGS_SCHEMA = { + 'instance_tags': { + 'type': 'object', + 'required': [], + 'additionalProperties': { + 'type': 'string', + }, + }, +} + +_REMOTE_IDENTITY_SCHEMA = { + 'remote_identity': { + 'type': 'string', + 'case_insensitive_enum': ['LOCAL_CREDENTIALS', 'SERVICE_ACCOUNT'], + } +} + def get_config_schema(): # pylint: disable=import-outside-toplevel @@ -534,97 +551,98 @@ def get_config_schema(): }, } } - - return { - '$schema': 'https://json-schema.org/draft/2020-12/schema', - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'spot': controller_resources_schema, - 'serve': controller_resources_schema, - 'aws': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'instance_tags': { - 'type': 'object', - 'required': [], - 'additionalProperties': { - 'type': 'string', - }, - }, - 'security_group_name': { + cloud_configs = { + 'aws': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'security_group_name': { + 'type': 'string', + }, + **_INSTANCE_TAGS_SCHEMA, + **_NETWORK_CONFIG_SCHEMA, + } + }, + 'gcp': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'specific_reservations': { + 'type': 'array', + 'items': { 'type': 'string', }, - **_NETWORK_CONFIG_SCHEMA, + }, + **_INSTANCE_TAGS_SCHEMA, + **_NETWORK_CONFIG_SCHEMA, + } + }, + 'kubernetes': { + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'networking': { + 'type': 'string', + 'case_insensitive_enum': [ + type.value + for type in kubernetes_enums.KubernetesNetworkingMode + ] + }, + 'ports': { + 'type': 'string', + 'case_insensitive_enum': [ + type.value + for type in kubernetes_enums.KubernetesPortMode + ] + }, + 'pod_config': { + 'type': 'object', + 'required': [], + # Allow arbitrary keys since validating pod spec is hard + 'additionalProperties': True, } - }, - 'gcp': { + } + }, + 'oci': { + 'type': 'object', + 'required': [], + 'properties': {}, + # Properties are either 'default' or a region name. + 'additionalProperties': { 'type': 'object', 'required': [], 'additionalProperties': False, 'properties': { - 'specific_reservations': { - 'type': 'array', - 'items': { - 'type': 'string', - }, + 'compartment_ocid': { + 'type': 'string', }, - **_NETWORK_CONFIG_SCHEMA, - } - }, - 'kubernetes': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'networking': { + 'image_tag_general': { 'type': 'string', - 'case_insensitive_enum': [ - type.value for type in - kubernetes_enums.KubernetesNetworkingMode - ] }, - 'ports': { + 'image_tag_gpu': { + 'type': 'string', + }, + 'vcn_subnet': { 'type': 'string', - 'case_insensitive_enum': [ - type.value - for type in kubernetes_enums.KubernetesPortMode - ] }, - 'pod_config': { - 'type': 'object', - 'required': [], - # Allow arbitrary keys since validating pod spec is hard - 'additionalProperties': True, - } } }, - 'oci': { - 'type': 'object', - 'required': [], - # Properties are either 'default' or a region name. - 'additionalProperties': { - 'type': 'object', - 'required': [], - 'additionalProperties': False, - 'properties': { - 'compartment_ocid': { - 'type': 'string', - }, - 'image_tag_general': { - 'type': 'string', - }, - 'image_tag_gpu': { - 'type': 'string', - }, - 'vcn_subnet': { - 'type': 'string', - }, - } - }, - }, + }, + } + + for config in cloud_configs.values(): + config['properties'].update(_REMOTE_IDENTITY_SCHEMA) + return { + '$schema': 'https://json-schema.org/draft/2020-12/schema', + 'type': 'object', + 'required': [], + 'additionalProperties': False, + 'properties': { + 'spot': controller_resources_schema, + 'serve': controller_resources_schema, + **cloud_configs, } }