Skip to content

Commit

Permalink
[Core] Add option to avoid uploading credentials and custom labels fo…
Browse files Browse the repository at this point in the history
…r GCP (#2904)

* Avoid uploading crednetials to the clouds

* Fix permission for service account

* Add labels for GCP and config for authentication method

* reduce the instance size for file mounts

* address comments

* Update docs/source/reference/config.rst

Co-authored-by: Zongheng Yang <[email protected]>

* Update sky/backends/backend_utils.py

Co-authored-by: Zongheng Yang <[email protected]>

* Update sky/backends/cloud_vm_ray_backend.py

Co-authored-by: Zongheng Yang <[email protected]>

* remove provisioner

* Refactor cloud in list

* utility

* update remote_identity for GCP

* update config.rst

---------

Co-authored-by: Zongheng Yang <[email protected]>
  • Loading branch information
Michaelvll and concretevitamin authored Feb 12, 2024
1 parent bd8c2ea commit 2f0432b
Show file tree
Hide file tree
Showing 14 changed files with 240 additions and 95 deletions.
59 changes: 59 additions & 0 deletions docs/source/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,43 @@ Available fields and semantics:
# permission to create a security group.
security_group_name: my-security-group
# Identity to use for all AWS instances (optional).
#
# LOCAL_CREDENTIALS: The user's local credential files will be uploaded to
# AWS instances created by SkyPilot. They are used for accessing cloud
# resources (e.g., private buckets) or launching new instances (e.g., for
# spot/serve controllers).
#
# SERVICE_ACCOUNT: Local credential files are not uploaded to AWS
# instances. SkyPilot will auto-create and reuse a service account (IAM
# role) for AWS instances.
#
# Two caveats of SERVICE_ACCOUNT for multicloud users:
#
# - This only affects AWS instances. Local AWS credentials will still be
# uploaded to non-AWS instances (since those instances may need to access
# AWS resources).
# - If the SkyPilot spot/serve controller is on AWS, this setting will make
# non-AWS managed spot jobs / non-AWS service replicas fail to access any
# resources on AWS (since the controllers don't have AWS credential
# files to assign to these non-AWS instances).
#
# Default: 'LOCAL_CREDENTIALS'.
remote_identity: LOCAL_CREDENTIALS
# Advanced GCP configurations (optional).
# Apply to all new instances but not existing ones.
gcp:
# Labels to assign to all instances launched by SkyPilot (optional).
#
# Example use case: cost tracking by user/team/project.
#
# Users should guarantee that these key-values are valid GCP labels, otherwise
# errors from the cloud provider will be surfaced.
instance_tags:
Owner: user-unique-name
my-tag: my-value
# VPC to use (optional).
#
# Default: null, which implies the following behavior. First, all existing
Expand Down Expand Up @@ -165,6 +199,31 @@ Available fields and semantics:
- projects/my-project/reservations/my-reservation1
- projects/my-project/reservations/my-reservation2
# Identity to use for all GCP instances (optional).
#
# LOCAL_CREDENTIALS: The user's local credential files will be uploaded to
# GCP instances created by SkyPilot. They are used for accessing cloud
# resources (e.g., private buckets) or launching new instances (e.g., for
# spot/serve controllers).
#
# SERVICE_ACCOUNT: Local credential files are not uploaded to GCP
# instances. SkyPilot will auto-create and reuse a service account for GCP
# instances.
#
# Two caveats of SERVICE_ACCOUNT for multicloud users:
#
# - This only affects GCP instances. Local GCP credentials will still be
# uploaded to non-GCP instances (since those instances may need to access
# GCP resources).
# - If the SkyPilot spot/serve controller is on GCP, this setting will make
# non-GCP managed spot jobs / non-GCP service replicas fail to access any
# resources on GCP (since the controllers don't have GCP credential
# files to assign to these non-GCP instances).
#
# Default: 'LOCAL_CREDENTIALS'.
remote_identity: LOCAL_CREDENTIALS
# Advanced Kubernetes configurations (optional).
kubernetes:
# The networking mode for accessing SSH jump pod (optional).
Expand Down
1 change: 1 addition & 0 deletions examples/using_file_mounts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

resources:
cloud: aws
cpus: 2+

workdir: .

Expand Down
24 changes: 19 additions & 5 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,10 @@ def write_cluster_config(
not appear in the catalog, or an ssh_proxy_command is specified but
not for the given region, or GPUs are requested in a Kubernetes
cluster but the cluster does not have nodes labeled with GPU types.
exceptions.InvalidCloudConfigs: if the user specifies some config for the
cloud that is not valid, e.g. remote_identity: SERVICE_ACCOUNT
for a cloud that does not support it, the caller should skip the
cloud in this case.
"""
# task.best_resources may not be equal to to_provision if the user
# is running a job with less resources than the cluster has.
Expand Down Expand Up @@ -786,7 +790,18 @@ def write_cluster_config(
(str(to_provision.cloud).lower(), 'specific_reservations'), set()))

assert cluster_name is not None
credentials = sky_check.get_cloud_credential_file_mounts()
excluded_clouds = []
remote_identity = skypilot_config.get_nested(
(str(cloud).lower(), 'remote_identity'), 'LOCAL_CREDENTIALS')
if remote_identity == 'SERVICE_ACCOUNT':
if not cloud.supports_service_account_on_remote():
raise exceptions.InvalidCloudConfigs(
'remote_identity: SERVICE_ACCOUNT is specified in '
f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
'is not supported by this cloud. Remove the config or set: '
'`remote_identity: LOCAL_CREDENTIALS`.')
excluded_clouds = [cloud]
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)

ip_list = None
auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
Expand Down Expand Up @@ -828,10 +843,9 @@ def write_cluster_config(
instance_tags = {}
instance_tags = skypilot_config.get_nested(
(str(cloud).lower(), 'instance_tags'), {})
if not isinstance(instance_tags, dict):
with ux_utils.print_exception_no_traceback():
raise ValueError('Custom instance_tags in config.yaml should '
f'be a dict, but received {type(instance_tags)}.')
# instance_tags is a dict, which is guaranteed by the type check in
# schemas.py
assert isinstance(instance_tags, dict), instance_tags

# Dump the Ray ports to a file for Ray job submission
dump_port_command = (
Expand Down
11 changes: 11 additions & 0 deletions sky/backends/cloud_vm_ray_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,17 @@ def _retry_zones(
# does not have nodes labeled with GPU types.
logger.info(f'{e}')
continue
except exceptions.InvalidCloudConfigs as e:
# Failed due to invalid user configs in ~/.sky/config.yaml.
logger.warning(f'{common_utils.format_exception(e)}')
# We should block the entire cloud if the user config is
# invalid.
_add_to_blocked_resources(
self._blocked_resources,
to_provision.copy(region=None, zone=None))
raise exceptions.ResourcesUnavailableError(
f'Failed to provision on cloud {to_provision.cloud} due to '
f'invalid cloud config: {common_utils.format_exception(e)}')
if dryrun:
return config_dict
cluster_config_file = config_dict['ray']
Expand Down
8 changes: 6 additions & 2 deletions sky/check.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Credential checks: check cloud credentials and enable clouds."""
from typing import Dict
from typing import Dict, Iterable, Optional

import click

Expand Down Expand Up @@ -72,7 +72,8 @@ def check(quiet: bool = False, verbose: bool = False) -> None:
global_user_state.set_enabled_clouds(enabled_clouds)


def get_cloud_credential_file_mounts() -> Dict[str, str]:
def get_cloud_credential_file_mounts(
excluded_clouds: Optional[Iterable[clouds.Cloud]]) -> Dict[str, str]:
"""Returns the files necessary to access all enabled clouds.
Returns a dictionary that will be added to a task's file mounts
Expand All @@ -81,6 +82,9 @@ def get_cloud_credential_file_mounts() -> Dict[str, str]:
enabled_clouds = global_user_state.get_enabled_clouds()
file_mounts = {}
for cloud in enabled_clouds:
if (excluded_clouds is not None and
clouds.cloud_in_list(cloud, excluded_clouds)):
continue
cloud_file_mounts = cloud.get_credential_file_mounts()
file_mounts.update(cloud_file_mounts)
# Currently, get_enabled_clouds() does not support r2
Expand Down
3 changes: 3 additions & 0 deletions sky/clouds/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Clouds in Sky."""
from sky.clouds.cloud import Cloud
from sky.clouds.cloud import cloud_in_list
from sky.clouds.cloud import CloudImplementationFeatures
from sky.clouds.cloud import ProvisionerVersion
from sky.clouds.cloud import Region
Expand Down Expand Up @@ -42,4 +43,6 @@
'CLOUD_REGISTRY',
'ProvisionerVersion',
'StatusVersion',
# Utility functions
'cloud_in_list',
]
2 changes: 2 additions & 0 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ class AWS(clouds.Cloud):
# Reference: https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/Using_Tags.html # pylint: disable=line-too-long
_MAX_CLUSTER_NAME_LEN_LIMIT = 248

_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True

_regions: List[clouds.Region] = []

_INDENT_PREFIX = ' '
Expand Down
26 changes: 24 additions & 2 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import collections
import enum
import typing
from typing import Dict, Iterator, List, Optional, Set, Tuple
from typing import Dict, Iterable, Iterator, List, Optional, Set, Tuple

from sky import exceptions
from sky import skypilot_config
Expand Down Expand Up @@ -97,6 +97,7 @@ class Cloud:
_DEFAULT_DISK_TIER = resources_utils.DiskTier.MEDIUM
_BEST_DISK_TIER = resources_utils.DiskTier.HIGH
_SUPPORTED_DISK_TIERS = {resources_utils.DiskTier.BEST}
_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = False

# The version of provisioner and status query. This is used to determine
# the code path to use for each cloud in the backend.
Expand All @@ -115,6 +116,21 @@ def max_cluster_name_length(cls) -> Optional[int]:
"""
return None

@classmethod
def supports_service_account_on_remote(cls) -> bool:
"""Returns whether the cloud supports service account on remote cluster.
This method is used by backend_utils.write_cluster_config() to decide
whether to upload user's local cloud credential files to the remote
cluster.
If a cloud supports service account on remote cluster, the user's local
cloud credential files are not needed to be uploaded to the remote
instance, as the remote instance can be assigned with a service account
that has the necessary permissions to access the cloud resources.
"""
return cls._SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE

#### Regions/Zones ####

@classmethod
Expand Down Expand Up @@ -228,7 +244,7 @@ def get_egress_cost(self, num_gigabytes: float):
"""
raise NotImplementedError

def is_same_cloud(self, other):
def is_same_cloud(self, other: 'Cloud'):
raise NotImplementedError

def make_deploy_resources_variables(
Expand Down Expand Up @@ -725,3 +741,9 @@ def __getstate__(self):
state.pop('PROVISIONER_VERSION', None)
state.pop('STATUS_VERSION', None)
return state


# === Helper functions ===
def cloud_in_list(cloud: Cloud, cloud_list: Iterable[Cloud]) -> bool:
"""Returns whether the cloud is in the given cloud list."""
return any(cloud.is_same_cloud(c) for c in cloud_list)
2 changes: 2 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,8 @@ class GCP(clouds.Cloud):
# lower limit.
_MAX_CLUSTER_NAME_LEN_LIMIT = 35

_SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True

_INDENT_PREFIX = ' '
_DEPENDENCY_HINT = (
'GCP tools are not installed. Run the following commands:\n'
Expand Down
5 changes: 5 additions & 0 deletions sky/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,11 @@ def with_failover_history(
return self


class InvalidCloudConfigs(Exception):
"""Raised when invalid configurations are provided for a given cloud."""
pass


class ProvisionPrechecksError(Exception):
"""Raised when a spot job fails prechecks before provision.
Developer note: For now this should only be used by managed
Expand Down
10 changes: 3 additions & 7 deletions sky/optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,7 +335,7 @@ def _estimate_nodes_cost_or_time(
# mention "kubernetes cluster" and/instead of "catalog"
# in the error message.
enabled_clouds = global_user_state.get_enabled_clouds()
if _cloud_in_list(clouds.Kubernetes(), enabled_clouds):
if clouds.cloud_in_list(clouds.Kubernetes(), enabled_clouds):
if any(orig_resources.cloud is None
for orig_resources in node.resources):
source_hint = 'catalog and kubernetes cluster'
Expand Down Expand Up @@ -1083,10 +1083,6 @@ class DummyCloud(clouds.Cloud):
pass


def _cloud_in_list(cloud: clouds.Cloud, lst: Iterable[clouds.Cloud]) -> bool:
return any(cloud.is_same_cloud(c) for c in lst)


def _make_launchables_for_valid_region_zones(
launchable_resources: resources_lib.Resources
) -> List[resources_lib.Resources]:
Expand Down Expand Up @@ -1170,8 +1166,8 @@ def _fill_in_launchable_resources(
if blocked_resources is None:
blocked_resources = []
for resources in task.resources:
if resources.cloud is not None and not _cloud_in_list(
resources.cloud, enabled_clouds):
if (resources.cloud is not None and
not clouds.cloud_in_list(resources.cloud, enabled_clouds)):
if try_fix_with_sky_check:
# Explicitly check again to update the enabled cloud list.
check.check(quiet=True)
Expand Down
7 changes: 6 additions & 1 deletion sky/skypilot_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
import copy
import os
import pprint
from typing import Any, Dict, Iterable
from typing import Any, Dict, Iterable, Optional

import yaml

Expand Down Expand Up @@ -153,6 +153,11 @@ def _try_load_config() -> None:
logger.debug('Config syntax check passed.')


def loaded_config_path() -> Optional[str]:
"""Returns the path to the loaded config file."""
return _loaded_config_path


# Load on import.
_try_load_config()

Expand Down
3 changes: 3 additions & 0 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ available_node_types:
node_config:
labels:
skypilot-user: {{ user }}
{%- for tag_key, tag_value in instance_tags.items() %}
{{ tag_key }}: {{ tag_value }}
{%- endfor %}
{%- if specific_reservations %}
reservationAffinity:
consumeReservationType: SPECIFIC_RESERVATION
Expand Down
Loading

0 comments on commit 2f0432b

Please sign in to comment.