Skip to content

Commit

Permalink
[k8s] Use SERVICE_ACCOUNT as default remote_identity, add checks fo…
Browse files Browse the repository at this point in the history
…r autodown support (#3527)

* Use SERVICE_ACCOUNT as default, add error for autodown when used with LOCAL_CREDENTIALS

* lint

* rename and comments

* lint

* Update sky/execution.py

Co-authored-by: Zhanghao Wu <[email protected]>

* lint

---------

Co-authored-by: Zhanghao Wu <[email protected]>
  • Loading branch information
romilbhardwaj and Michaelvll authored May 9, 2024
1 parent 07da93b commit 4683c46
Show file tree
Hide file tree
Showing 8 changed files with 41 additions and 11 deletions.
2 changes: 1 addition & 1 deletion docs/source/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ Available fields and semantics:
# instances (e.g., a serve controller on GCP or AWS may need to provision
# Kubernetes resources).
#
# Default: 'LOCAL_CREDENTIALS'.
# Default: 'SERVICE_ACCOUNT'.
remote_identity: my-k8s-service-account
# Attach custom metadata to Kubernetes objects created by SkyPilot
Expand Down
2 changes: 1 addition & 1 deletion sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -804,7 +804,7 @@ def write_cluster_config(
excluded_clouds = []
remote_identity = skypilot_config.get_nested(
(str(cloud).lower(), 'remote_identity'),
schemas.REMOTE_IDENTITY_DEFAULT)
schemas.get_default_remote_identity(str(cloud).lower()))
if remote_identity is not None and not isinstance(remote_identity, str):
for profile in remote_identity:
if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
Expand Down
1 change: 1 addition & 0 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class CloudImplementationFeatures(enum.Enum):
OPEN_PORTS = 'open_ports'
STORAGE_MOUNTING = 'storage_mounting'
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself


class Region(collections.namedtuple('Region', ['name'])):
Expand Down
7 changes: 6 additions & 1 deletion sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,12 @@ def _unsupported_features_for_resources(
is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth()
if is_exec_auth:
assert isinstance(message, str), message
# Controllers cannot spin up new pods with exec auth.
unsupported_features[
clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
# Pod does not have permissions to terminate itself with exec auth.
unsupported_features[
clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
return unsupported_features

@classmethod
Expand Down Expand Up @@ -270,7 +274,8 @@ def make_deploy_resources_variables(
port_mode = network_utils.get_port_mode(None)

remote_identity = skypilot_config.get_nested(
('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT)
('kubernetes', 'remote_identity'),
schemas.get_default_remote_identity('kubernetes'))
if (remote_identity ==
schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
# SA name doesn't matter since automounting credentials is disabled
Expand Down
13 changes: 13 additions & 0 deletions sky/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,19 @@ def autostop(
f' {_stop_not_supported_message(handle.launched_resources)}.'
) from e

# Check if autodown is required and supported
if not is_cancel:
try:
cloud.check_features_are_supported(
handle.launched_resources,
{clouds.CloudImplementationFeatures.AUTO_TERMINATE})
except exceptions.NotSupportedError as e:
raise exceptions.NotSupportedError(
f'{colorama.Fore.YELLOW}{operation} on cluster '
f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
f' Auto{option_str} is not supported on {cloud!r} - '
f'see reason above.') from e

usage_lib.record_cluster_name_for_current_operation(cluster_name)
backend.set_autostop(handle, idle_minutes, down)

Expand Down
8 changes: 6 additions & 2 deletions sky/execution.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,8 +207,12 @@ def _execute(
f'{colorama.Style.RESET_ALL}')
idle_minutes_to_autostop = 1
stages.remove(Stage.DOWN)
if not down:
requested_features.add(clouds.CloudImplementationFeatures.STOP)
if idle_minutes_to_autostop >= 0:
requested_features.add(
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
if not down:
requested_features.add(
clouds.CloudImplementationFeatures.STOP)
# NOTE: in general we may not have sufficiently specified info
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
# the backend.
Expand Down
12 changes: 7 additions & 5 deletions sky/provision/kubernetes/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -634,16 +634,18 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]:
user for user in user_details if user['name'] == target_username)

remote_identity = skypilot_config.get_nested(
('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT)
('kubernetes', 'remote_identity'),
schemas.get_default_remote_identity('kubernetes'))
if ('exec' in user_details.get('user', {}) and remote_identity
== schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
ctx_name = current_context['name']
exec_msg = ('exec-based authentication is used for '
f'Kubernetes context {ctx_name!r}.'
' This may cause issues when running Managed Jobs '
'or SkyServe controller on Kubernetes. To fix, configure '
'SkyPilot to create a service account for running pods by '
'adding the following in ~/.sky/config.yaml:\n'
' This may cause issues with autodown or when running '
'Managed Jobs or SkyServe controller on Kubernetes. '
'To fix, configure SkyPilot to create a service account '
'for running pods by setting the following in '
'~/.sky/config.yaml:\n'
' kubernetes:\n'
' remote_identity: SERVICE_ACCOUNT\n'
' More: https://skypilot.readthedocs.io/en/latest/'
Expand Down
7 changes: 6 additions & 1 deletion sky/utils/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,12 @@ class RemoteIdentityOptions(enum.Enum):
SERVICE_ACCOUNT = 'SERVICE_ACCOUNT'


REMOTE_IDENTITY_DEFAULT = RemoteIdentityOptions.LOCAL_CREDENTIALS.value
def get_default_remote_identity(cloud: str) -> str:
"""Get the default remote identity for the specified cloud."""
if cloud == 'kubernetes':
return RemoteIdentityOptions.SERVICE_ACCOUNT.value
return RemoteIdentityOptions.LOCAL_CREDENTIALS.value


_REMOTE_IDENTITY_SCHEMA = {
'remote_identity': {
Expand Down

0 comments on commit 4683c46

Please sign in to comment.