diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index e7594142331..53c983edfad 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -319,7 +319,7 @@ Available fields and semantics: # instances (e.g., a serve controller on GCP or AWS may need to provision # Kubernetes resources). # - # Default: 'LOCAL_CREDENTIALS'. + # Default: 'SERVICE_ACCOUNT'. remote_identity: my-k8s-service-account # Attach custom metadata to Kubernetes objects created by SkyPilot diff --git a/sky/backends/backend_utils.py b/sky/backends/backend_utils.py index 5a9663b1275..cf43cfdf2ed 100644 --- a/sky/backends/backend_utils.py +++ b/sky/backends/backend_utils.py @@ -804,7 +804,7 @@ def write_cluster_config( excluded_clouds = [] remote_identity = skypilot_config.get_nested( (str(cloud).lower(), 'remote_identity'), - schemas.REMOTE_IDENTITY_DEFAULT) + schemas.get_default_remote_identity(str(cloud).lower())) if remote_identity is not None and not isinstance(remote_identity, str): for profile in remote_identity: if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]): diff --git a/sky/clouds/cloud.py b/sky/clouds/cloud.py index 889e6716074..08045e28ab9 100644 --- a/sky/clouds/cloud.py +++ b/sky/clouds/cloud.py @@ -43,6 +43,7 @@ class CloudImplementationFeatures(enum.Enum): OPEN_PORTS = 'open_ports' STORAGE_MOUNTING = 'storage_mounting' HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers + AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself class Region(collections.namedtuple('Region', ['name'])): diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 5740e0ed9b1..f50050716a8 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -97,8 +97,12 @@ def _unsupported_features_for_resources( is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth() if is_exec_auth: assert isinstance(message, str), message + # Controllers cannot spin up new pods with exec 
auth. unsupported_features[ clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message + # Pod does not have permissions to terminate itself with exec auth. + unsupported_features[ + clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message return unsupported_features @classmethod @@ -270,7 +274,8 @@ def make_deploy_resources_variables( port_mode = network_utils.get_port_mode(None) remote_identity = skypilot_config.get_nested( - ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + ('kubernetes', 'remote_identity'), + schemas.get_default_remote_identity('kubernetes')) if (remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): # SA name doesn't matter since automounting credentials is disabled diff --git a/sky/core.py b/sky/core.py index b5ecc483354..b1006fe19ab 100644 --- a/sky/core.py +++ b/sky/core.py @@ -488,6 +488,19 @@ def autostop( f' {_stop_not_supported_message(handle.launched_resources)}.' ) from e + # Check if autodown is required and supported + if not is_cancel: + try: + cloud.check_features_are_supported( + handle.launched_resources, + {clouds.CloudImplementationFeatures.AUTO_TERMINATE}) + except exceptions.NotSupportedError as e: + raise exceptions.NotSupportedError( + f'{colorama.Fore.YELLOW}{operation} on cluster ' + f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n' + f' Auto{option_str} is not supported on {cloud!r} - ' + f'see reason above.') from e + usage_lib.record_cluster_name_for_current_operation(cluster_name) backend.set_autostop(handle, idle_minutes, down) diff --git a/sky/execution.py b/sky/execution.py index 2cffc5a7d09..b1fb4ec4164 100644 --- a/sky/execution.py +++ b/sky/execution.py @@ -207,8 +207,12 @@ def _execute( f'{colorama.Style.RESET_ALL}') idle_minutes_to_autostop = 1 stages.remove(Stage.DOWN) - if not down: - requested_features.add(clouds.CloudImplementationFeatures.STOP) + if idle_minutes_to_autostop >= 0: + requested_features.add( + 
clouds.CloudImplementationFeatures.AUTO_TERMINATE) + if not down: + requested_features.add( + clouds.CloudImplementationFeatures.STOP) # NOTE: in general we may not have sufficiently specified info # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in # the backend. diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 3b3608947ad..8ccf64201c7 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -634,16 +634,18 @@ def is_kubeconfig_exec_auth() -> Tuple[bool, Optional[str]]: user for user in user_details if user['name'] == target_username) remote_identity = skypilot_config.get_nested( - ('kubernetes', 'remote_identity'), schemas.REMOTE_IDENTITY_DEFAULT) + ('kubernetes', 'remote_identity'), + schemas.get_default_remote_identity('kubernetes')) if ('exec' in user_details.get('user', {}) and remote_identity == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value): ctx_name = current_context['name'] exec_msg = ('exec-based authentication is used for ' f'Kubernetes context {ctx_name!r}.' - ' This may cause issues when running Managed Jobs ' - 'or SkyServe controller on Kubernetes. To fix, configure ' - 'SkyPilot to create a service account for running pods by ' - 'adding the following in ~/.sky/config.yaml:\n' + ' This may cause issues with autodown or when running ' + 'Managed Jobs or SkyServe controller on Kubernetes. 
' + 'To fix, configure SkyPilot to create a service account ' + 'for running pods by setting the following in ' + '~/.sky/config.yaml:\n' ' kubernetes:\n' ' remote_identity: SERVICE_ACCOUNT\n' ' More: https://skypilot.readthedocs.io/en/latest/' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 518d5edf07a..c50e15185a3 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -535,7 +535,12 @@ class RemoteIdentityOptions(enum.Enum): SERVICE_ACCOUNT = 'SERVICE_ACCOUNT' -REMOTE_IDENTITY_DEFAULT = RemoteIdentityOptions.LOCAL_CREDENTIALS.value +def get_default_remote_identity(cloud: str) -> str: + """Return SERVICE_ACCOUNT for 'kubernetes', LOCAL_CREDENTIALS otherwise.""" + if cloud == 'kubernetes': + return RemoteIdentityOptions.SERVICE_ACCOUNT.value + return RemoteIdentityOptions.LOCAL_CREDENTIALS.value + _REMOTE_IDENTITY_SCHEMA = { 'remote_identity': {