From 10340f86a4991e58f678025d55f33dce34c45a5b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 00:51:38 -0700 Subject: [PATCH] [k8s] Add support for autoscaling kubernetes clusters (#3513) * Add Karpenter label formatter. * add autoscaler support * lint * lint * comments * comments * lint --- docs/source/reference/config.rst | 19 ++++++++++ sky/cli.py | 21 +++++++++-- sky/clouds/kubernetes.py | 22 +++++++---- sky/provision/kubernetes/utils.py | 63 ++++++++++++++++++++++++++++--- sky/utils/kubernetes_enums.py | 7 ++++ sky/utils/schemas.py | 7 ++++ 6 files changed, 121 insertions(+), 18 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 3c1c02dd659..641ebede5e5 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -326,6 +326,25 @@ Available fields and semantics: # Default: 10 seconds provision_timeout: 10 + # Autoscaler configured in the Kubernetes cluster (optional) + # + # This field informs SkyPilot about the cluster autoscaler used in the + # Kubernetes cluster. Setting this field disables pre-launch checks for + # GPU capacity in the cluster and SkyPilot relies on the autoscaler to + # provision nodes with the required GPU capacity. + # + # Remember to set provision_timeout accordingly when using an autoscaler. + # + # Supported values: gke, karpenter, generic + # gke: uses cloud.google.com/gke-accelerator label to identify GPUs on nodes + # karpenter: uses karpenter.k8s.aws/instance-gpu-name label to identify GPUs on nodes + # generic: uses skypilot.co/accelerator labels to identify GPUs on nodes + # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support + # for more details on setting up labels for GPU support. 
+ # + # Default: null (no autoscaler, autodetect label format for GPU nodes) + autoscaler: gke + # Additional fields to override the pod fields used by SkyPilot (optional) # # Any key:value pairs added here would get added to the pod spec used to diff --git a/sky/cli.py b/sky/cli.py index 485703e4caf..8d60de53e87 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2998,6 +2998,11 @@ def _output(): name, quantity = None, None + # Kubernetes specific bools + cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes) + kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( + ) is not None + if accelerator_str is None: result = service_catalog.list_accelerator_counts( gpus_only=True, @@ -3005,16 +3010,17 @@ def _output(): region_filter=region, ) - if (len(result) == 0 and cloud_obj is not None and - cloud_obj.is_same_cloud(clouds.Kubernetes())): + if len(result) == 0 and cloud_is_kubernetes: yield kubernetes_utils.NO_GPU_ERROR_MESSAGE + if kubernetes_autoscaling: + yield '\n' + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return # "Common" GPUs # If cloud is kubernetes, we want to show all GPUs here, even if # they are not listed as common in SkyPilot. 
- if (cloud_obj is not None and - cloud_obj.is_same_cloud(clouds.Kubernetes())): + if cloud_is_kubernetes: for gpu, _ in sorted(result.items()): gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) else: @@ -3038,9 +3044,16 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' + if (cloud_is_kubernetes or + cloud is None) and kubernetes_autoscaling: + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') + if (cloud_is_kubernetes or + cloud is None) and kubernetes_autoscaling: + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return else: # Parse accelerator string diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 9777a28948b..be9111feac5 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -337,14 +337,20 @@ def _make(instance_list): gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name) # Check if requested instance type will fit in the cluster. - # TODO(romilb): This will fail early for autoscaling clusters. - fits, reason = kubernetes_utils.check_instance_fits( - chosen_instance_type) - if not fits: - logger.debug(f'Instance type {chosen_instance_type} does ' - 'not fit in the Kubernetes cluster. ' - f'Reason: {reason}') - return [], [] + autoscaler_type = kubernetes_utils.get_autoscaler_type() + if autoscaler_type is None: + # If autoscaler is not set, check if the instance type fits in the + # cluster. Else, rely on the autoscaler to provision the right + # instance type without running checks. Worst case, if autoscaling + # fails, the pod will be stuck in pending state until + # provision_timeout, after which failover will be triggered. + fits, reason = kubernetes_utils.check_instance_fits( + chosen_instance_type) + if not fits: + logger.debug(f'Instance type {chosen_instance_type} does ' + 'not fit in the Kubernetes cluster. 
' f'Reason: {reason}') + return [], [] # No fuzzy lists for Kubernetes return _make([chosen_instance_type]), [] diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c7c19680e07..b0b27f121fe 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -35,6 +35,12 @@ (e.g., skypilot.co/accelerator) are setup correctly. \ To further debug, run: sky check.' +KUBERNETES_AUTOSCALER_NOTE = ( + 'Note: Kubernetes cluster autoscaling is enabled. ' + 'All GPUs that can be provisioned may not be listed ' + 'here. Refer to your autoscaler\'s node pool ' + 'configuration to see the list of supported GPUs.') + # TODO(romilb): Add links to docs for configuration instructions when ready. ENDPOINTS_DEBUG_MESSAGE = ('Additionally, make sure your {endpoint_type} ' 'is configured correctly. ' @@ -178,13 +184,31 @@ def get_accelerator_from_label_value(cls, value: str) -> str: f'Invalid accelerator name in GKE cluster: {value}') +class KarpenterLabelFormatter(SkyPilotLabelFormatter): + """Karpenter label formatter + Karpenter uses the label `karpenter.k8s.aws/instance-gpu-name` to identify + the GPU type. Details: https://karpenter.sh/docs/reference/instance-types/ + The naming scheme is the same as the SkyPilot formatter, so we inherit from it. + """ + LABEL_KEY = 'karpenter.k8s.aws/instance-gpu-name' + + # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to # discover the accelerator type from. The order of the list is important, as -# it will be used to determine the priority of the label formats. +# it will be used to determine the priority of the label formats when +# auto-detecting the GPU label type. 
LABEL_FORMATTER_REGISTRY = [ - SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter + SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, + KarpenterLabelFormatter ] +# Mapping of autoscaler type to label formatter +AUTOSCALER_TO_LABEL_FORMATTER = { + kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter, + kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter, # pylint: disable=line-too-long + kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter, +} + def detect_gpu_label_formatter( ) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]: @@ -348,10 +372,26 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # Check if the cluster has GPU resources # TODO(romilb): This assumes the accelerator is a nvidia GPU. We # need to support TPUs and other accelerators as well. - # TODO(romilb): This will fail early for autoscaling clusters. - # For AS clusters, we may need a way for users to specify GPU node pools - # to use since the cluster may be scaling up from zero nodes and may not - # have any GPU nodes yet. + # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling + # is configured in config.yaml since the cluster may be scaling up from + # zero nodes and may not have any GPU nodes yet. In the future, we should + # support polling the clusters for autoscaling information, such as the + # node pools configured etc. + + autoscaler_type = get_autoscaler_type() + if autoscaler_type is not None: + # If autoscaler is set in config.yaml, override the label key and value + # to the autoscaler's format and bypass the GPU checks. + if check_mode: + # If check mode is enabled and autoscaler is set, we can return + # early since we assume the cluster autoscaler will handle GPU + # node provisioning. 
+ return '', '' + formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) + assert formatter is not None, ('Unsupported autoscaler type:' + f' {autoscaler_type}') + return formatter.get_label_key(), formatter.get_label_value(acc_type) + has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: # Check if the cluster has GPU labels setup correctly @@ -1310,3 +1350,14 @@ def get_head_pod_name(cluster_name_on_cloud: str): # label, but since we know the naming convention, we can directly return the # head pod name. return f'{cluster_name_on_cloud}-head' + + +def get_autoscaler_type( +) -> Optional[kubernetes_enums.KubernetesAutoscalerType]: + """Returns the autoscaler type by reading from config""" + autoscaler_type = skypilot_config.get_nested(['kubernetes', 'autoscaler'], + None) + if autoscaler_type is not None: + autoscaler_type = kubernetes_enums.KubernetesAutoscalerType( + autoscaler_type) + return autoscaler_type diff --git a/sky/utils/kubernetes_enums.py b/sky/utils/kubernetes_enums.py index a08e95b4a08..6ebe924ea47 100644 --- a/sky/utils/kubernetes_enums.py +++ b/sky/utils/kubernetes_enums.py @@ -36,3 +36,10 @@ class KubernetesPortMode(enum.Enum): INGRESS = 'ingress' LOADBALANCER = 'loadbalancer' PODIP = 'podip' + + +class KubernetesAutoscalerType(enum.Enum): + """Enum for the different types of cluster autoscalers for Kubernetes.""" + GKE = 'gke' + KARPENTER = 'karpenter' + GENERIC = 'generic' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 4ea74714f6c..bea6523ae05 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -645,6 +645,13 @@ def get_config_schema(): 'provision_timeout': { 'type': 'integer', }, + 'autoscaler': { + 'type': 'string', + 'case_insensitive_enum': [ + type.value + for type in kubernetes_enums.KubernetesAutoscalerType + ] + }, } }, 'oci': {