From 10340f86a4991e58f678025d55f33dce34c45a5b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 7 May 2024 00:51:38 -0700 Subject: [PATCH] [k8s] Add support for autoscaling kubernetes clusters (#3513) * Add Karpenter label formatter. * add autoscaler support * lint * lint * comments * comments * lint --- docs/source/reference/config.rst | 19 ++++++++++ sky/cli.py | 21 +++++++++-- sky/clouds/kubernetes.py | 22 +++++++---- sky/provision/kubernetes/utils.py | 63 ++++++++++++++++++++++++++++--- sky/utils/kubernetes_enums.py | 7 ++++ sky/utils/schemas.py | 7 ++++ 6 files changed, 121 insertions(+), 18 deletions(-) diff --git a/docs/source/reference/config.rst b/docs/source/reference/config.rst index 3c1c02dd659..641ebede5e5 100644 --- a/docs/source/reference/config.rst +++ b/docs/source/reference/config.rst @@ -326,6 +326,25 @@ Available fields and semantics: # Default: 10 seconds provision_timeout: 10 + # Autoscaler configured in the Kubernetes cluster (optional) + # + # This field informs SkyPilot about the cluster autoscaler used in the + # Kubernetes cluster. Setting this field disables pre-launch checks for + # GPU capacity in the cluster and SkyPilot relies on the autoscaler to + # provision nodes with the required GPU capacity. + # + # Remember to set provision_timeout accordingly when using an autoscaler. + # + # Supported values: gke, karpenter, generic + # gke: uses cloud.google.com/gke-accelerator label to identify GPUs on nodes + # karpenter: uses karpenter.k8s.aws/instance-gpu-name label to identify GPUs on nodes + # generic: uses skypilot.co/accelerator labels to identify GPUs on nodes + # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#setting-up-gpu-support + # for more details on setting up labels for GPU support. 
+ # + # Default: null (no autoscaler, autodetect label format for GPU nodes) + autoscaler: gke + # Additional fields to override the pod fields used by SkyPilot (optional) # # Any key:value pairs added here would get added to the pod spec used to diff --git a/sky/cli.py b/sky/cli.py index 485703e4caf..8d60de53e87 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2998,6 +2998,11 @@ def _output(): name, quantity = None, None + # Kubernetes specific bools + cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes) + kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( + ) is not None + if accelerator_str is None: result = service_catalog.list_accelerator_counts( gpus_only=True, @@ -3005,16 +3010,17 @@ def _output(): region_filter=region, ) - if (len(result) == 0 and cloud_obj is not None and - cloud_obj.is_same_cloud(clouds.Kubernetes())): + if len(result) == 0 and cloud_is_kubernetes: yield kubernetes_utils.NO_GPU_ERROR_MESSAGE + if kubernetes_autoscaling: + yield '\n' + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return # "Common" GPUs # If cloud is kubernetes, we want to show all GPUs here, even if # they are not listed as common in SkyPilot. 
- if (cloud_obj is not None and - cloud_obj.is_same_cloud(clouds.Kubernetes())): + if cloud_is_kubernetes: for gpu, _ in sorted(result.items()): gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) else: @@ -3038,9 +3044,16 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' + if (cloud_is_kubernetes or + cloud is None) and kubernetes_autoscaling: + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') + if (cloud_is_kubernetes or + cloud is None) and kubernetes_autoscaling: + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return else: # Parse accelerator string diff --git a/sky/clouds/kubernetes.py b/sky/clouds/kubernetes.py index 9777a28948b..be9111feac5 100644 --- a/sky/clouds/kubernetes.py +++ b/sky/clouds/kubernetes.py @@ -337,14 +337,20 @@ def _make(instance_list): gpu_task_cpus, gpu_task_memory, acc_count, acc_type).name) # Check if requested instance type will fit in the cluster. - # TODO(romilb): This will fail early for autoscaling clusters. - fits, reason = kubernetes_utils.check_instance_fits( - chosen_instance_type) - if not fits: - logger.debug(f'Instance type {chosen_instance_type} does ' - 'not fit in the Kubernetes cluster. ' - f'Reason: {reason}') - return [], [] + autoscaler_type = kubernetes_utils.get_autoscaler_type() + if autoscaler_type is None: + # If autoscaler is not set, check if the instance type fits in the + # cluster. Else, rely on the autoscaler to provision the right + # instance type without running checks. Worst case, if autoscaling + # fails, the pod will be stuck in pending state until + # provision_timeout, after which failover will be triggered. + fits, reason = kubernetes_utils.check_instance_fits( + chosen_instance_type) + if not fits: + logger.debug(f'Instance type {chosen_instance_type} does ' + 'not fit in the Kubernetes cluster. 
' f'Reason: {reason}') + return [], [] # No fuzzy lists for Kubernetes return _make([chosen_instance_type]), [] diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c7c19680e07..b0b27f121fe 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -35,6 +35,12 @@ (e.g., skypilot.co/accelerator) are setup correctly. \ To further debug, run: sky check.' +KUBERNETES_AUTOSCALER_NOTE = ( + 'Note: Kubernetes cluster autoscaling is enabled. ' + 'All GPUs that can be provisioned may not be listed ' + 'here. Refer to your autoscaler\'s node pool ' + 'configuration to see the list of supported GPUs.') + # TODO(romilb): Add links to docs for configuration instructions when ready. ENDPOINTS_DEBUG_MESSAGE = ('Additionally, make sure your {endpoint_type} ' 'is configured correctly. ' @@ -178,13 +184,31 @@ def get_accelerator_from_label_value(cls, value: str) -> str: f'Invalid accelerator name in GKE cluster: {value}') +class KarpenterLabelFormatter(SkyPilotLabelFormatter): + """Karpenter label formatter + Karpenter uses the label `karpenter.k8s.aws/instance-gpu-name` to identify + the GPU type. Details: https://karpenter.sh/docs/reference/instance-types/ + The naming scheme is the same as the SkyPilot formatter, so we inherit from it. + """ + LABEL_KEY = 'karpenter.k8s.aws/instance-gpu-name' + + # LABEL_FORMATTER_REGISTRY stores the label formats SkyPilot will try to # discover the accelerator type from. The order of the list is important, as -# it will be used to determine the priority of the label formats. +# it will be used to determine the priority of the label formats when +# auto-detecting the GPU label type. 
LABEL_FORMATTER_REGISTRY = [ - SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter + SkyPilotLabelFormatter, CoreWeaveLabelFormatter, GKELabelFormatter, + KarpenterLabelFormatter ] +# Mapping of autoscaler type to label formatter +AUTOSCALER_TO_LABEL_FORMATTER = { + kubernetes_enums.KubernetesAutoscalerType.GKE: GKELabelFormatter, + kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterLabelFormatter, # pylint: disable=line-too-long + kubernetes_enums.KubernetesAutoscalerType.GENERIC: SkyPilotLabelFormatter, +} + def detect_gpu_label_formatter( ) -> Tuple[Optional[GPULabelFormatter], Dict[str, List[Tuple[str, str]]]]: @@ -348,10 +372,26 @@ def get_gpu_label_key_value(acc_type: str, check_mode=False) -> Tuple[str, str]: # Check if the cluster has GPU resources # TODO(romilb): This assumes the accelerator is a nvidia GPU. We # need to support TPUs and other accelerators as well. - # TODO(romilb): This will fail early for autoscaling clusters. - # For AS clusters, we may need a way for users to specify GPU node pools - # to use since the cluster may be scaling up from zero nodes and may not - # have any GPU nodes yet. + # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling + # is configured in config.yaml since the cluster may be scaling up from + # zero nodes and may not have any GPU nodes yet. In the future, we should + # support polling the clusters for autoscaling information, such as the + # node pools configured etc. + + autoscaler_type = get_autoscaler_type() + if autoscaler_type is not None: + # If autoscaler is set in config.yaml, override the label key and value + # to the autoscaler's format and bypass the GPU checks. + if check_mode: + # If check mode is enabled and autoscaler is set, we can return + # early since we assume the cluster autoscaler will handle GPU + # node provisioning. 
+ return '', '' + formatter = AUTOSCALER_TO_LABEL_FORMATTER.get(autoscaler_type) + assert formatter is not None, ('Unsupported autoscaler type:' + f' {autoscaler_type}') + return formatter.get_label_key(), formatter.get_label_value(acc_type) + has_gpus, cluster_resources = detect_gpu_resource() if has_gpus: # Check if the cluster has GPU labels setup correctly @@ -1310,3 +1350,14 @@ def get_head_pod_name(cluster_name_on_cloud: str): # label, but since we know the naming convention, we can directly return the # head pod name. return f'{cluster_name_on_cloud}-head' + + +def get_autoscaler_type( +) -> Optional[kubernetes_enums.KubernetesAutoscalerType]: + """Returns the autoscaler type by reading from config""" + autoscaler_type = skypilot_config.get_nested(['kubernetes', 'autoscaler'], + None) + if autoscaler_type is not None: + autoscaler_type = kubernetes_enums.KubernetesAutoscalerType( + autoscaler_type) + return autoscaler_type diff --git a/sky/utils/kubernetes_enums.py b/sky/utils/kubernetes_enums.py index a08e95b4a08..6ebe924ea47 100644 --- a/sky/utils/kubernetes_enums.py +++ b/sky/utils/kubernetes_enums.py @@ -36,3 +36,10 @@ class KubernetesPortMode(enum.Enum): INGRESS = 'ingress' LOADBALANCER = 'loadbalancer' PODIP = 'podip' + + +class KubernetesAutoscalerType(enum.Enum): + """Enum for the different types of cluster autoscalers for Kubernetes.""" + GKE = 'gke' + KARPENTER = 'karpenter' + GENERIC = 'generic' diff --git a/sky/utils/schemas.py b/sky/utils/schemas.py index 4ea74714f6c..bea6523ae05 100644 --- a/sky/utils/schemas.py +++ b/sky/utils/schemas.py @@ -645,6 +645,13 @@ def get_config_schema(): 'provision_timeout': { 'type': 'integer', }, + 'autoscaler': { + 'type': 'string', + 'case_insensitive_enum': [ + type.value + for type in kubernetes_enums.KubernetesAutoscalerType + ] + }, } }, 'oci': {