From e6b975d505791ebafe4aec3ffea2a2944c1b906c Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 29 Apr 2024 23:27:03 -0700 Subject: [PATCH 01/26] wip --- sky/cli.py | 97 ++++++++++++------- sky/clouds/service_catalog/__init__.py | 32 ++++++ .../service_catalog/kubernetes_catalog.py | 75 ++++++++++++-- sky/provision/kubernetes/utils.py | 12 +++ 4 files changed, 170 insertions(+), 46 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 72667cffc97..7a9167cd055 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2902,6 +2902,10 @@ def _list_to_str(lst): def _output(): gpu_table = log_utils.create_table( ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) + realtime_gpu_table = log_utils.create_table([ + 'COMMON_GPU', 'REQUESTABLE_QUANTITIES', 'TOTAL_GPUS', + 'AVAILABLE_GPUS' + ]) tpu_table = log_utils.create_table( ['GOOGLE_TPU', 'AVAILABLE_QUANTITIES']) other_table = log_utils.create_table( @@ -2910,49 +2914,68 @@ def _output(): name, quantity = None, None if accelerator_str is None: - result = service_catalog.list_accelerator_counts( - gpus_only=True, - clouds=cloud, - region_filter=region, - ) - if (len(result) == 0 and cloud_obj is not None and - cloud_obj.is_same_cloud(clouds.Kubernetes())): - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - return - - # "Common" GPUs - # If cloud is kubernetes, we want to show all GPUs here, even if - # they are not listed as common in SkyPilot. + # If cloud is kubernetes, we want to show real-time capacity if (cloud_obj is not None and cloud_obj.is_same_cloud(clouds.Kubernetes())): - for gpu, _ in sorted(result.items()): - gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) - else: - for gpu in service_catalog.get_common_gpus(): - if gpu in result: - gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) - yield from gpu_table.get_string() - - # Google TPUs - for tpu in service_catalog.get_tpus(): - if tpu in result: - tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))]) - if len(tpu_table.get_string()) > 0: - yield '\n\n' - yield from tpu_table.get_string() - - # Other GPUs - if show_all: - yield '\n\n' - for gpu, qty in sorted(result.items()): - other_table.add_row([gpu, _list_to_str(qty)]) - yield from other_table.get_string() - yield '\n\n' - else: + counts, capacity, available = service_catalog.list_accelerator_realtime( + gpus_only=True, clouds=cloud, region_filter=region) + assert (set(counts.keys()) == set(capacity.keys()) == set( + available.keys())), ('Keys of counts, capacity, ' + 'and available must be same.') + if len(counts) == 0: + yield kubernetes_utils.NO_GPU_ERROR_MESSAGE + return + for gpu, _ in sorted(counts.items()): + realtime_gpu_table.add_row([ + gpu, + _list_to_str(counts.pop(gpu)), capacity[gpu], + available[gpu] + ]) + yield from realtime_gpu_table.get_string() yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') return + else: + result = service_catalog.list_accelerator_counts( + gpus_only=True, + clouds=cloud, + region_filter=region, + ) + + # "Common" GPUs + # If cloud is kubernetes, we want to show all GPUs here, even if + # they are not listed as common in SkyPilot. 
+ if (cloud_obj is not None and + cloud_obj.is_same_cloud(clouds.Kubernetes())): + for gpu, _ in sorted(result.items()): + gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) + else: + for gpu in service_catalog.get_common_gpus(): + if gpu in result: + gpu_table.add_row( + [gpu, _list_to_str(result.pop(gpu))]) + yield from gpu_table.get_string() + + # Google TPUs + for tpu in service_catalog.get_tpus(): + if tpu in result: + tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))]) + if len(tpu_table.get_string()) > 0: + yield '\n\n' + yield from tpu_table.get_string() + + # Other GPUs + if show_all: + yield '\n\n' + for gpu, qty in sorted(result.items()): + other_table.add_row([gpu, _list_to_str(qty)]) + yield from other_table.get_string() + yield '\n\n' + else: + yield ('\n\nHint: use -a/--all to see all accelerators ' + '(including non-common ones) and pricing.') + return else: # Parse accelerator string accelerator_split = accelerator_str.split(':') diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index d380cce6757..b40a56bf672 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -117,6 +117,38 @@ def list_accelerator_counts( return ret +def list_accelerator_realtime( + gpus_only: bool = True, + name_filter: Optional[str] = None, + region_filter: Optional[str] = None, + quantity_filter: Optional[int] = None, + clouds: CloudFilter = None, +) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]: + """List all accelerators offered by Sky and their realtime availability. + + Useful for fixed size clusters. + + Returns: + """ + qtys_map, total_accelerators_capacity, total_accelerators_available = ( + _map_clouds_catalog( + clouds, + 'list_accelerators_realtime', + gpus_only, + name_filter, + region_filter, + quantity_filter, + all_regions=False, + require_price=False)) + accelerator_counts: Dict[str, List[int]] = collections.defaultdict(list) + for gpu, items in qtys_map.items(): + for item in items: + accelerator_counts[gpu].append(item.accelerator_count) + accelerator_counts[gpu] = sorted(accelerator_counts[gpu]) + return (accelerator_counts, total_accelerators_capacity, + total_accelerators_available) + + def instance_type_exists(instance_type: str, clouds: CloudFilter = None) -> bool: """Check the existence of a instance type.""" diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index bd44847016e..436404369d2 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -46,38 +46,91 @@ def list_accelerators( case_sensitive: bool = True, all_regions: bool = False, require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]: + return list_accelerators_realtime(gpus_only, name_filter, region_filter, + quantity_filter, case_sensitive, + all_regions, require_price)[0] + + +def list_accelerators_realtime( + gpus_only: bool, + name_filter: Optional[str], + region_filter: Optional[str], + quantity_filter: Optional[int], + case_sensitive: bool = True, + all_regions: bool = False, + require_price: bool = True +) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str, + int]]: del all_regions, require_price # Unused. 
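
The new `list_accelerator_realtime` helper above returns three parallel dictionaries keyed by accelerator name: requestable quantities, total capacity, and current availability. A rough sketch of how the first of these is flattened out of the per-cloud catalog entries, using a minimal stand-in for `common.InstanceTypeInfo` and made-up values:

    import collections
    from typing import Dict, List, NamedTuple

    class _Entry(NamedTuple):  # stand-in for common.InstanceTypeInfo
        accelerator_name: str
        accelerator_count: int

    # One catalog entry per (GPU, count) pair, as returned by the per-cloud
    # list_accelerators_realtime implementation. Values are hypothetical.
    qtys_map: Dict[str, List[_Entry]] = {
        'A100': [_Entry('A100', 4), _Entry('A100', 1), _Entry('A100', 2)],
    }

    accelerator_counts: Dict[str, List[int]] = collections.defaultdict(list)
    for gpu, items in qtys_map.items():
        for item in items:
            accelerator_counts[gpu].append(item.accelerator_count)
        accelerator_counts[gpu] = sorted(accelerator_counts[gpu])

    print(dict(accelerator_counts))  # {'A100': [1, 2, 4]}
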
k8s_cloud = Kubernetes() if not any( map(k8s_cloud.is_same_cloud, sky_check.get_cached_enabled_clouds_or_refresh()) ) or not kubernetes_utils.check_credentials()[0]: - return {} + return {}, {}, {} has_gpu = kubernetes_utils.detect_gpu_resource() if not has_gpu: - return {} + return {}, {}, {} label_formatter, _ = kubernetes_utils.detect_gpu_label_formatter() if not label_formatter: - return {} + return {}, {}, {} - accelerators: Set[Tuple[str, int]] = set() + accelerators_qtys: Set[Tuple[str, int]] = set() key = label_formatter.get_label_key() nodes = kubernetes_utils.get_kubernetes_nodes() + # Get the pods to get the real-time GPU usage + pods = kubernetes_utils.get_kubernetes_pods() + # Total number of GPUs in the cluster + total_accelerators_capacity: Dict[str, int] = {} + # Total number of GPUs currently available in the cluster + total_accelerators_available: Dict[str, int] = {} + for node in nodes: if key in node.metadata.labels: + allocated_qty = 0 accelerator_name = label_formatter.get_accelerator_from_label_value( node.metadata.labels.get(key)) accelerator_count = int( node.status.allocatable.get('nvidia.com/gpu', 0)) + # Generate the GPU quantities for the accelerators if accelerator_name and accelerator_count > 0: for count in range(1, accelerator_count + 1): - accelerators.add((accelerator_name, count)) + accelerators_qtys.add((accelerator_name, count)) + + for pod in pods: + # Get all the pods running on the node + if (pod.spec.node_name == node.metadata.name and + pod.status.phase in ['Running', 'Pending']): + # Iterate over all the containers in the pod and sum the + # GPU requests + for container in pod.spec.containers: + if container.resources.requests: + allocated_qty += int( + container.resources.requests.get( + 'nvidia.com/gpu', 0)) + + accelerators_availabe = accelerator_count - allocated_qty + + if accelerator_name not in total_accelerators_capacity: + total_accelerators_capacity[ + accelerator_name] = accelerator_count + else: + total_accelerators_capacity[ + accelerator_name] += accelerator_count + if accelerator_name not in total_accelerators_available: + total_accelerators_available[ + accelerator_name] = accelerators_availabe + else: + total_accelerators_available[ + accelerator_name] += accelerators_availabe result = [] - for accelerator_name, accelerator_count in accelerators: + + # Generate dataframe for common.list_accelerators_impl + for accelerator_name, accelerator_count in accelerators_qtys: result.append( common.InstanceTypeInfo(cloud='Kubernetes', instance_type=None, @@ -98,9 +151,13 @@ def list_accelerators( ]) df['GpuInfo'] = True - return common.list_accelerators_impl('Kubernetes', df, gpus_only, - name_filter, region_filter, - quantity_filter, case_sensitive) + qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only, + name_filter, region_filter, + quantity_filter, case_sensitive) + + # TODO(romilb): Add filtering for total_accelerators_capacity and total_accelerators_available + + return qtys_map, total_accelerators_capacity, total_accelerators_available def validate_region_zone( diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 1cb31328d50..8a9cb2ac379 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -251,6 +251,18 @@ def get_kubernetes_nodes() -> List[Any]: return nodes +def get_kubernetes_pods() -> List[Any]: + try: + ns = get_current_kube_config_context_namespace() + pods = kubernetes.core_api().list_namespaced_pod( + ns, 
_request_timeout=kubernetes.API_TIMEOUT).items + except kubernetes.max_retry_error(): + raise exceptions.ResourcesUnavailableError( + 'Timed out when trying to get pod info from Kubernetes cluster. ' + 'Please check if the cluster is healthy and retry.') from None + return pods + + def check_instance_fits(instance: str) -> Tuple[bool, Optional[str]]: """Checks if the instance fits on the Kubernetes cluster. From a6b5bfc4965f139852c0f540e24445a9d6b13c52 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 30 Apr 2024 12:09:26 -0700 Subject: [PATCH 02/26] filtering support --- sky/cli.py | 230 ++++++++++-------- sky/clouds/service_catalog/__init__.py | 11 +- .../service_catalog/kubernetes_catalog.py | 46 ++-- sky/provision/kubernetes/utils.py | 4 +- 4 files changed, 174 insertions(+), 117 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 7a9167cd055..57f468166bf 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2863,6 +2863,15 @@ def show_gpus( To show all regions for a specified accelerator, use ``sky show-gpus --all-regions``. + If ``--region`` or ``--all-regions`` is not specified, the price displayed + for each instance type is the lowest across all regions for both on-demand + and spot instances. There may be multiple regions with the same lowest + price. + + If ``--cloud kubernetes`` is specified, it will show the maximum quantities + of the GPU available on a single node and the real-time availability of + the GPU across all nodes in the Kubernetes cluster. + Definitions of certain fields: * ``DEVICE_MEM``: Memory of a single device; does not depend on the device @@ -2870,10 +2879,15 @@ def show_gpus( * ``HOST_MEM``: Memory of the host instance (VM). - If ``--region`` or ``--all-regions`` is not specified, the price displayed - for each instance type is the lowest across all regions for both on-demand - and spot instances. There may be multiple regions with the same lowest - price. + * ``QTY_PER_NODE`` (Kubernetes only): Maximum quantity of the GPU available + on a single node. + + * ``TOTAL_GPUS`` (Kubernetes only): Total number of GPUs available in the + Kubernetes cluster. + + * ``AVAILABLE_GPUS`` (Kubernetes only): Number of currently available GPUs + in the Kubernetes cluster. This is fetched in real-time and may change + when other users are using the cluster. """ # validation for the --region flag if region is not None and cloud is None: @@ -2899,13 +2913,48 @@ def show_gpus( def _list_to_str(lst): return ', '.join([str(e) for e in lst]) - def _output(): - gpu_table = log_utils.create_table( - ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) + def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, + quantity_filter: Optional[int] = None): + if quantity_filter: + qty_header = 'QTY_FILTER' + else: + qty_header = 'QTY_PER_NODE' realtime_gpu_table = log_utils.create_table([ - 'COMMON_GPU', 'REQUESTABLE_QUANTITIES', 'TOTAL_GPUS', + 'GPU', qty_header, 'TOTAL_GPUS', 'AVAILABLE_GPUS' ]) + counts, capacity, available = service_catalog.list_accelerator_realtime( + gpus_only=True, + clouds=cloud, + name_filter=name_filter, + quantity_filter=quantity_filter, + region_filter=region) + assert (set(counts.keys()) == set(capacity.keys()) == set( + available.keys())), ('Keys of counts, capacity, ' + 'and available must be same.') + if len(counts) == 0: + gpu_info_msg = '' + debug_msg = 'To further debug, run: sky check.' 
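
The catalog code added in the first commit derives real-time availability purely from the Kubernetes API: per-node capacity comes from the allocatable `nvidia.com/gpu` count, and usage is the sum of GPU requests of Running/Pending pods scheduled on that node. Below is a condensed sketch of that accounting written against the official `kubernetes` Python client directly; the `realtime_gpu_counts` name and the `skypilot.co/accelerator` label key are assumptions for illustration, whereas the patch resolves the real label key through its GPU label formatters and its own Kubernetes adaptor.

    from collections import defaultdict
    from kubernetes import client, config

    def realtime_gpu_counts(namespace: str = 'default'):
        """Per-GPU-type (capacity, available): node allocatable minus pod requests."""
        config.load_kube_config()
        core = client.CoreV1Api()
        capacity: dict = defaultdict(int)
        available: dict = defaultdict(int)

        nodes = core.list_node().items
        pods = core.list_namespaced_pod(namespace).items

        for node in nodes:
            labels = node.metadata.labels or {}
            gpu_name = labels.get('skypilot.co/accelerator')  # assumed label key
            node_gpus = int(node.status.allocatable.get('nvidia.com/gpu', 0))
            if not gpu_name or node_gpus == 0:
                continue
            allocated = 0
            for pod in pods:
                # Only pods scheduled on this node and holding (or about to
                # hold) resources count against its capacity.
                if (pod.spec.node_name == node.metadata.name and
                        pod.status.phase in ('Running', 'Pending')):
                    for container in pod.spec.containers:
                        requests = container.resources.requests or {}
                        allocated += int(requests.get('nvidia.com/gpu', 0))
            capacity[gpu_name] += node_gpus
            available[gpu_name] += node_gpus - allocated
        return dict(capacity), dict(available)
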
+ if name_filter is not None: + gpu_info_msg = f' matching name {name_filter!r}' + debug_msg = ('To list all available accelerators, ' + 'run: sky show-gpus --cloud kubernetes.') + if quantity_filter is not None: + gpu_info_msg += f' with quantity {quantity_filter}' + err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format(gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) + yield err_msg + return + for gpu, _ in sorted(counts.items()): + realtime_gpu_table.add_row([ + gpu, + _list_to_str(counts.pop(gpu)), capacity[gpu], + available[gpu] + ]) + yield from realtime_gpu_table.get_string() + + def _output(): + gpu_table = log_utils.create_table( + ['COMMON_GPU', 'AVAILABLE_QUANTITIES']) tpu_table = log_utils.create_table( ['GOOGLE_TPU', 'AVAILABLE_QUANTITIES']) other_table = log_utils.create_table( @@ -2914,27 +2963,10 @@ def _output(): name, quantity = None, None if accelerator_str is None: - # If cloud is kubernetes, we want to show real-time capacity if (cloud_obj is not None and cloud_obj.is_same_cloud(clouds.Kubernetes())): - counts, capacity, available = service_catalog.list_accelerator_realtime( - gpus_only=True, clouds=cloud, region_filter=region) - assert (set(counts.keys()) == set(capacity.keys()) == set( - available.keys())), ('Keys of counts, capacity, ' - 'and available must be same.') - if len(counts) == 0: - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - return - for gpu, _ in sorted(counts.items()): - realtime_gpu_table.add_row([ - gpu, - _list_to_str(counts.pop(gpu)), capacity[gpu], - available[gpu] - ]) - yield from realtime_gpu_table.get_string() - yield ('\n\nHint: use -a/--all to see all accelerators ' - '(including non-common ones) and pricing.') + yield from _kubernetes_realtime_gpu_output() return else: result = service_catalog.list_accelerator_counts( @@ -2998,81 +3030,85 @@ def _output(): else: name, quantity = accelerator_str, None - # Case-sensitive - result = service_catalog.list_accelerators(gpus_only=True, - name_filter=name, - quantity_filter=quantity, - region_filter=region, - clouds=cloud, - case_sensitive=False, - all_regions=all_regions) - - if len(result) == 0: - if cloud == 'kubernetes': - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - return - - quantity_str = (f' with requested quantity {quantity}' - if quantity else '') - yield f'Resources \'{name}\'{quantity_str} not found. ' - yield 'Try \'sky show-gpus --all\' ' - yield 'to show available accelerators.' + if (cloud_obj is not None and + cloud_obj.is_same_cloud(clouds.Kubernetes())): + # Get real-time availability of GPUs for Kubernetes + yield from _kubernetes_realtime_gpu_output(name_filter=name, + quantity_filter=quantity) return + else: + # For clouds other than Kubernetes, get the accelerator details + # Case-sensitive + result = service_catalog.list_accelerators(gpus_only=True, + name_filter=name, + quantity_filter=quantity, + region_filter=region, + clouds=cloud, + case_sensitive=False, + all_regions=all_regions) + + if len(result) == 0: + quantity_str = (f' with requested quantity {quantity}' + if quantity else '') + yield f'Resources \'{name}\'{quantity_str} not found. ' + yield 'Try \'sky show-gpus --all\' ' + yield 'to show available accelerators.' 
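
One detail of the row-building loop above that is easy to misread: `counts` is mutated with `pop()` while being iterated, which is safe only because `sorted(counts.items())` materializes a separate list first. A small illustration with made-up values:

    counts = {'T4': [1], 'A100': [1, 2, 4]}   # requestable quantities per GPU
    capacity = {'T4': 4, 'A100': 8}           # total GPUs in the cluster
    available = {'T4': 4, 'A100': 6}          # GPUs currently free

    rows = []
    for gpu, _ in sorted(counts.items()):     # sorted() copies, so pop() is safe
        rows.append([gpu, ', '.join(str(c) for c in counts.pop(gpu)),
                     capacity[gpu], available[gpu]])
    print(rows)
    # [['A100', '1, 2, 4', 8, 6], ['T4', '1', 4, 4]]
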
+ return - for i, (gpu, items) in enumerate(result.items()): - accelerator_table_headers = [ - 'GPU', - 'QTY', - 'CLOUD', - 'INSTANCE_TYPE', - 'DEVICE_MEM', - 'vCPUs', - 'HOST_MEM', - 'HOURLY_PRICE', - 'HOURLY_SPOT_PRICE', - ] - if not show_all: - accelerator_table_headers.append('REGION') - accelerator_table = log_utils.create_table( - accelerator_table_headers) - for item in items: - instance_type_str = item.instance_type if not pd.isna( - item.instance_type) else '(attachable)' - cpu_count = item.cpu_count - if pd.isna(cpu_count): - cpu_str = '-' - elif isinstance(cpu_count, (float, int)): - if int(cpu_count) == cpu_count: - cpu_str = str(int(cpu_count)) - else: - cpu_str = f'{cpu_count:.1f}' - device_memory_str = (f'{item.device_memory:.0f}GB' if - not pd.isna(item.device_memory) else '-') - host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( - item.memory) else '-' - price_str = f'$ {item.price:.3f}' if not pd.isna( - item.price) else '-' - spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( - item.spot_price) else '-' - region_str = item.region if not pd.isna(item.region) else '-' - accelerator_table_vals = [ - item.accelerator_name, - item.accelerator_count, - item.cloud, - instance_type_str, - device_memory_str, - cpu_str, - host_memory_str, - price_str, - spot_price_str, + for i, (gpu, items) in enumerate(result.items()): + accelerator_table_headers = [ + 'GPU', + 'QTY', + 'CLOUD', + 'INSTANCE_TYPE', + 'DEVICE_MEM', + 'vCPUs', + 'HOST_MEM', + 'HOURLY_PRICE', + 'HOURLY_SPOT_PRICE', ] if not show_all: - accelerator_table_vals.append(region_str) - accelerator_table.add_row(accelerator_table_vals) - - if i != 0: - yield '\n\n' - yield from accelerator_table.get_string() + accelerator_table_headers.append('REGION') + accelerator_table = log_utils.create_table( + accelerator_table_headers) + for item in items: + instance_type_str = item.instance_type if not pd.isna( + item.instance_type) else '(attachable)' + cpu_count = item.cpu_count + if pd.isna(cpu_count): + cpu_str = '-' + elif isinstance(cpu_count, (float, int)): + if int(cpu_count) == cpu_count: + cpu_str = str(int(cpu_count)) + else: + cpu_str = f'{cpu_count:.1f}' + device_memory_str = (f'{item.device_memory:.0f}GB' if + not pd.isna(item.device_memory) else '-') + host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( + item.memory) else '-' + price_str = f'$ {item.price:.3f}' if not pd.isna( + item.price) else '-' + spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( + item.spot_price) else '-' + region_str = item.region if not pd.isna(item.region) else '-' + accelerator_table_vals = [ + item.accelerator_name, + item.accelerator_count, + item.cloud, + instance_type_str, + device_memory_str, + cpu_str, + host_memory_str, + price_str, + spot_price_str, + ] + if not show_all: + accelerator_table_vals.append(region_str) + accelerator_table.add_row(accelerator_table_vals) + + if i != 0: + yield '\n\n' + yield from accelerator_table.get_string() if show_all: click.echo_via_pager(_output()) diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index b40a56bf672..11063623f35 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -124,11 +124,18 @@ def list_accelerator_realtime( quantity_filter: Optional[int] = None, clouds: CloudFilter = None, ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]: - """List all accelerators offered by Sky and their realtime availability. 
+ """List all accelerators offered by Sky with their realtime availability. - Useful for fixed size clusters. + Realtime availability is the total number of accelerators in the cluster + and number of accelerators available at the time of the call. + + Used for fixed size cluster settings, such as Kubernetes. Returns: + A tuple of three dictionaries mapping canonical accelerator names to: + - A list of available counts. (e.g., [1, 2, 4]) + - Total number of accelerators in the cluster (capacity). + - Number of accelerators available at the time of call (availability). """ qtys_map, total_accelerators_capacity, total_accelerators_available = ( _map_clouds_catalog( diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 436404369d2..a7b0a7296ba 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -3,6 +3,7 @@ Kubernetes does not require a catalog of instances, but we need an image catalog mapping SkyPilot image tags to corresponding container image tags. """ +import re import typing from typing import Dict, List, Optional, Set, Tuple @@ -86,12 +87,18 @@ def list_accelerators_realtime( total_accelerators_capacity: Dict[str, int] = {} # Total number of GPUs currently available in the cluster total_accelerators_available: Dict[str, int] = {} + min_quantity_filter = quantity_filter if quantity_filter else 1 for node in nodes: if key in node.metadata.labels: allocated_qty = 0 accelerator_name = label_formatter.get_accelerator_from_label_value( node.metadata.labels.get(key)) + + # Check if name_filter regex matches the accelerator_name + if name_filter and not re.match(name_filter, accelerator_name): + continue + accelerator_count = int( node.status.allocatable.get('nvidia.com/gpu', 0)) @@ -112,20 +119,27 @@ def list_accelerators_realtime( container.resources.requests.get( 'nvidia.com/gpu', 0)) - accelerators_availabe = accelerator_count - allocated_qty - - if accelerator_name not in total_accelerators_capacity: - total_accelerators_capacity[ - accelerator_name] = accelerator_count - else: - total_accelerators_capacity[ - accelerator_name] += accelerator_count - if accelerator_name not in total_accelerators_available: - total_accelerators_available[ - accelerator_name] = accelerators_availabe - else: - total_accelerators_available[ - accelerator_name] += accelerators_availabe + accelerators_available = accelerator_count - allocated_qty + + if accelerator_count >= min_quantity_filter: + quantized_count = (min_quantity_filter * + (accelerator_count//min_quantity_filter)) + if accelerator_name not in total_accelerators_capacity: + total_accelerators_capacity[ + accelerator_name] = quantized_count + else: + total_accelerators_capacity[ + accelerator_name] += quantized_count + + if accelerators_available >= min_quantity_filter: + quantized_availability = min_quantity_filter * ( + accelerators_available // min_quantity_filter) + if accelerator_name not in total_accelerators_available: + total_accelerators_available[ + accelerator_name] = quantized_availability + else: + total_accelerators_available[ + accelerator_name] += quantized_availability result = [] @@ -151,12 +165,12 @@ def list_accelerators_realtime( ]) df['GpuInfo'] = True + # Use common.list_accelerators_impl to get InstanceTypeInfo objects used + # by sky show-gpus when cloud is not specified. 
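
The quantity filtering above rounds each node's contribution down to a multiple of the requested quantity, so the totals only count GPUs that could actually satisfy a request of that size. A small worked example (the `quantize` helper name is only for illustration):

    def quantize(count: int, min_quantity_filter: int) -> int:
        # Mirrors: min_quantity_filter * (count // min_quantity_filter)
        return min_quantity_filter * (count // min_quantity_filter)

    # With a quantity filter of 4, a node with 7 free GPUs can serve only one
    # 4-GPU request, so it contributes 4 (not 7) to the filtered totals, and a
    # node with 3 free GPUs contributes nothing.
    assert quantize(7, 4) == 4
    assert quantize(8, 4) == 8
    assert quantize(3, 4) == 0
    assert quantize(5, 1) == 5   # no filter behaves like a filter of 1
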
qtys_map = common.list_accelerators_impl('Kubernetes', df, gpus_only, name_filter, region_filter, quantity_filter, case_sensitive) - # TODO(romilb): Add filtering for total_accelerators_capacity and total_accelerators_available - return qtys_map, total_accelerators_capacity, total_accelerators_available diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 8a9cb2ac379..2fef2ea0552 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -30,10 +30,10 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_ERROR_MESSAGE = 'No GPUs found in Kubernetes cluster. \ +NO_GPU_ERROR_MESSAGE = 'No GPUs{gpu_info_msg} found in Kubernetes cluster. \ If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ (e.g., skypilot.co/accelerator) are setup correctly. \ -To further debug, run: sky check.' +{debug_msg}' # TODO(romilb): Add links to docs for configuration instructions when ready. ENDPOINTS_DEBUG_MESSAGE = ('Additionally, make sure your {endpoint_type} ' From 13461597f38a4594d9f9d0bcae80657cf170c229 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 30 Apr 2024 12:11:30 -0700 Subject: [PATCH 03/26] lint --- sky/cli.py | 20 +++++++++---------- sky/clouds/service_catalog/__init__.py | 17 ++++++++-------- .../service_catalog/kubernetes_catalog.py | 4 ++-- 3 files changed, 20 insertions(+), 21 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 57f468166bf..97833eaa6e2 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2919,10 +2919,8 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, qty_header = 'QTY_FILTER' else: qty_header = 'QTY_PER_NODE' - realtime_gpu_table = log_utils.create_table([ - 'GPU', qty_header, 'TOTAL_GPUS', - 'AVAILABLE_GPUS' - ]) + realtime_gpu_table = log_utils.create_table( + ['GPU', qty_header, 'TOTAL_GPUS', 'AVAILABLE_GPUS']) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, clouds=cloud, @@ -2941,14 +2939,14 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, 'run: sky show-gpus --cloud kubernetes.') if quantity_filter is not None: gpu_info_msg += f' with quantity {quantity_filter}' - err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format(gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) + err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( + gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) yield err_msg return for gpu, _ in sorted(counts.items()): realtime_gpu_table.add_row([ gpu, - _list_to_str(counts.pop(gpu)), capacity[gpu], - available[gpu] + _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] ]) yield from realtime_gpu_table.get_string() @@ -3082,15 +3080,17 @@ def _output(): cpu_str = str(int(cpu_count)) else: cpu_str = f'{cpu_count:.1f}' - device_memory_str = (f'{item.device_memory:.0f}GB' if - not pd.isna(item.device_memory) else '-') + device_memory_str = (f'{item.device_memory:.0f}GB' + if not pd.isna(item.device_memory) else + '-') host_memory_str = f'{item.memory:.0f}GB' if not pd.isna( item.memory) else '-' price_str = f'$ {item.price:.3f}' if not pd.isna( item.price) else '-' spot_price_str = f'$ {item.spot_price:.3f}' if not pd.isna( item.spot_price) else '-' - region_str = item.region if not pd.isna(item.region) else '-' + region_str = item.region if not pd.isna( + item.region) else '-' accelerator_table_vals = [ item.accelerator_name, item.accelerator_count, diff --git a/sky/clouds/service_catalog/__init__.py 
b/sky/clouds/service_catalog/__init__.py index 11063623f35..c654a66aecd 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -138,15 +138,14 @@ def list_accelerator_realtime( - Number of accelerators available at the time of call (availability). """ qtys_map, total_accelerators_capacity, total_accelerators_available = ( - _map_clouds_catalog( - clouds, - 'list_accelerators_realtime', - gpus_only, - name_filter, - region_filter, - quantity_filter, - all_regions=False, - require_price=False)) + _map_clouds_catalog(clouds, + 'list_accelerators_realtime', + gpus_only, + name_filter, + region_filter, + quantity_filter, + all_regions=False, + require_price=False)) accelerator_counts: Dict[str, List[int]] = collections.defaultdict(list) for gpu, items in qtys_map.items(): for item in items: diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index a7b0a7296ba..6e43fb62612 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -123,7 +123,7 @@ def list_accelerators_realtime( if accelerator_count >= min_quantity_filter: quantized_count = (min_quantity_filter * - (accelerator_count//min_quantity_filter)) + (accelerator_count // min_quantity_filter)) if accelerator_name not in total_accelerators_capacity: total_accelerators_capacity[ accelerator_name] = quantized_count @@ -133,7 +133,7 @@ def list_accelerators_realtime( if accelerators_available >= min_quantity_filter: quantized_availability = min_quantity_filter * ( - accelerators_available // min_quantity_filter) + accelerators_available // min_quantity_filter) if accelerator_name not in total_accelerators_available: total_accelerators_available[ accelerator_name] = quantized_availability From 6bbbf25a93e584c6b7501b6ca9fdeb7bbf338977 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Tue, 30 Apr 2024 12:17:07 -0700 Subject: [PATCH 04/26] update doc --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 97833eaa6e2..9254e55f41f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2879,7 +2879,7 @@ def show_gpus( * ``HOST_MEM``: Memory of the host instance (VM). - * ``QTY_PER_NODE`` (Kubernetes only): Maximum quantity of the GPU available + * ``QTY_PER_NODE`` (Kubernetes only): GPU quantities that can be requested on a single node. 
* ``TOTAL_GPUS`` (Kubernetes only): Total number of GPUs available in the From a26336522ad8acb9c60ad5832a434404c90b286b Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Wed, 1 May 2024 19:11:51 -0700 Subject: [PATCH 05/26] rename headers --- sky/cli.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 9254e55f41f..8a0da4746b7 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2917,10 +2917,12 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, quantity_filter: Optional[int] = None): if quantity_filter: qty_header = 'QTY_FILTER' + free_header = 'FILTERED_FREE_GPUS' else: qty_header = 'QTY_PER_NODE' + free_header = 'TOTAL_FREE_GPUS' realtime_gpu_table = log_utils.create_table( - ['GPU', qty_header, 'TOTAL_GPUS', 'AVAILABLE_GPUS']) + ['GPU', qty_header, 'TOTAL_GPUS', free_header]) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, clouds=cloud, From 0bd06a43c88450789ca941d1045303f14caeea74 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 17 May 2024 13:50:28 -0700 Subject: [PATCH 06/26] comments --- sky/cli.py | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 6bb00b3d42b..d19e58f2b94 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -2988,7 +2988,7 @@ def show_gpus( * ``TOTAL_GPUS`` (Kubernetes only): Total number of GPUs available in the Kubernetes cluster. - * ``AVAILABLE_GPUS`` (Kubernetes only): Number of currently available GPUs + * ``TOTAL_FREE_GPUS`` (Kubernetes only): Number of currently free GPUs in the Kubernetes cluster. This is fetched in real-time and may change when other users are using the cluster. """ @@ -3047,6 +3047,10 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) yield err_msg + if kubernetes_utils.get_autoscaler_type() is not None: + # If using autoscaling cluster, show note + yield '\n' + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return for gpu, _ in sorted(counts.items()): realtime_gpu_table.add_row([ @@ -3082,13 +3086,6 @@ def _output(): region_filter=region, ) - if len(result) == 0 and cloud_is_kubernetes: - yield kubernetes_utils.NO_GPU_ERROR_MESSAGE - if kubernetes_autoscaling: - yield '\n' - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE - return - # "Common" GPUs for gpu in service_catalog.get_common_gpus(): if gpu in result: @@ -3110,15 +3107,13 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if (cloud_is_kubernetes or - cloud is None) and kubernetes_autoscaling: + if cloud is None and kubernetes_autoscaling: yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if (cloud_is_kubernetes or - cloud is None) and kubernetes_autoscaling: + if cloud is None and kubernetes_autoscaling: yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return else: From 6bf3045ebdb45cc804c5c9a37c1970d05328f824 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 17 May 2024 16:55:05 -0700 Subject: [PATCH 07/26] add TODO --- sky/clouds/service_catalog/kubernetes_catalog.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 6e43fb62612..b6727d38c02 100644 --- 
a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -47,6 +47,9 @@ def list_accelerators( case_sensitive: bool = True, all_regions: bool = False, require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]: + # TODO(romilb): We should consider putting a lru_cache() with TTL to + # avoid multiple calls to kubernetes API in a short period of time (e.g., + # from the optimizer). return list_accelerators_realtime(gpus_only, name_filter, region_filter, quantity_filter, case_sensitive, all_regions, require_price)[0] From f96032245779820801201c8ed87482b14534e3c1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 17 May 2024 17:15:05 -0700 Subject: [PATCH 08/26] Add autoscaler note --- sky/cli.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d19e58f2b94..49c56f94304 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3047,10 +3047,6 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) yield err_msg - if kubernetes_utils.get_autoscaler_type() is not None: - # If using autoscaling cluster, show note - yield '\n' - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return for gpu, _ in sorted(counts.items()): realtime_gpu_table.add_row([ @@ -3078,6 +3074,9 @@ def _output(): # If cloud is kubernetes, we want to show real-time capacity if cloud_is_kubernetes: yield from _kubernetes_realtime_gpu_output() + if kubernetes_utils.get_autoscaler_type() is not None: + yield '\n' + yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return result = service_catalog.list_accelerator_counts( From 8878254aa556a996e71eb20886412a1f1d6521f1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 17:41:24 -0700 Subject: [PATCH 09/26] case sensitive fix --- sky/cli.py | 3 ++- sky/clouds/service_catalog/__init__.py | 2 ++ sky/clouds/service_catalog/kubernetes_catalog.py | 3 ++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d07cb3b86b7..622e5c0584d 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3030,8 +3030,9 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, gpus_only=True, clouds=cloud, name_filter=name_filter, + region_filter=region, quantity_filter=quantity_filter, - region_filter=region) + case_sensitive=False) assert (set(counts.keys()) == set(capacity.keys()) == set( available.keys())), ('Keys of counts, capacity, ' 'and available must be same.') diff --git a/sky/clouds/service_catalog/__init__.py b/sky/clouds/service_catalog/__init__.py index c654a66aecd..7479cd77cf7 100644 --- a/sky/clouds/service_catalog/__init__.py +++ b/sky/clouds/service_catalog/__init__.py @@ -123,6 +123,7 @@ def list_accelerator_realtime( region_filter: Optional[str] = None, quantity_filter: Optional[int] = None, clouds: CloudFilter = None, + case_sensitive: bool = True, ) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]: """List all accelerators offered by Sky with their realtime availability. 
@@ -144,6 +145,7 @@ def list_accelerator_realtime( name_filter, region_filter, quantity_filter, + case_sensitive=case_sensitive, all_regions=False, require_price=False)) accelerator_counts: Dict[str, List[int]] = collections.defaultdict(list) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index b6727d38c02..cba689ae648 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -99,7 +99,8 @@ def list_accelerators_realtime( node.metadata.labels.get(key)) # Check if name_filter regex matches the accelerator_name - if name_filter and not re.match(name_filter, accelerator_name): + regex_flags = 0 if case_sensitive else re.IGNORECASE + if not re.match(name_filter, accelerator_name, flags=regex_flags): continue accelerator_count = int( From 3fe8fc6c579aea9c9cabf6280b727a228bdd62d6 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 18:00:38 -0700 Subject: [PATCH 10/26] case sensitive fix --- sky/clouds/service_catalog/kubernetes_catalog.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index cba689ae648..70e1b463460 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -100,7 +100,7 @@ def list_accelerators_realtime( # Check if name_filter regex matches the accelerator_name regex_flags = 0 if case_sensitive else re.IGNORECASE - if not re.match(name_filter, accelerator_name, flags=regex_flags): + if name_filter and not re.match(name_filter, accelerator_name, flags=regex_flags): continue accelerator_count = int( From 2203d6b3ebf68cebf4a4de17aa8f936346acbdbd Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 18:54:29 -0700 Subject: [PATCH 11/26] show kubernetes GPUs in a separate table in sky show-gpus --- sky/cli.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 622e5c0584d..ace46d1f2ce 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3017,7 +3017,10 @@ def _list_to_str(lst): return ', '.join([str(e) for e in lst]) def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, - quantity_filter: Optional[int] = None): + quantity_filter: Optional[int] = None, + gpu_col_name: Optional[str] = None): + if gpu_col_name is None: + gpu_col_name = 'GPU' if quantity_filter: qty_header = 'QTY_FILTER' free_header = 'FILTERED_FREE_GPUS' @@ -3025,10 +3028,10 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, qty_header = 'QTY_PER_NODE' free_header = 'TOTAL_FREE_GPUS' realtime_gpu_table = log_utils.create_table( - ['GPU', qty_header, 'TOTAL_GPUS', free_header]) + [gpu_col_name, qty_header, 'TOTAL_GPUS', free_header]) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, - clouds=cloud, + clouds='kubernetes', name_filter=name_filter, region_filter=region, quantity_filter=quantity_filter, @@ -3070,6 +3073,7 @@ def _output(): cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( ) is not None + kubernetes_is_enabled = sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) if accelerator_str is None: # If cloud is kubernetes, we want to show real-time capacity @@ -3080,9 +3084,15 @@ def _output(): yield 
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return + # Optimization - do not poll for Kubernetes API for fetching + # common GPUs because that will be fetched later for the table after + # common GPUs. + clouds_to_list = cloud + if cloud is None and not show_all: + clouds_to_list = (c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes') result = service_catalog.list_accelerator_counts( gpus_only=True, - clouds=cloud, + clouds=clouds_to_list, region_filter=region, ) @@ -3092,6 +3102,11 @@ def _output(): gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) yield from gpu_table.get_string() + # Kubernetes GPUs with realtime information + if kubernetes_is_enabled: + yield '\n\n' + yield from _kubernetes_realtime_gpu_output(gpu_col_name='KUBERNETES_GPU') + # Google TPUs for tpu in service_catalog.get_tpus(): if tpu in result: @@ -3107,13 +3122,14 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if cloud is None and kubernetes_autoscaling: + if cloud is None and kubernetes_is_enabled and kubernetes_autoscaling: yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if cloud is None and kubernetes_autoscaling: + if cloud is None and kubernetes_is_enabled and kubernetes_autoscaling: + yield '\n' yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return else: From b75e471dd6976412e802232737d1aa981a002a51 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 18:57:25 -0700 Subject: [PATCH 12/26] lint --- sky/cli.py | 16 +++++++++++----- sky/clouds/service_catalog/kubernetes_catalog.py | 3 ++- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index ace46d1f2ce..aa644c51abb 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3073,7 +3073,9 @@ def _output(): cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( ) is not None - kubernetes_is_enabled = sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) + kubernetes_is_enabled = sky_clouds.cloud_in_iterable( + sky_clouds.Kubernetes(), + global_user_state.get_cached_enabled_clouds()) if accelerator_str is None: # If cloud is kubernetes, we want to show real-time capacity @@ -3089,7 +3091,8 @@ def _output(): # common GPUs. 
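
The name filtering added to the Kubernetes catalog above reduces to a regex match anchored at the start of the accelerator name, case-insensitive unless the caller asks otherwise, and a no-op when no filter is given. Roughly, with `name_matches` as an illustrative helper:

    import re
    from typing import Optional

    def name_matches(name_filter: Optional[str], accelerator_name: str,
                     case_sensitive: bool = True) -> bool:
        # Mirrors: regex_flags = 0 if case_sensitive else re.IGNORECASE
        #          name_filter and not re.match(name_filter, accelerator_name,
        #                                       flags=regex_flags)
        if name_filter is None:
            return True
        flags = 0 if case_sensitive else re.IGNORECASE
        return re.match(name_filter, accelerator_name, flags=flags) is not None

    assert name_matches('a100', 'A100-80GB', case_sensitive=False)  # CLI passes False
    assert not name_matches('a100', 'A100-80GB')                    # catalog default is True
    assert not name_matches('V100', 'A100-80GB', case_sensitive=False)
    assert name_matches(None, 'H100')                               # no filter matches all
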
clouds_to_list = cloud if cloud is None and not show_all: - clouds_to_list = (c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes') + clouds_to_list = ( + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes') result = service_catalog.list_accelerator_counts( gpus_only=True, clouds=clouds_to_list, @@ -3105,7 +3108,8 @@ def _output(): # Kubernetes GPUs with realtime information if kubernetes_is_enabled: yield '\n\n' - yield from _kubernetes_realtime_gpu_output(gpu_col_name='KUBERNETES_GPU') + yield from _kubernetes_realtime_gpu_output( + gpu_col_name='KUBERNETES_GPU') # Google TPUs for tpu in service_catalog.get_tpus(): @@ -3122,13 +3126,15 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if cloud is None and kubernetes_is_enabled and kubernetes_autoscaling: + if (cloud is None and kubernetes_is_enabled + and kubernetes_autoscaling): yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if cloud is None and kubernetes_is_enabled and kubernetes_autoscaling: + if (cloud is None and kubernetes_is_enabled + and kubernetes_autoscaling): yield '\n' yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return diff --git a/sky/clouds/service_catalog/kubernetes_catalog.py b/sky/clouds/service_catalog/kubernetes_catalog.py index 70e1b463460..602e19b5ff0 100644 --- a/sky/clouds/service_catalog/kubernetes_catalog.py +++ b/sky/clouds/service_catalog/kubernetes_catalog.py @@ -100,7 +100,8 @@ def list_accelerators_realtime( # Check if name_filter regex matches the accelerator_name regex_flags = 0 if case_sensitive else re.IGNORECASE - if name_filter and not re.match(name_filter, accelerator_name, flags=regex_flags): + if name_filter and not re.match( + name_filter, accelerator_name, flags=regex_flags): continue accelerator_count = int( From ba98957e22918f6d35551fe1c4e1358b29d12470 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 18:58:35 -0700 Subject: [PATCH 13/26] lint --- sky/cli.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index aa644c51abb..e2c1b87f548 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3126,15 +3126,15 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if (cloud is None and kubernetes_is_enabled - and kubernetes_autoscaling): + if (cloud is None and kubernetes_is_enabled and + kubernetes_autoscaling): yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if (cloud is None and kubernetes_is_enabled - and kubernetes_autoscaling): + if (cloud is None and kubernetes_is_enabled and + kubernetes_autoscaling): yield '\n' yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE return From b44b7597f336f9245e9d5fba00b7aad04bad09a5 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 19:11:33 -0700 Subject: [PATCH 14/26] fix for non-k8s cloud specified --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index e2c1b87f548..b26548da412 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3106,7 +3106,7 @@ def _output(): yield from gpu_table.get_string() # Kubernetes GPUs with realtime information - if kubernetes_is_enabled: + if cloud is None and kubernetes_is_enabled: yield '\n\n' yield from 
_kubernetes_realtime_gpu_output( gpu_col_name='KUBERNETES_GPU') From 57cc132ece7ef53208440e3f755bac04d0f24f4f Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 19:18:34 -0700 Subject: [PATCH 15/26] fix for region specified with k8s --- sky/cli.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index b26548da412..c7c29586b7e 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3013,6 +3013,15 @@ def show_gpus( if show_all and accelerator_str is not None: raise click.UsageError('--all is only allowed without a GPU name.') + # Kubernetes specific bools + cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) + kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None + kubernetes_is_enabled = sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) + + if cloud_is_kubernetes and region is not None: + raise click.UsageError( + 'The --region flag cannot be set with --cloud kubernetes.') + def _list_to_str(lst): return ', '.join([str(e) for e in lst]) @@ -3037,8 +3046,10 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, quantity_filter=quantity_filter, case_sensitive=False) assert (set(counts.keys()) == set(capacity.keys()) == set( - available.keys())), ('Keys of counts, capacity, ' - 'and available must be same.') + available.keys())), (f'Keys of counts ({list(counts.keys())}), ' + f'capacity ({list(capacity.keys())}), ' + f'and available ({list(available.keys())}) ' + 'must be same.') if len(counts) == 0: gpu_info_msg = '' debug_msg = 'To further debug, run: sky check.' @@ -3069,14 +3080,6 @@ def _output(): name, quantity = None, None - # Kubernetes specific bools - cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) - kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type( - ) is not None - kubernetes_is_enabled = sky_clouds.cloud_in_iterable( - sky_clouds.Kubernetes(), - global_user_state.get_cached_enabled_clouds()) - if accelerator_str is None: # If cloud is kubernetes, we want to show real-time capacity if cloud_is_kubernetes: From 46653864818eda93e8939ee82f5e98e0ed4745ab Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Thu, 23 May 2024 19:23:22 -0700 Subject: [PATCH 16/26] lint --- sky/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index c7c29586b7e..e9e0766e36f 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3016,7 +3016,8 @@ def show_gpus( # Kubernetes specific bools cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes) kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None - kubernetes_is_enabled = sky_clouds.cloud_in_iterable(sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) + kubernetes_is_enabled = sky_clouds.cloud_in_iterable( + sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds()) if cloud_is_kubernetes and region is not None: raise click.UsageError( From 400336fb1437cfe6bd1df092df0a1ae92407281e Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 12:23:33 -0700 Subject: [PATCH 17/26] show kubernetes in separate section --- sky/cli.py | 66 ++++++++++++++++++++----------- sky/provision/kubernetes/utils.py | 2 +- 2 files changed, 43 insertions(+), 25 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index e9e0766e36f..a9b474153ac 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3055,11 +3055,11 @@ def _kubernetes_realtime_gpu_output(name_filter: 
Optional[str] = None, gpu_info_msg = '' debug_msg = 'To further debug, run: sky check.' if name_filter is not None: - gpu_info_msg = f' matching name {name_filter!r}' - debug_msg = ('To list all available accelerators, ' - 'run: sky show-gpus --cloud kubernetes.') + gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: - gpu_info_msg += f' with quantity {quantity_filter}' + gpu_info_msg += f' with requested quantity {quantity_filter}' + debug_msg = ('To show available accelerators on kubernetes,' + ' run: sky show-gpus --cloud kubernetes') err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) yield err_msg @@ -3081,40 +3081,49 @@ def _output(): name, quantity = None, None + # Optimization - do not poll for Kubernetes API for fetching + # common GPUs because that will be fetched later for the table after + # common GPUs. + clouds_to_list = cloud + if cloud is None: + clouds_to_list = [ + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'] + if accelerator_str is None: # If cloud is kubernetes, we want to show real-time capacity - if cloud_is_kubernetes: + if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes): + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') yield from _kubernetes_realtime_gpu_output() + yield '\n\n' if kubernetes_utils.get_autoscaler_type() is not None: yield '\n' yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if cloud_is_kubernetes: + # Do not show clouds if --cloud kubernetes is specified + if not kubernetes_is_enabled: + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') return - # Optimization - do not poll for Kubernetes API for fetching - # common GPUs because that will be fetched later for the table after - # common GPUs. 
- clouds_to_list = cloud - if cloud is None and not show_all: - clouds_to_list = ( - c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes') result = service_catalog.list_accelerator_counts( gpus_only=True, clouds=clouds_to_list, region_filter=region, ) + if kubernetes_is_enabled and cloud is None: + # Show section headers only if Kubernetes is enabled and + # a cloud is not specified + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') + # "Common" GPUs for gpu in service_catalog.get_common_gpus(): if gpu in result: gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))]) yield from gpu_table.get_string() - # Kubernetes GPUs with realtime information - if cloud is None and kubernetes_is_enabled: - yield '\n\n' - yield from _kubernetes_realtime_gpu_output( - gpu_col_name='KUBERNETES_GPU') - # Google TPUs for tpu in service_catalog.get_tpus(): if tpu in result: @@ -3164,18 +3173,22 @@ def _output(): else: name, quantity = accelerator_str, None - if cloud_is_kubernetes: + if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') # Get real-time availability of GPUs for Kubernetes yield from _kubernetes_realtime_gpu_output(name_filter=name, quantity_filter=quantity) - return + if cloud_is_kubernetes: + return + yield '\n\n' # For clouds other than Kubernetes, get the accelerator details # Case-sensitive result = service_catalog.list_accelerators(gpus_only=True, name_filter=name, quantity_filter=quantity, region_filter=region, - clouds=cloud, + clouds=clouds_to_list, case_sensitive=False, all_regions=all_regions) # Import here to save module load speed. @@ -3207,14 +3220,19 @@ def _output(): new_result[gpu] = sorted_dataclasses result = new_result + if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Cloud GPUs{colorama.Style.RESET_ALL}\n') + if len(result) == 0: quantity_str = (f' with requested quantity {quantity}' if quantity else '') - yield f'Resources \'{name}\'{quantity_str} not found. ' - yield 'Try \'sky show-gpus --all\' ' - yield 'to show available accelerators.' + cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' + yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' + yield 'To show available accelerators, run: sky show-gpus --all' return + for i, (gpu, items) in enumerate(result.items()): accelerator_table_headers = [ 'GPU', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index c306a546e0d..af5cc1cb1aa 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -35,7 +35,7 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_ERROR_MESSAGE = 'No GPUs{gpu_info_msg} found in Kubernetes cluster. \ +NO_GPU_ERROR_MESSAGE = 'Resources{gpu_info_msg} not found in Kubernetes cluster. \ If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ (e.g., skypilot.co/accelerator) are setup correctly. 
\ {debug_msg}' From 3d3e1214369d43d8605bdaa9144b38968d8bb41a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 12:46:11 -0700 Subject: [PATCH 18/26] wip --- sky/cli.py | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index a9b474153ac..d996040e823 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3028,9 +3028,7 @@ def _list_to_str(lst): def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, quantity_filter: Optional[int] = None, - gpu_col_name: Optional[str] = None): - if gpu_col_name is None: - gpu_col_name = 'GPU' + raise_if_not_found: bool = False): if quantity_filter: qty_header = 'QTY_FILTER' free_header = 'FILTERED_FREE_GPUS' @@ -3038,7 +3036,7 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, qty_header = 'QTY_PER_NODE' free_header = 'TOTAL_FREE_GPUS' realtime_gpu_table = log_utils.create_table( - [gpu_col_name, qty_header, 'TOTAL_GPUS', free_header]) + ['GPU', qty_header, 'TOTAL_GPUS', free_header]) counts, capacity, available = service_catalog.list_accelerator_realtime( gpus_only=True, clouds='kubernetes', @@ -3062,6 +3060,8 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, ' run: sky show-gpus --cloud kubernetes') err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) + if raise_if_not_found: + raise ValueError(err_msg) yield err_msg return for gpu, _ in sorted(counts.items()): @@ -3090,15 +3090,27 @@ def _output(): c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'] if accelerator_str is None: + k8s_messages = '' + print_section_titles = False # If cloud is kubernetes, we want to show real-time capacity if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes): - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') - yield from _kubernetes_realtime_gpu_output() - yield '\n\n' + try: + # If --cloud kubernetes is not specified, we want to catch + # the case where no GPUs are available on the cluster and + # print the warning at the end. 
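
The restructuring above lets the Kubernetes realtime output raise `ValueError` when nothing matches, and the caller either surfaces that message immediately (when `--cloud kubernetes` was given) or defers it until after the cloud tables, as the follow-up commit refines. A stripped-down sketch of that control flow; the function names and table contents below are placeholders:

    def build_k8s_table(found: bool) -> str:
        if not found:
            raise ValueError('No GPUs found in Kubernetes cluster.')
        return 'GPU  QTY_PER_NODE  TOTAL_GPUS  TOTAL_FREE_GPUS\n...'

    def output(cloud_is_kubernetes: bool, found: bool):
        k8s_messages = ''
        try:
            table = build_k8s_table(found)
        except ValueError as e:
            if cloud_is_kubernetes:
                yield str(e)        # user explicitly asked for Kubernetes
                return
            k8s_messages = f'Note: {e}'
        else:
            yield 'Kubernetes GPUs\n' + table
            if cloud_is_kubernetes:
                return
        yield 'Cloud GPUs\n...'
        if k8s_messages:
            yield k8s_messages      # deferred to the end of the output

    print('\n'.join(output(cloud_is_kubernetes=False, found=False)))
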
+ k8s_output_generator = _kubernetes_realtime_gpu_output( + raise_if_not_found=(cloud is None) + ) + except ValueError as e: + k8s_messages += f'Note: {str(e)}\n' + else: + print_section_titles = True + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') + yield from k8s_output_generator + yield '\n\n' if kubernetes_utils.get_autoscaler_type() is not None: - yield '\n' - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + k8s_messages += kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: @@ -3112,9 +3124,8 @@ def _output(): region_filter=region, ) - if kubernetes_is_enabled and cloud is None: - # Show section headers only if Kubernetes is enabled and - # a cloud is not specified + if print_section_titles: + # If section titles were printed above, print again here yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Cloud GPUs{colorama.Style.RESET_ALL}\n') From e13ba3d86dbf046e9b26ccb86875cfd54ba6524d Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 15:49:43 -0700 Subject: [PATCH 19/26] move messages to the end --- sky/cli.py | 84 ++++++++++++++++++------------- sky/provision/kubernetes/utils.py | 6 +-- 2 files changed, 52 insertions(+), 38 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index d996040e823..6823224b5b3 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3026,9 +3026,9 @@ def show_gpus( def _list_to_str(lst): return ', '.join([str(e) for e in lst]) - def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, - quantity_filter: Optional[int] = None, - raise_if_not_found: bool = False): + def _get_kubernetes_realtime_gpu_table(name_filter: Optional[str] = None, + quantity_filter: Optional[int] = None + ): if quantity_filter: qty_header = 'QTY_FILTER' free_header = 'FILTERED_FREE_GPUS' @@ -3050,26 +3050,23 @@ def _kubernetes_realtime_gpu_output(name_filter: Optional[str] = None, f'and available ({list(available.keys())}) ' 'must be same.') if len(counts) == 0: - gpu_info_msg = '' + err_msg = 'No GPUs found in Kubernetes cluster. ' debug_msg = 'To further debug, run: sky check.' if name_filter is not None: gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: gpu_info_msg += f' with requested quantity {quantity_filter}' + err_msg = f'Resources{gpu_info_msg} not found in Kubernetes cluster. 
' debug_msg = ('To show available accelerators on kubernetes,' ' run: sky show-gpus --cloud kubernetes') - err_msg = kubernetes_utils.NO_GPU_ERROR_MESSAGE.format( - gpu_info_msg=gpu_info_msg, debug_msg=debug_msg) - if raise_if_not_found: - raise ValueError(err_msg) - yield err_msg - return + full_err_msg = err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg + raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): realtime_gpu_table.add_row([ gpu, _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu] ]) - yield from realtime_gpu_table.get_string() + return realtime_gpu_table def _output(): gpu_table = log_utils.create_table( @@ -3089,8 +3086,9 @@ def _output(): clouds_to_list = [ c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'] + k8s_messages = '' if accelerator_str is None: - k8s_messages = '' + # Collect k8s related messages in k8s_messages and print them at end print_section_titles = False # If cloud is kubernetes, we want to show real-time capacity if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes): @@ -3098,24 +3096,27 @@ def _output(): # If --cloud kubernetes is not specified, we want to catch # the case where no GPUs are available on the cluster and # print the warning at the end. - k8s_output_generator = _kubernetes_realtime_gpu_output( - raise_if_not_found=(cloud is None) - ) + k8s_realtime_table = _get_kubernetes_realtime_gpu_table() except ValueError as e: - k8s_messages += f'Note: {str(e)}\n' + if cloud_is_kubernetes: + # Immediately show the error msg if --cloud kubernetes + yield str(e) + else: + # Show the error message at the end if not specified + k8s_messages += f'Note: {str(e)}' else: print_section_titles = True yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') - yield from k8s_output_generator - yield '\n\n' + yield from k8s_realtime_table.get_string() if kubernetes_utils.get_autoscaler_type() is not None: - k8s_messages += kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + k8s_messages += '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: yield ('Kubernetes is not enabled. 
To fix, run: ' 'sky check kubernetes ') + yield k8s_messages return result = service_catalog.list_accelerator_counts( @@ -3126,6 +3127,7 @@ def _output(): if print_section_titles: # If section titles were printed above, print again here + yield '\n\n' yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Cloud GPUs{colorama.Style.RESET_ALL}\n') @@ -3150,17 +3152,15 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if (cloud is None and kubernetes_is_enabled and - kubernetes_autoscaling): - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if k8s_messages: + yield k8s_messages yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') - if (cloud is None and kubernetes_is_enabled and - kubernetes_autoscaling): - yield '\n' - yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if k8s_messages: + yield '\n\n' + yield k8s_messages return else: # Parse accelerator string @@ -3184,15 +3184,30 @@ def _output(): else: name, quantity = accelerator_str, None + print_section_titles = False if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') - # Get real-time availability of GPUs for Kubernetes - yield from _kubernetes_realtime_gpu_output(name_filter=name, + try: + k8s_realtime_table = _get_kubernetes_realtime_gpu_table(name_filter=name, quantity_filter=quantity) - if cloud_is_kubernetes: - return - yield '\n\n' + except ValueError as e: + if cloud_is_kubernetes: + yield str(e) + else: + k8s_messages += f'Note: {str(e)}' + else: + print_section_titles = True + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}') + yield from k8s_realtime_table.get_string() + if kubernetes_utils.get_autoscaler_type() is not None: + k8s_messages += '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if cloud_is_kubernetes: + # Do not show clouds if --cloud kubernetes is specified + if not kubernetes_is_enabled: + yield ('Kubernetes is not enabled. To fix, run: ' + 'sky check kubernetes ') + yield k8s_messages + return # For clouds other than Kubernetes, get the accelerator details # Case-sensitive result = service_catalog.list_accelerators(gpus_only=True, @@ -3231,7 +3246,8 @@ def _output(): new_result[gpu] = sorted_dataclasses result = new_result - if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: + if print_section_titles and not show_all: + yield '\n\n' yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Cloud GPUs{colorama.Style.RESET_ALL}\n') diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index af5cc1cb1aa..5e0564f9707 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -35,10 +35,8 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_ERROR_MESSAGE = 'Resources{gpu_info_msg} not found in Kubernetes cluster. \ -If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ -(e.g., skypilot.co/accelerator) are setup correctly. \ -{debug_msg}' +NO_GPU_HELP_MESSAGE = 'If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ +(e.g., skypilot.co/accelerator) are setup correctly. ' KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. 
' From 9e308e08a3642cff60cb9935994ab4c24aa9ba2a Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 15:53:43 -0700 Subject: [PATCH 20/26] lint --- sky/cli.py | 39 +++++++++++++++++++------------ sky/provision/kubernetes/utils.py | 6 +++-- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 6823224b5b3..e11f890ced5 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3026,9 +3026,9 @@ def show_gpus( def _list_to_str(lst): return ', '.join([str(e) for e in lst]) - def _get_kubernetes_realtime_gpu_table(name_filter: Optional[str] = None, - quantity_filter: Optional[int] = None - ): + def _get_kubernetes_realtime_gpu_table( + name_filter: Optional[str] = None, + quantity_filter: Optional[int] = None): if quantity_filter: qty_header = 'QTY_FILTER' free_header = 'FILTERED_FREE_GPUS' @@ -3055,11 +3055,15 @@ def _get_kubernetes_realtime_gpu_table(name_filter: Optional[str] = None, if name_filter is not None: gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: - gpu_info_msg += f' with requested quantity {quantity_filter}' - err_msg = f'Resources{gpu_info_msg} not found in Kubernetes cluster. ' + gpu_info_msg += (' with requested quantity' + f' {quantity_filter}') + err_msg = (f'Resources{gpu_info_msg} not found ' + 'in Kubernetes cluster. ') debug_msg = ('To show available accelerators on kubernetes,' ' run: sky show-gpus --cloud kubernetes') - full_err_msg = err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg + full_err_msg = (err_msg + + kubernetes_utils.NO_GPU_HELP_MESSAGE + + debug_msg) raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): realtime_gpu_table.add_row([ @@ -3084,7 +3088,8 @@ def _output(): clouds_to_list = cloud if cloud is None: clouds_to_list = [ - c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'] + c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes' + ] k8s_messages = '' if accelerator_str is None: @@ -3109,8 +3114,10 @@ def _output(): yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') yield from k8s_realtime_table.get_string() - if kubernetes_utils.get_autoscaler_type() is not None: - k8s_messages += '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if kubernetes_autoscaling: + k8s_messages += ('\n' + + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + ) if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: @@ -3185,10 +3192,11 @@ def _output(): name, quantity = accelerator_str, None print_section_titles = False - if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: + if kubernetes_is_enabled and (cloud is None or + cloud_is_kubernetes) and not show_all: try: - k8s_realtime_table = _get_kubernetes_realtime_gpu_table(name_filter=name, - quantity_filter=quantity) + k8s_realtime_table = _get_kubernetes_realtime_gpu_table( + name_filter=name, quantity_filter=quantity) except ValueError as e: if cloud_is_kubernetes: yield str(e) @@ -3199,8 +3207,9 @@ def _output(): yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' f'Kubernetes GPUs{colorama.Style.RESET_ALL}') yield from k8s_realtime_table.get_string() - if kubernetes_utils.get_autoscaler_type() is not None: - k8s_messages += '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE + if kubernetes_autoscaling: + k8s_messages += ('\n' + + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: @@ 
-3208,6 +3217,7 @@ def _output(): 'sky check kubernetes ') yield k8s_messages return + # For clouds other than Kubernetes, get the accelerator details # Case-sensitive result = service_catalog.list_accelerators(gpus_only=True, @@ -3259,7 +3269,6 @@ def _output(): yield 'To show available accelerators, run: sky show-gpus --all' return - for i, (gpu, items) in enumerate(result.items()): accelerator_table_headers = [ 'GPU', diff --git a/sky/provision/kubernetes/utils.py b/sky/provision/kubernetes/utils.py index 5e0564f9707..d5f91f639f6 100644 --- a/sky/provision/kubernetes/utils.py +++ b/sky/provision/kubernetes/utils.py @@ -35,8 +35,10 @@ 'T': 2**40, 'P': 2**50, } -NO_GPU_HELP_MESSAGE = 'If your cluster contains GPUs, make sure nvidia.com/gpu resource is available on the nodes and the node labels for identifying GPUs \ -(e.g., skypilot.co/accelerator) are setup correctly. ' +NO_GPU_HELP_MESSAGE = ('If your cluster contains GPUs, make sure ' + 'nvidia.com/gpu resource is available on the nodes and ' + 'the node labels for identifying GPUs ' + '(e.g., skypilot.co/accelerator) are setup correctly. ') KUBERNETES_AUTOSCALER_NOTE = ( 'Note: Kubernetes cluster autoscaling is enabled. ' From 8a36851f79850363a805b4ac3968a7c9e5a61350 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 15:55:55 -0700 Subject: [PATCH 21/26] lint --- sky/cli.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index e11f890ced5..07fd39792ff 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3061,8 +3061,7 @@ def _get_kubernetes_realtime_gpu_table( 'in Kubernetes cluster. ') debug_msg = ('To show available accelerators on kubernetes,' ' run: sky show-gpus --cloud kubernetes') - full_err_msg = (err_msg + - kubernetes_utils.NO_GPU_HELP_MESSAGE + + full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg) raise ValueError(full_err_msg) for gpu, _ in sorted(counts.items()): @@ -3115,9 +3114,8 @@ def _output(): f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') yield from k8s_realtime_table.get_string() if kubernetes_autoscaling: - k8s_messages += ('\n' + - kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE - ) + k8s_messages += ( + '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: From db958959f7fe8f61532f1876e7c6dff56aba20f8 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Fri, 24 May 2024 16:19:02 -0700 Subject: [PATCH 22/26] show sections if name is specified --- sky/cli.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 07fd39792ff..39c29d9ce36 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3192,19 +3192,19 @@ def _output(): print_section_titles = False if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and not show_all: + # Print section title if not showing all and instead a specific + # accelerator is requested + print_section_titles = True + yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' + f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n') try: k8s_realtime_table = _get_kubernetes_realtime_gpu_table( name_filter=name, quantity_filter=quantity) - except ValueError as e: - if cloud_is_kubernetes: - yield str(e) - else: - k8s_messages += f'Note: {str(e)}' - else: - print_section_titles = True - yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' - f'Kubernetes GPUs{colorama.Style.RESET_ALL}') yield from k8s_realtime_table.get_string() + except 
ValueError as e: + # In the case of a specific accelerator, show the error message + # immediately (e.g., "Resources H100 not found ...") + yield str(e) if kubernetes_autoscaling: k8s_messages += ('\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) @@ -3265,6 +3265,10 @@ def _output(): cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' yield 'To show available accelerators, run: sky show-gpus --all' + + if k8s_messages: + yield '\n' + yield k8s_messages return for i, (gpu, items) in enumerate(result.items()): @@ -3322,6 +3326,9 @@ def _output(): if i != 0: yield '\n\n' yield from accelerator_table.get_string() + if k8s_messages: + yield '\n' + yield k8s_messages if show_all: click.echo_via_pager(_output()) From 91a43565ce63e108201f07b1b481d136dc189959 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 27 May 2024 10:30:43 -0700 Subject: [PATCH 23/26] comments --- sky/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 39c29d9ce36..2cd243e86b6 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3190,8 +3190,8 @@ def _output(): name, quantity = accelerator_str, None print_section_titles = False - if kubernetes_is_enabled and (cloud is None or - cloud_is_kubernetes) and not show_all: + if (kubernetes_is_enabled and (cloud is None or + cloud_is_kubernetes) and not show_all): # Print section title if not showing all and instead a specific # accelerator is requested print_section_titles = True From 8e48e683c4a215e1cfd3b8cce02250926b54ba89 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 27 May 2024 11:22:46 -0700 Subject: [PATCH 24/26] lint --- sky/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index 2cd243e86b6..aa118c3af01 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3190,8 +3190,8 @@ def _output(): name, quantity = accelerator_str, None print_section_titles = False - if (kubernetes_is_enabled and (cloud is None or - cloud_is_kubernetes) and not show_all): + if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and + not show_all): # Print section title if not showing all and instead a specific # accelerator is requested print_section_titles = True From 997bec180eb0452f785b06aeb461d4054eb0d834 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 27 May 2024 12:59:53 -0700 Subject: [PATCH 25/26] fix bugs and move warning for show_all to the top --- sky/cli.py | 34 ++++++++++++++-------------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/sky/cli.py b/sky/cli.py index aa118c3af01..74b6d38545a 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3051,7 +3051,7 @@ def _get_kubernetes_realtime_gpu_table( 'must be same.') if len(counts) == 0: err_msg = 'No GPUs found in Kubernetes cluster. ' - debug_msg = 'To further debug, run: sky check.' + debug_msg = 'To further debug, run: sky check ' if name_filter is not None: gpu_info_msg = f' {name_filter!r}' if quantity_filter is not None: @@ -3060,7 +3060,7 @@ def _get_kubernetes_realtime_gpu_table( err_msg = (f'Resources{gpu_info_msg} not found ' 'in Kubernetes cluster. ') debug_msg = ('To show available accelerators on kubernetes,' - ' run: sky show-gpus --cloud kubernetes') + ' run: sky show-gpus --cloud kubernetes ') full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE + debug_msg) raise ValueError(full_err_msg) @@ -3102,12 +3102,10 @@ def _output(): # print the warning at the end. 
k8s_realtime_table = _get_kubernetes_realtime_gpu_table() except ValueError as e: - if cloud_is_kubernetes: - # Immediately show the error msg if --cloud kubernetes - yield str(e) - else: - # Show the error message at the end if not specified - k8s_messages += f'Note: {str(e)}' + if not cloud_is_kubernetes: + # Make it a note if cloud is not kubernetes + k8s_messages += f'Note: ' + k8s_messages += str(e) else: print_section_titles = True yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}' @@ -3124,6 +3122,12 @@ def _output(): yield k8s_messages return + # For show_all, show the k8s message at the start since output is + # long and the user may not scroll to the end. + if show_all and k8s_messages: + yield k8s_messages + yield '\n\n' + result = service_catalog.list_accelerator_counts( gpus_only=True, clouds=clouds_to_list, @@ -3157,14 +3161,11 @@ def _output(): other_table.add_row([gpu, _list_to_str(qty)]) yield from other_table.get_string() yield '\n\n' - if k8s_messages: - yield k8s_messages - yield '\n\n' else: yield ('\n\nHint: use -a/--all to see all accelerators ' '(including non-common ones) and pricing.') if k8s_messages: - yield '\n\n' + yield '\n' yield k8s_messages return else: @@ -3208,12 +3209,12 @@ def _output(): if kubernetes_autoscaling: k8s_messages += ('\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE) + yield k8s_messages if cloud_is_kubernetes: # Do not show clouds if --cloud kubernetes is specified if not kubernetes_is_enabled: yield ('Kubernetes is not enabled. To fix, run: ' 'sky check kubernetes ') - yield k8s_messages return # For clouds other than Kubernetes, get the accelerator details @@ -3265,10 +3266,6 @@ def _output(): cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.' yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} ' yield 'To show available accelerators, run: sky show-gpus --all' - - if k8s_messages: - yield '\n' - yield k8s_messages return for i, (gpu, items) in enumerate(result.items()): @@ -3326,9 +3323,6 @@ def _output(): if i != 0: yield '\n\n' yield from accelerator_table.get_string() - if k8s_messages: - yield '\n' - yield k8s_messages if show_all: click.echo_via_pager(_output()) From 72f08d91c03386dc55b13ceecc113505017c30f1 Mon Sep 17 00:00:00 2001 From: Romil Bhardwaj Date: Mon, 27 May 2024 13:02:15 -0700 Subject: [PATCH 26/26] lint --- sky/cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sky/cli.py b/sky/cli.py index 74b6d38545a..0bcec3d2f4b 100644 --- a/sky/cli.py +++ b/sky/cli.py @@ -3104,7 +3104,7 @@ def _output(): except ValueError as e: if not cloud_is_kubernetes: # Make it a note if cloud is not kubernetes - k8s_messages += f'Note: ' + k8s_messages += 'Note: ' k8s_messages += str(e) else: print_section_titles = True
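
A minimal, self-contained sketch of the control flow these patches converge on in `show_gpus`/`_output()` (not taken from sky/cli.py; `fetch_k8s_realtime_gpus` and the inline table strings below are simplified stand-ins for `service_catalog.list_accelerator_realtime` and `log_utils.create_table`): the table builder raises `ValueError` when no matching GPUs exist, and the caller either surfaces that error immediately (`--cloud kubernetes`) or downgrades it to a trailing `Note:` message collected in `k8s_messages`, so Kubernetes warnings never interrupt the realtime table output.

from typing import Dict, Iterator, List, Optional, Tuple


def fetch_k8s_realtime_gpus(
) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
    # Stand-in for service_catalog.list_accelerator_realtime(clouds='kubernetes'):
    # per-GPU requestable counts, total capacity, and currently free GPUs.
    return {'A100': [1, 2, 4]}, {'A100': 8}, {'A100': 3}


def get_k8s_realtime_gpu_table(name_filter: Optional[str] = None) -> str:
    # Mirrors _get_kubernetes_realtime_gpu_table: build the table, or raise
    # ValueError so the caller decides whether the error is fatal or a note.
    counts, capacity, available = fetch_k8s_realtime_gpus()
    if name_filter is not None:
        counts = {
            g: q for g, q in counts.items() if name_filter.lower() in g.lower()
        }
    if not counts:
        if name_filter is None:
            raise ValueError('No GPUs found in Kubernetes cluster. ')
        raise ValueError(
            f'Resources {name_filter!r} not found in Kubernetes cluster. ')
    rows = [
        f'{g}  {", ".join(map(str, q))}  {capacity[g]}  {available[g]}'
        for g, q in sorted(counts.items())
    ]
    return 'GPU  QTY_PER_NODE  TOTAL_GPUS  TOTAL_FREE_GPUS\n' + '\n'.join(rows)


def output(cloud: Optional[str], autoscaling_enabled: bool) -> Iterator[str]:
    # Mirrors _output(): emit tables eagerly, collect Kubernetes notes in
    # k8s_messages, and flush them at the end of the stream.
    k8s_messages = ''
    cloud_is_kubernetes = (cloud == 'kubernetes')
    if cloud is None or cloud_is_kubernetes:
        try:
            table = get_k8s_realtime_gpu_table()
        except ValueError as e:
            # Fatal only when Kubernetes was explicitly requested.
            if cloud_is_kubernetes:
                yield str(e)
                return
            k8s_messages += f'Note: {e}'
        else:
            yield 'Kubernetes GPUs\n'
            yield table
        if autoscaling_enabled:
            k8s_messages += '\nNote: Kubernetes cluster autoscaling is enabled.'
        if cloud_is_kubernetes:
            yield k8s_messages
            return
    yield '\n\nCloud GPUs\n'
    yield '(per-cloud catalog tables would be emitted here)'
    if k8s_messages:
        yield '\n' + k8s_messages


if __name__ == '__main__':
    print(''.join(output(cloud=None, autoscaling_enabled=True)))

Run as-is, this prints the Kubernetes realtime table first and the autoscaler note last, matching the ordering the series settles on: messages moved to the end of the output, except under --all, where the long catalog dump follows and the Kubernetes notes are emitted up front instead.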