Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[k8s] Realtime GPU availability of kubernetes cluster in sky show-gpus #3499

Merged
merged 28 commits into from
May 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
e6b975d
wip
romilbhardwaj Apr 30, 2024
a6b5bfc
filtering support
romilbhardwaj Apr 30, 2024
1346159
lint
romilbhardwaj Apr 30, 2024
6bbbf25
update doc
romilbhardwaj Apr 30, 2024
a263365
rename headers
romilbhardwaj May 2, 2024
6dfb785
Merge branch 'master' of https://github.com/skypilot-org/skypilot int…
romilbhardwaj May 17, 2024
0bd06a4
comments
romilbhardwaj May 17, 2024
6bf3045
add TODO
romilbhardwaj May 17, 2024
f960322
Add autoscaler note
romilbhardwaj May 18, 2024
8e1821d
Merge branch 'master' of https://github.com/skypilot-org/skypilot int…
romilbhardwaj May 23, 2024
8878254
case sensitive fix
romilbhardwaj May 24, 2024
3fe8fc6
case sensitive fix
romilbhardwaj May 24, 2024
2203d6b
show kubernetes GPUs in a separate table in sky show-gpus
romilbhardwaj May 24, 2024
b75e471
lint
romilbhardwaj May 24, 2024
ba98957
lint
romilbhardwaj May 24, 2024
b44b759
fix for non-k8s cloud specified
romilbhardwaj May 24, 2024
57cc132
fix for region specified with k8s
romilbhardwaj May 24, 2024
4665386
lint
romilbhardwaj May 24, 2024
400336f
show kubernetes in separate section
romilbhardwaj May 24, 2024
3d3e121
wip
romilbhardwaj May 24, 2024
e13ba3d
move messages to the end
romilbhardwaj May 24, 2024
9e308e0
lint
romilbhardwaj May 24, 2024
8a36851
lint
romilbhardwaj May 24, 2024
db95895
show sections if name is specified
romilbhardwaj May 24, 2024
91a4356
comments
romilbhardwaj May 27, 2024
8e48e68
lint
romilbhardwaj May 27, 2024
997bec1
fix bugs and move warning for show_all to the top
romilbhardwaj May 27, 2024
72f08d9
lint
romilbhardwaj May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
204 changes: 166 additions & 38 deletions sky/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -2966,17 +2966,31 @@ def show_gpus(
To show all regions for a specified accelerator, use
``sky show-gpus <accelerator> --all-regions``.

If ``--region`` or ``--all-regions`` is not specified, the price displayed
for each instance type is the lowest across all regions for both on-demand
and spot instances. There may be multiple regions with the same lowest
price.

If ``--cloud kubernetes`` is specified, it will show the maximum quantities
of the GPU available on a single node and the real-time availability of
the GPU across all nodes in the Kubernetes cluster.

Definitions of certain fields:

* ``DEVICE_MEM``: Memory of a single device; does not depend on the device
count of the instance (VM).

* ``HOST_MEM``: Memory of the host instance (VM).

If ``--region`` or ``--all-regions`` is not specified, the price displayed
for each instance type is the lowest across all regions for both on-demand
and spot instances. There may be multiple regions with the same lowest
price.
* ``QTY_PER_NODE`` (Kubernetes only): GPU quantities that can be requested
on a single node.

* ``TOTAL_GPUS`` (Kubernetes only): Total number of GPUs available in the
Kubernetes cluster.

* ``TOTAL_FREE_GPUS`` (Kubernetes only): Number of currently free GPUs
in the Kubernetes cluster. This is fetched in real-time and may change
when other users are using the cluster.
"""
# validation for the --region flag
if region is not None and cloud is None:
Expand All @@ -2999,9 +3013,64 @@ def show_gpus(
if show_all and accelerator_str is not None:
raise click.UsageError('--all is only allowed without a GPU name.')

# Kubernetes specific bools
cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes)
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())

if cloud_is_kubernetes and region is not None:
raise click.UsageError(
'The --region flag cannot be set with --cloud kubernetes.')

def _list_to_str(lst):
return ', '.join([str(e) for e in lst])

def _get_kubernetes_realtime_gpu_table(
        name_filter: Optional[str] = None,
        quantity_filter: Optional[int] = None):
    """Build the table of Kubernetes GPUs with their real-time availability.

    Args:
        name_filter: Accelerator name to restrict the listing to. None
            lists every GPU returned for the cluster.
        quantity_filter: Requested GPU quantity to restrict the listing to.
            When set, the quantity and free-GPU columns use the filtered
            headers ('QTY_FILTER' / 'FILTERED_FREE_GPUS').

    Returns:
        A table with one row per GPU, ordered by GPU name. Columns are the
        GPU name, its quantities, the total GPU count and the free count.

    Raises:
        ValueError: If the realtime listing returns no GPUs. The message
            names the requested resource when ``name_filter`` is given.
    """
    if quantity_filter:
        qty_header = 'QTY_FILTER'
        free_header = 'FILTERED_FREE_GPUS'
    else:
        qty_header = 'QTY_PER_NODE'
        free_header = 'TOTAL_FREE_GPUS'
    realtime_gpu_table = log_utils.create_table(
        ['GPU', qty_header, 'TOTAL_GPUS', free_header])
    # `region` comes from the enclosing command's scope.
    counts, capacity, available = service_catalog.list_accelerator_realtime(
        gpus_only=True,
        clouds='kubernetes',
        name_filter=name_filter,
        region_filter=region,
        quantity_filter=quantity_filter,
        case_sensitive=False)
    # Internal invariant: the three mappings must describe the same GPUs,
    # otherwise the row lookups below would fail.
    assert (set(counts.keys()) == set(capacity.keys()) == set(
        available.keys())), (f'Keys of counts ({list(counts.keys())}), '
                             f'capacity ({list(capacity.keys())}), '
                             f'and available ({list(available.keys())}) '
                             'must be same.')
    if not counts:
        if name_filter is not None:
            gpu_info_msg = f' {name_filter!r}'
            if quantity_filter is not None:
                gpu_info_msg += (' with requested quantity'
                                 f' {quantity_filter}')
            err_msg = (f'Resources{gpu_info_msg} not found '
                       'in Kubernetes cluster. ')
            debug_msg = ('To show available accelerators on kubernetes,'
                         ' run: sky show-gpus --cloud kubernetes ')
        else:
            err_msg = 'No GPUs found in Kubernetes cluster. '
            debug_msg = 'To further debug, run: sky check '
        raise ValueError(err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE +
                         debug_msg)
    # Walk GPU names in sorted order and read each mapping by key; the
    # mappings are left untouched rather than being drained with pop().
    for gpu in sorted(counts):
        realtime_gpu_table.add_row([
            gpu,
            _list_to_str(counts[gpu]), capacity[gpu], available[gpu]
        ])
    return realtime_gpu_table

def _output():
gpu_table = log_utils.create_table(
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
Expand All @@ -3012,35 +3081,69 @@ def _output():

name, quantity = None, None

# Kubernetes specific bools
cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes)
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type(
) is not None
# Optimization - do not poll for Kubernetes API for fetching
# common GPUs because that will be fetched later for the table after
# common GPUs.
clouds_to_list = cloud
if cloud is None:
clouds_to_list = [
c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
]

k8s_messages = ''
if accelerator_str is None:
# Collect k8s related messages in k8s_messages and print them at end
print_section_titles = False
# If cloud is kubernetes, we want to show real-time capacity
if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
try:
# If --cloud kubernetes is not specified, we want to catch
# the case where no GPUs are available on the cluster and
# print the warning at the end.
k8s_realtime_table = _get_kubernetes_realtime_gpu_table()
except ValueError as e:
if not cloud_is_kubernetes:
# Make it a note if cloud is not kubernetes
k8s_messages += 'Note: '
k8s_messages += str(e)
else:
print_section_titles = True
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
yield from k8s_realtime_table.get_string()
if kubernetes_autoscaling:
k8s_messages += (
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
if cloud_is_kubernetes:
# Do not show clouds if --cloud kubernetes is specified
if not kubernetes_is_enabled:
yield ('Kubernetes is not enabled. To fix, run: '
'sky check kubernetes ')
yield k8s_messages
return

# For show_all, show the k8s message at the start since output is
# long and the user may not scroll to the end.
if show_all and k8s_messages:
yield k8s_messages
yield '\n\n'

result = service_catalog.list_accelerator_counts(
gpus_only=True,
clouds=cloud,
clouds=clouds_to_list,
region_filter=region,
)

if len(result) == 0 and cloud_is_kubernetes:
yield kubernetes_utils.NO_GPU_ERROR_MESSAGE
if kubernetes_autoscaling:
yield '\n'
yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE
return
if print_section_titles:
# If section titles were printed above, print again here
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Cloud GPUs{colorama.Style.RESET_ALL}\n')

# "Common" GPUs
# If cloud is kubernetes, we want to show all GPUs here, even if
# they are not listed as common in SkyPilot.
if cloud_is_kubernetes:
for gpu, _ in sorted(result.items()):
for gpu in service_catalog.get_common_gpus():
if gpu in result:
gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))])
else:
for gpu in service_catalog.get_common_gpus():
if gpu in result:
gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))])
yield from gpu_table.get_string()

# Google TPUs
Expand All @@ -3058,16 +3161,12 @@ def _output():
other_table.add_row([gpu, _list_to_str(qty)])
yield from other_table.get_string()
yield '\n\n'
if (cloud_is_kubernetes or
cloud is None) and kubernetes_autoscaling:
yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE
yield '\n\n'
else:
yield ('\n\nHint: use -a/--all to see all accelerators '
'(including non-common ones) and pricing.')
if (cloud_is_kubernetes or
cloud is None) and kubernetes_autoscaling:
yield kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE
if k8s_messages:
yield '\n'
yield k8s_messages
return
else:
# Parse accelerator string
Expand All @@ -3091,12 +3190,40 @@ def _output():
else:
name, quantity = accelerator_str, None

print_section_titles = False
if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and
not show_all):
# Print section title if not showing all and instead a specific
# accelerator is requested
print_section_titles = True
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
try:
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
name_filter=name, quantity_filter=quantity)
yield from k8s_realtime_table.get_string()
except ValueError as e:
# In the case of a specific accelerator, show the error message
# immediately (e.g., "Resources H100 not found ...")
yield str(e)
if kubernetes_autoscaling:
k8s_messages += ('\n' +
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
yield k8s_messages
if cloud_is_kubernetes:
# Do not show clouds if --cloud kubernetes is specified
if not kubernetes_is_enabled:
yield ('Kubernetes is not enabled. To fix, run: '
'sky check kubernetes ')
return

# For clouds other than Kubernetes, get the accelerator details
# Case-sensitive
result = service_catalog.list_accelerators(gpus_only=True,
name_filter=name,
quantity_filter=quantity,
region_filter=region,
clouds=cloud,
clouds=clouds_to_list,
case_sensitive=False,
all_regions=all_regions)
# Import here to save module load speed.
Expand Down Expand Up @@ -3128,16 +3255,17 @@ def _output():
new_result[gpu] = sorted_dataclasses
result = new_result

if len(result) == 0:
if cloud == 'kubernetes':
yield kubernetes_utils.NO_GPU_ERROR_MESSAGE
return
if print_section_titles and not show_all:
yield '\n\n'
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
f'Cloud GPUs{colorama.Style.RESET_ALL}\n')

if len(result) == 0:
quantity_str = (f' with requested quantity {quantity}'
if quantity else '')
yield f'Resources \'{name}\'{quantity_str} not found. '
yield 'Try \'sky show-gpus --all\' '
yield 'to show available accelerators.'
cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.'
yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} '
yield 'To show available accelerators, run: sky show-gpus --all'
return

for i, (gpu, items) in enumerate(result.items()):
Expand Down
40 changes: 40 additions & 0 deletions sky/clouds/service_catalog/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,46 @@ def list_accelerator_counts(
return ret


def list_accelerator_realtime(
    gpus_only: bool = True,
    name_filter: Optional[str] = None,
    region_filter: Optional[str] = None,
    quantity_filter: Optional[int] = None,
    clouds: CloudFilter = None,
    case_sensitive: bool = True,
) -> Tuple[Dict[str, List[int]], Dict[str, int], Dict[str, int]]:
    """List accelerators together with their realtime availability.

    Realtime availability means the total number of accelerators in the
    cluster and the number available at the time of the call. Intended for
    fixed size cluster settings, such as Kubernetes.

    The filters are forwarded unchanged to the 'list_accelerators_realtime'
    catalog call, always with ``all_regions=False`` and
    ``require_price=False``.

    Returns:
        A tuple of three dictionaries keyed by canonical accelerator name:
            - Sorted list of available counts. (e.g., [1, 2, 4])
            - Total number of accelerators in the cluster (capacity).
            - Number of accelerators available at the time of call
              (availability).
    """
    catalog_result = _map_clouds_catalog(clouds,
                                         'list_accelerators_realtime',
                                         gpus_only,
                                         name_filter,
                                         region_filter,
                                         quantity_filter,
                                         case_sensitive=case_sensitive,
                                         all_regions=False,
                                         require_price=False)
    offerings_by_gpu, capacity_by_gpu, available_by_gpu = catalog_result

    # Reduce each GPU's offerings to the ascending list of its counts.
    counts_by_gpu: Dict[str, List[int]] = collections.defaultdict(list)
    for gpu_name, offerings in offerings_by_gpu.items():
        counts_by_gpu[gpu_name] = sorted(
            offering.accelerator_count for offering in offerings)
    return counts_by_gpu, capacity_by_gpu, available_by_gpu


def instance_type_exists(instance_type: str,
clouds: CloudFilter = None) -> bool:
"""Check the existence of a instance type."""
Expand Down
Loading
Loading