Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Add labels field to resources #3464

Merged
merged 18 commits into from
May 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions docs/source/reference/config.rst
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ Available fields and semantics:
#
# Users should guarantee that these key-values are valid AWS tags, otherwise
# errors from the cloud provider will be surfaced.
instance_tags:
labels:
# (Example) AWS Migration Acceleration Program (MAP). This tag enables the
# program's discounts.
# Ref: https://docs.aws.amazon.com/mgn/latest/ug/map-program-tagging.html
Expand Down Expand Up @@ -142,9 +142,9 @@ Available fields and semantics:
#
# Users should guarantee that these key-values are valid GCP labels, otherwise
# errors from the cloud provider will be surfaced.
instance_tags:
labels:
Owner: user-unique-name
my-tag: my-value
my-label: my-value

# VPC to use (optional).
#
Expand Down
14 changes: 14 additions & 0 deletions docs/source/reference/yaml-spec.rst
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,20 @@ Available fields:
# To use a more limited but easier to manage tool:
# https://github.com/IBM/vpc-img-inst

# Labels to apply to the instances (optional).
#
# If specified, these labels will be applied to the VMs or pods created
# by SkyPilot. These are useful for assigning metadata that may be
# used by external tools. Implementation depends on the chosen cloud -
# On AWS, labels map to instance tags. On GCP, labels map to instance
# labels. On Kubernetes, labels map to pod labels. On other clouds,
# labels are not supported and will be ignored.
romilbhardwaj marked this conversation as resolved.
Show resolved Hide resolved
#
# Note: Labels are applied only on the first launch of the cluster. They
# are not updated on subsequent launches.
labels:
my-label: my-value

# Candidate resources (optional). If specified, SkyPilot will only use
# these candidate resources to launch the cluster. The fields specified
# outside of `any_of`, `ordered` will be used as the default values for
Expand Down
23 changes: 15 additions & 8 deletions sky/backends/backend_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -840,13 +840,20 @@ def write_cluster_config(
ssh_proxy_command = ssh_proxy_command_config[region_name]
logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')

# User-supplied instance tags.
instance_tags = {}
instance_tags = skypilot_config.get_nested(
(str(cloud).lower(), 'instance_tags'), {})
# instance_tags is a dict, which is guaranteed by the type check in
# User-supplied global labels from ~/.sky/config.yaml.
labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
# Deprecated: instance_tags have been replaced by labels. For backward
# compatibility, we support them and the schema allows them only if
# `labels` are not specified. This should be removed after 0.7.0.
labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
labels)
# labels is a dict, which is guaranteed by the type check in
# schemas.py
assert isinstance(instance_tags, dict), instance_tags
assert isinstance(labels, dict), labels

# Get labels from resources and override from the labels to_provision.
if to_provision.labels:
labels.update(to_provision.labels)

# Dump the Ray ports to a file for Ray job submission
dump_port_command = (
Expand Down Expand Up @@ -879,8 +886,8 @@ def write_cluster_config(
'vpc_name': skypilot_config.get_nested(
(str(cloud).lower(), 'vpc_name'), None),

# User-supplied instance tags.
'instance_tags': instance_tags,
# User-supplied labels.
'labels': labels,
# The reservation pools that specified by the user. This is
# currently only used by GCP.
'specific_reservations': specific_reservations,
Expand Down
19 changes: 19 additions & 0 deletions sky/clouds/aws.py
Original file line number Diff line number Diff line change
Expand Up @@ -962,3 +962,22 @@ def delete_image(cls, image_id: str, region: Optional[str]) -> None:
error_msg=f'Failed to delete image {image_id!r} on {region}.',
stderr=stderr,
stream_logs=True)

@classmethod
def is_label_valid(cls, label_key: str,
                   label_value: str) -> Tuple[bool, Optional[str]]:
    """Validates that a label can be used as an AWS instance tag.

    Args:
        label_key: The tag key. Must be 1-128 non-whitespace characters and
            must not start with the reserved 'aws:' prefix.
        label_value: The tag value. Must be 0-256 non-whitespace characters.

    Returns:
        A tuple ``(valid, reason)``. ``reason`` is None when the label is
        valid; otherwise it describes every problem found (key and/or
        value).
    """
    # A negative lookahead is required to reject the literal 'aws:' prefix.
    # A character class such as `[^aws:]` would instead reject every key
    # whose first character is 'a', 'w', 's' or ':' (e.g. 'app', 'stage').
    key_regex = re.compile(r'(?!aws:)\S{1,128}')
    value_regex = re.compile(r'\S{0,256}')

    # fullmatch() is used instead of '^...$' because '$' also matches just
    # before a trailing newline, which would let 'key\n' through.
    errors = []
    if key_regex.fullmatch(label_key) is None:
        errors.append(f'Invalid tag key {label_key} for AWS. '
                      'Key must not start with \'aws:\', must not contain '
                      'whitespace, and must be 128 characters or fewer in '
                      'length.')
    if value_regex.fullmatch(label_value) is None:
        errors.append(f'Invalid tag value {label_value} for AWS. '
                      'Value must not contain whitespace and must be 256 '
                      'characters or fewer in length.')

    # Report all problems at once so that a bad value does not hide a bad
    # key.
    if errors:
        return False, ' '.join(errors)
    return True, None
19 changes: 19 additions & 0 deletions sky/clouds/cloud.py
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,25 @@ def is_image_tag_valid(cls, image_tag: str, region: Optional[str]) -> bool:
region,
clouds=cls._REPR.lower())

@classmethod
def is_label_valid(cls, label_key: str,
                   label_value: str) -> Tuple[bool, Optional[str]]:
    """Checks whether a label key/value pair is acceptable for this cloud.

    Each cloud realizes labels differently (instance tags on AWS, labels on
    GCP, pod labels on Kubernetes), so every cloud that supports labels is
    expected to override this method with its own format rules.

    Args:
        label_key: The label key to validate.
        label_value: The label value to validate.

    Returns:
        A ``(valid, reason)`` tuple: ``valid`` tells whether the label is
        accepted, and ``reason`` is an optional explanation that is set
        when the label is rejected.
    """
    # Default behavior for clouds without label support: labels are ignored
    # downstream, so every pair is reported as valid. The arguments are
    # intentionally unused.
    del label_key
    del label_value
    result: Tuple[bool, Optional[str]] = (True, None)
    return result

def get_feasible_launchable_resources(
self,
resources: 'resources_lib.Resources',
Expand Down
22 changes: 22 additions & 0 deletions sky/clouds/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,3 +1069,25 @@ def delete_image(cls, image_id: str, region: Optional[str]) -> None:
error_msg=f'Failed to delete image {image_name!r}',
stderr=stderr,
stream_logs=True)

@classmethod
def is_label_valid(cls, label_key: str,
                   label_value: str) -> Tuple[bool, Optional[str]]:
    """Validates that a label can be used as a GCP instance label.

    Args:
        label_key: The label key. Must start with a lowercase letter and
            contain only lowercase alphanumerics, dashes and underscores,
            1-63 characters in total.
        label_value: The label value. May be empty; otherwise only
            lowercase alphanumerics, dashes and underscores, up to 63
            characters.

    Returns:
        A tuple ``(valid, reason)``. ``reason`` is None when the label is
        valid; otherwise it describes every problem found (key and/or
        value).
    """
    key_regex = re.compile(r'[a-z]([a-z0-9_-]{0,62})?')
    value_regex = re.compile(r'[a-z0-9_-]{0,63}')
    condition_msg = ('can include lowercase alphanumeric characters, '
                     'dashes, and underscores, with a total length of 63 '
                     'characters or less.')

    # fullmatch() is used instead of '^...$' because '$' also matches just
    # before a trailing newline, which would let 'key\n' through.
    errors = []
    if key_regex.fullmatch(label_key) is None:
        errors.append(f'Invalid label key {label_key} for GCP. '
                      f'Key must start with a lowercase letter '
                      f'and {condition_msg}')
    if value_regex.fullmatch(label_value) is None:
        errors.append(f'Invalid label value {label_value} for GCP. Value '
                      f'{condition_msg}')

    # Report all problems at once so that a bad value does not hide a bad
    # key.
    if errors:
        return False, ' '.join(errors)
    return True, None
25 changes: 25 additions & 0 deletions sky/clouds/kubernetes.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Kubernetes."""
import json
import os
import re
import typing
from typing import Dict, Iterator, List, Optional, Tuple

Expand Down Expand Up @@ -398,3 +399,27 @@ def get_current_user_identity(cls) -> Optional[List[str]]:
return [f'{cluster}_{user}_{namespace}']
except k8s.config.config_exception.ConfigException:
return None

@classmethod
def is_label_valid(cls, label_key: str,
                   label_value: str) -> Tuple[bool, Optional[str]]:
    """Validates that a label can be used as a Kubernetes pod label.

    Args:
        label_key: The label key: an optional DNS-subdomain prefix (at most
            253 characters) followed by '/', then a name of at most 63
            characters. Example: 'app.kubernetes.io/component'.
        label_value: The label value: 1-63 alphanumeric characters, '-',
            '_' or '.', starting and ending with an alphanumeric character.

    Returns:
        A tuple ``(valid, reason)``. ``reason`` is None when the label is
        valid; otherwise it describes every problem found (key and/or
        value).
    """
    # One DNS label: lowercase alphanumerics and dashes, alphanumeric at
    # both ends.
    dns_label = r'[a-z0-9](?:[-a-z0-9]*[a-z0-9])?'
    # Optional prefix: dot-separated DNS labels (a DNS subdomain) of at most
    # 253 characters, terminated by a slash. Dots must be allowed here,
    # otherwise common keys such as 'app.kubernetes.io/name' are rejected.
    prefix = (r'(?:(?=[^/]{1,253}/)' + dns_label + r'(?:\.' + dns_label +
              r')*/)?')
    # Name segment: at most 63 characters, alphanumeric at both ends.
    name = r'[a-z0-9](?:[-a-z0-9_.]{0,61}[a-z0-9])?'
    key_regex = re.compile(prefix + name)
    value_regex = re.compile(
        r'[a-zA-Z0-9]([-a-zA-Z0-9_.]{0,61}[a-zA-Z0-9])?')

    # fullmatch() is used instead of '^...$' because '$' also matches just
    # before a trailing newline, which would let 'key\n' through.
    errors = []
    if key_regex.fullmatch(label_key) is None:
        errors.append(f'Invalid label key {label_key} for Kubernetes. '
                      'Key must be an optional DNS subdomain prefix '
                      'followed by \'/\', and a name of no more than 63 '
                      'characters consisting of lowercase alphanumeric '
                      'characters or \'-\', \'_\', \'.\', starting and '
                      'ending with an alphanumeric character.')
    if value_regex.fullmatch(label_value) is None:
        errors.append(f'Invalid label value {label_value} for Kubernetes. '
                      'Value must consist of alphanumeric characters or '
                      '\'-\', \'_\', \'.\', and must be no more than 63 '
                      'characters in length.')

    # Report all problems at once so that a bad value does not hide a bad
    # key.
    if errors:
        return False, ' '.join(errors)
    return True, None
63 changes: 61 additions & 2 deletions sky/resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Resources:
"""
# If any fields changed, increment the version. For backward compatibility,
# modify the __setstate__ method to handle the old version.
_VERSION = 16
_VERSION = 17

def __init__(
self,
Expand All @@ -62,6 +62,7 @@ def __init__(
disk_size: Optional[int] = None,
disk_tier: Optional[Union[str, resources_utils.DiskTier]] = None,
ports: Optional[Union[int, str, List[str], Tuple[str]]] = None,
labels: Optional[Dict[str, str]] = None,
# Internal use only.
# pylint: disable=invalid-name
_docker_login_config: Optional[docker_utils.DockerLoginConfig] = None,
Expand Down Expand Up @@ -130,7 +131,13 @@ def __init__(
disk_tier: the disk performance tier to use. If None, defaults to
``'medium'``.
ports: the ports to open on the instance.
_docker_login_config: the docker configuration to use. This include
labels: the labels to apply to the instance. These are useful for
assigning metadata that may be used by external tools.
Implementation depends on the chosen cloud - On AWS, labels map to
instance tags. On GCP, labels map to instance labels. On
Kubernetes, labels map to pod labels. On other clouds, labels are
not supported and will be ignored.
_docker_login_config: the docker configuration to use. This includes
the docker username, password, and registry server. If None, skip
docker login.
_requires_fuse: whether the task requires FUSE mounting support. This
Expand Down Expand Up @@ -205,6 +212,8 @@ def __init__(
ports = None
self._ports = ports

self._labels = labels

self._docker_login_config = _docker_login_config

self._requires_fuse = _requires_fuse
Expand All @@ -219,6 +228,7 @@ def __init__(
self._try_validate_image_id()
self._try_validate_disk_tier()
self._try_validate_ports()
self._try_validate_labels()

# When querying the accelerators inside this func (we call self.accelerators
# which is a @property), we will check the cloud's catalog, which can error
Expand Down Expand Up @@ -417,6 +427,10 @@ def disk_tier(self) -> resources_utils.DiskTier:
def ports(self) -> Optional[List[str]]:
return self._ports

@property
def labels(self) -> Optional[Dict[str, str]]:
    """The labels stored on this object, or None if none were stored."""
    stored_labels = self._labels
    return stored_labels

@property
def is_image_managed(self) -> Optional[bool]:
return self._is_image_managed
Expand Down Expand Up @@ -932,6 +946,34 @@ def _try_validate_ports(self) -> None:
# We don't need to check the ports format since that is already done
# in resources_utils.simplify_ports

def _try_validate_labels(self) -> None:
"""Try to validate the labels attribute.

Raises:
ValueError: if the attribute is invalid.
"""
if not self._labels:
return

if self.cloud is None:
# Because each cloud has its own label format, we cannot validate
# the labels without knowing the cloud.
with ux_utils.print_exception_no_traceback():
raise ValueError(
'Cloud must be specified when labels are provided.')
Comment on lines +958 to +963
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will prevent failover from working. Should we just apply a global label format check instead? It might be fine to sacrifice some flexibility. Otherwise, a user has to specify the following to enable all the clouds they have, which might be a bit unintuitive:

resources:
  labels:
    mykey1: myvalue1
  any_of:
    - cloud: aws
    - cloud: gcp

Copy link
Collaborator Author

@romilbhardwaj romilbhardwaj Apr 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Supporting failover is slightly tricky....

For example, labels in Kubernetes commonly use / to create a "namespace" for tags (e.g., skypilot.co/accelerators, app.kubernetes.io/component.. see recommended labels). However, GCP does not support / in labels, and a failover would cause provisioning to fail. Putting a stricter global format check would prevent users from creating these labels at all.

Another option could be to not do any validation at all and let these checks fail at provisioning time. We could do that, but this seemed cleaner so went with this for now. Any other ideas? Happy to change if you think we should support failover with labels.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see! I think it should be fine to require the cloud to be present when labels are specified. Another option to support failover could be the following:

Validate the labels in the following function, and return an empty list if the labels do not match the requirement for a specific cloud, i.e. make the current cloud infeasible:

def get_feasible_launchable_resources(
self,
resources: 'resources_lib.Resources',
num_nodes: int = 1
) -> Tuple[List['resources_lib.Resources'], List[str]]:
"""Returns ([feasible and launchable resources], [fuzzy candidates]).
Feasible resources refer to an offering respecting the resource
requirements. Currently, this function implements "filtering" the
cloud's offerings only w.r.t. accelerators constraints.
Launchable resources require a cloud and an instance type be assigned.
Fuzzy candidates example: when the requested GPU is A100:1 but is not
available in a cloud/region, the fuzzy candidates are results of a fuzzy
search in the catalog that are offered in the location. E.g.,
['A100-80GB:1', 'A100-80GB:2', 'A100-80GB:4', 'A100:8']
"""
if resources.is_launchable():
self._check_instance_type_accelerators_combination(resources)
resources_required_features = resources.get_required_cloud_features()
if num_nodes > 1:
resources_required_features.add(
CloudImplementationFeatures.MULTI_NODE)
try:
self.check_features_are_supported(resources,
resources_required_features)
except exceptions.NotSupportedError:
# TODO(zhwu): The resources are now silently filtered out. We
# should have some logging telling the user why the resources
# are not considered.
return ([], [])
return self._get_feasible_launchable_resources(resources)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, that could work. Seems a little hacky though, since the error now becomes hidden in the warning instead of being a clear error.

Checking in resources.py (current):

sky launch task.yaml
Task from YAML spec: task.yaml
ValueError: Invalid label my/label=my/value. Invalid label value my/value for GCP. Value can include lowercase alphanumeric characters, dashes, and underscores, with a total length of 63 characters or less.

Checking in get_feasible_launchable_resources with warning (last proposal):

$ sky launch task.yaml
Task from YAML spec: task.yaml
W 04-30 15:36:34 cloud.py:383] Label my/label=my/value is invalid for cloud GCP. Reason: Invalid label value my/value for GCP. Value can include lowercase alphanumeric characters, dashes, and underscores, with a total length of 63 characters or less.
I 04-30 15:36:34 optimizer.py:1209] No resource satisfying GCP({'T4': 1}) on GCP.
sky.exceptions.ResourcesUnavailableError: Catalog does not contain any instances satisfying the request:
Task(run=<empty>)
  resources: GCP({'T4': 1}).

To fix: relax or change the resource requirements.

Hint: sky show-gpus to list available accelerators.
      sky check to check the enabled clouds.

wdyt? I'm leaning towards keeping the current variant, but can change to checking in get_feasible_launchable_resources to support failover if you feel strongly about it.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Keeping the current way sounds good to me, but we may want to file an issue for it for supporting failover when the labels are specified.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Filed #3500!


# Check if the label key value pairs are valid.
invalid_table = log_utils.create_table(['Label', 'Reason'])
for key, value in self._labels.items():
valid, err_msg = self.cloud.is_label_valid(key, value)
if not valid:
invalid_table.add_row([f'{key}: {value}', err_msg])
if len(invalid_table.rows) > 0:
with ux_utils.print_exception_no_traceback():
raise ValueError(
'The following labels are invalid:'
'\n\t' + invalid_table.get_string().replace('\n', '\n\t'))

def get_cost(self, seconds: float) -> float:
"""Returns cost in USD for the runtime in seconds."""
hours = seconds / 3600
Expand Down Expand Up @@ -1158,6 +1200,7 @@ def copy(self, **override) -> 'Resources':
image_id=override.pop('image_id', self.image_id),
disk_tier=override.pop('disk_tier', self.disk_tier),
ports=override.pop('ports', self.ports),
labels=override.pop('labels', self.labels),
_docker_login_config=override.pop('_docker_login_config',
self._docker_login_config),
_is_image_managed=override.pop('_is_image_managed',
Expand Down Expand Up @@ -1210,7 +1253,18 @@ def _override_resources(
resources_list = []
for override_config in override_configs:
new_resource_config = base_resource_config.copy()
# Labels are handled separately.
override_labels = override_config.pop('labels', None)
new_resource_config.update(override_config)

# Update the labels with the override labels.
labels = new_resource_config.get('labels', None)
if labels is not None and override_labels is not None:
labels.update(override_labels)
elif override_labels is not None:
labels = override_labels
Michaelvll marked this conversation as resolved.
Show resolved Hide resolved
new_resource_config['labels'] = labels

# Call from_yaml_config again instead of
# _from_yaml_config_single to handle the case, where both
# multiple accelerators and `any_of` is specified.
Expand Down Expand Up @@ -1297,6 +1351,7 @@ def _from_yaml_config_single(cls, config: Dict[str, str]) -> 'Resources':
resources_fields['image_id'] = config.pop('image_id', None)
resources_fields['disk_tier'] = config.pop('disk_tier', None)
resources_fields['ports'] = config.pop('ports', None)
resources_fields['labels'] = config.pop('labels', None)
resources_fields['_docker_login_config'] = config.pop(
'_docker_login_config', None)
resources_fields['_is_image_managed'] = config.pop(
Expand Down Expand Up @@ -1341,6 +1396,7 @@ def add_if_not_none(key, value):
if self.disk_tier is not None:
config['disk_tier'] = self.disk_tier.value
add_if_not_none('ports', self.ports)
add_if_not_none('labels', self.labels)
if self._docker_login_config is not None:
config['_docker_login_config'] = dataclasses.asdict(
self._docker_login_config)
Expand Down Expand Up @@ -1451,4 +1507,7 @@ def __setstate__(self, state):
# set the default to True for backward compatibility.
state['_requires_fuse'] = state.get('_requires_fuse', True)

if version < 17:
state['_labels'] = state.get('_labels', None)

self.__dict__.update(state)
8 changes: 4 additions & 4 deletions sky/templates/aws-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,10 @@ available_node_types:
Tags:
- Key: skypilot-user
Value: {{ user }}
{%- for tag_key, tag_value in instance_tags.items() %}
- Key: {{ tag_key }}
Value: {{ tag_value|tojson }}
{%- endfor %}
{%- for label_key, label_value in labels.items() %}
- Key: {{ label_key }}
Value: {{ label_value|tojson }}
{%- endfor %}

head_node_type: ray.head.default

Expand Down
4 changes: 2 additions & 2 deletions sky/templates/gcp-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -76,8 +76,8 @@ available_node_types:
node_config:
labels:
skypilot-user: {{ user }}
{%- for tag_key, tag_value in instance_tags.items() %}
{{ tag_key }}: {{ tag_value|tojson }}
{%- for label_key, label_value in labels.items() %}
{{ label_key }}: {{ label_value|tojson }}
{%- endfor %}
{%- if specific_reservations %}
reservationAffinity:
Expand Down
4 changes: 4 additions & 0 deletions sky/templates/kubernetes-ray.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ available_node_types:
skypilot-cluster: {{cluster_name_on_cloud}}
# Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
skypilot-ssh-jump: {{k8s_ssh_jump_name}}
# Custom labels for the pods
{%- for label_key, label_value in labels.items() %}
{{ label_key }}: {{ label_value }}
{%- endfor %}
{% if k8s_fuse_device_required %}
annotations:
# Required for FUSE mounting to access /dev/fuse
Expand Down
Loading
Loading