From 5ce0b2c9b738444a76bb10319125f9c0318ab37a Mon Sep 17 00:00:00 2001 From: Kevin Date: Thu, 9 May 2024 16:38:25 -0400 Subject: [PATCH] simplify function calls and add option for custom resources Signed-off-by: Kevin --- src/codeflare_sdk/cluster/cluster.py | 96 ++++----- src/codeflare_sdk/cluster/config.py | 109 +++++++++-- src/codeflare_sdk/cluster/model.py | 7 +- .../templates/base-template.yaml | 4 - src/codeflare_sdk/utils/generate_yaml.py | 182 ++++++++++-------- src/codeflare_sdk/utils/pretty_print.py | 2 +- tests/e2e/local_interactive_sdk_kind_test.py | 1 - tests/e2e/local_interactive_sdk_oauth_test.py | 1 - tests/e2e/mnist_raycluster_sdk_kind_test.py | 1 - tests/e2e/mnist_raycluster_sdk_oauth_test.py | 1 - tests/e2e/start_ray_cluster.py | 1 - tests/test-case-bad.yaml | 4 +- tests/test-case-no-kueue-no-aw.yaml | 4 +- tests/test-case-no-mcad.yamls | 4 +- tests/test-case.yaml | 4 +- tests/test-default-appwrapper.yaml | 10 +- tests/unit_test.py | 60 +++--- tests/unit_test_support.py | 2 +- tests/upgrade/raycluster_sdk_upgrade_test.py | 1 - 19 files changed, 289 insertions(+), 205 deletions(-) diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index 78bc666cf..f0f50eb36 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -29,6 +29,7 @@ from ..utils import pretty_print from ..utils.generate_yaml import ( generate_appwrapper, + head_worker_gpu_count_from_cluster, ) from ..utils.kube_api_helpers import _kube_api_error_handling from ..utils.generate_yaml import is_openshift_cluster @@ -118,48 +119,7 @@ def create_app_wrapper(self): f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) - # Before attempting to create the cluster AW, let's evaluate the ClusterConfig - - name = self.config.name - namespace = self.config.namespace - head_cpus = self.config.head_cpus - head_memory = self.config.head_memory - num_head_gpus = self.config.num_head_gpus - worker_cpu_requests = self.config.worker_cpu_requests - worker_cpu_limits = self.config.worker_cpu_limits - worker_memory_requests = self.config.worker_memory_requests - worker_memory_limits = self.config.worker_memory_limits - num_worker_gpus = self.config.num_worker_gpus - workers = self.config.num_workers - template = self.config.template - image = self.config.image - appwrapper = self.config.appwrapper - env = self.config.envs - image_pull_secrets = self.config.image_pull_secrets - write_to_file = self.config.write_to_file - local_queue = self.config.local_queue - labels = self.config.labels - return generate_appwrapper( - name=name, - namespace=namespace, - head_cpus=head_cpus, - head_memory=head_memory, - num_head_gpus=num_head_gpus, - worker_cpu_requests=worker_cpu_requests, - worker_cpu_limits=worker_cpu_limits, - worker_memory_requests=worker_memory_requests, - worker_memory_limits=worker_memory_limits, - num_worker_gpus=num_worker_gpus, - workers=workers, - template=template, - image=image, - appwrapper=appwrapper, - env=env, - image_pull_secrets=image_pull_secrets, - write_to_file=write_to_file, - local_queue=local_queue, - labels=labels, - ) + return generate_appwrapper(self) # creates a new cluster with the provided or default spec def up(self): @@ -305,7 +265,7 @@ def status( if print_to_console: # overriding the number of gpus with requested - cluster.worker_gpu = self.config.num_worker_gpus + _, cluster.worker_gpu = head_worker_gpu_count_from_cluster(self) pretty_print.print_cluster_status(cluster) elif print_to_console: if status == CodeFlareClusterStatus.UNKNOWN: @@ -443,6 +403,29 @@ def job_logs(self, job_id: str) -> str: """ return self.job_client.get_job_logs(job_id) + @staticmethod + def _head_worker_extended_resources_from_rc_dict(rc: Dict) -> Tuple[dict, dict]: + head_extended_resources, worker_extended_resources = {}, {} + for resource in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + worker_extended_resources[resource] = rc["spec"]["workerGroupSpecs"][0][ + "template" + ]["spec"]["containers"][0]["resources"]["limits"][resource] + + for resource in rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"].keys(): + if resource in ["memory", "cpu"]: + continue + head_extended_resources[resource] = rc["spec"]["headGroupSpec"]["template"][ + "spec" + ]["containers"][0]["resources"]["limits"][resource] + + return head_extended_resources, worker_extended_resources + def from_k8_cluster_object( rc, appwrapper=True, @@ -456,6 +439,11 @@ def from_k8_cluster_object( else [] ) + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + cluster_config = ClusterConfiguration( name=rc["metadata"]["name"], namespace=rc["metadata"]["namespace"], @@ -473,11 +461,8 @@ def from_k8_cluster_object( worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ "containers" ][0]["resources"]["limits"]["memory"], - num_worker_gpus=int( - rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"] - ), + worker_extended_resource_requests=worker_extended_resources, + head_extended_resource_requests=head_extended_resources, image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["image"], @@ -858,6 +843,11 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: protocol = "https" dashboard_url = f"{protocol}://{ingress.spec.rules[0].host}" + ( + head_extended_resources, + worker_extended_resources, + ) = Cluster._head_worker_extended_resources_from_rc_dict(rc) + return RayCluster( name=rc["metadata"]["name"], status=status, @@ -872,7 +862,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ 0 ]["resources"]["limits"]["cpu"], - worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for + worker_extended_resources=worker_extended_resources, namespace=rc["metadata"]["namespace"], head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" @@ -880,9 +870,7 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]: head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ "resources" ]["limits"]["memory"], - head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][ - "resources" - ]["limits"]["nvidia.com/gpu"], + head_extended_resources=head_extended_resources, dashboard=dashboard_url, ) @@ -907,12 +895,12 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster: worker_mem_min=cluster.config.worker_memory_requests, worker_mem_max=cluster.config.worker_memory_limits, worker_cpu=cluster.config.worker_cpu_requests, - worker_gpu=cluster.config.num_worker_gpus, + worker_extended_resources=cluster.config.worker_extended_resource_requests, namespace=cluster.config.namespace, dashboard=cluster.cluster_dashboard_uri(), head_cpus=cluster.config.head_cpus, head_mem=cluster.config.head_memory, - head_gpu=cluster.config.num_head_gpus, + head_extended_resources=cluster.config.head_extended_resource_requests, ) if ray.status == CodeFlareClusterStatus.READY: ray.status = RayClusterStatus.READY diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index cb8e3d3d0..6a522fbc4 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -25,12 +25,51 @@ dir = pathlib.Path(__file__).parent.parent.resolve() +# https://docs.ray.io/en/latest/ray-core/scheduling/accelerators.html +DEFAULT_RESOURCE_MAPPING = { + "nvidia.com/gpu": "GPU", + "intel.com/gpu": "GPU", + "amd.com/gpu": "GPU", + "aws.amazon.com/neuroncore": "neuron_cores", + "google.com/tpu": "TPU", + "habana.ai/gaudi": "HPU", + "huawei.com/Ascend910": "NPU", + "huawei.com/Ascend310": "NPU", +} + @dataclass class ClusterConfiguration: """ This dataclass is used to specify resource requirements and other details, and is passed in as an argument when creating a Cluster object. + + Attributes: + - name: The name of the cluster. + - namespace: The namespace in which the cluster should be created. + - head_info: A list of strings containing information about the head node. + - head_cpus: The number of CPUs to allocate to the head node. + - head_memory: The amount of memory to allocate to the head node. + - head_gpus: The number of GPUs to allocate to the head node. (Deprecated, use head_extended_resource_requests) + - head_extended_resource_requests: A dictionary of extended resource requests for the head node. ex: {"nvidia.com/gpu": 1} + - machine_types: A list of machine types to use for the cluster. + - min_cpus: The minimum number of CPUs to allocate to each worker. + - max_cpus: The maximum number of CPUs to allocate to each worker. + - num_workers: The number of workers to create. + - min_memory: The minimum amount of memory to allocate to each worker. + - max_memory: The maximum amount of memory to allocate to each worker. + - num_gpus: The number of GPUs to allocate to each worker. (Deprecated, use worker_extended_resource_requests) + - template: The path to the template file to use for the cluster. + - appwrapper: A boolean indicating whether to use an AppWrapper. + - envs: A dictionary of environment variables to set for the cluster. + - image: The image to use for the cluster. + - image_pull_secrets: A list of image pull secrets to use for the cluster. + - write_to_file: A boolean indicating whether to write the cluster configuration to a file. + - verify_tls: A boolean indicating whether to verify TLS when connecting to the cluster. + - labels: A dictionary of labels to apply to the cluster. + - worker_extended_resource_requests: A dictionary of extended resource requests for each worker. ex: {"nvidia.com/gpu": 1} + - extended_resource_mapping: A dictionary of custom resource mappings to map extended resource requests to RayCluster resource names + - overwrite_default_resource_mapping: A boolean indicating whether to overwrite the default resource mapping. """ name: str @@ -39,7 +78,7 @@ class ClusterConfiguration: head_cpus: typing.Union[int, str] = 2 head_memory: typing.Union[int, str] = 8 head_gpus: int = None # Deprecating - num_head_gpus: int = 0 + head_extended_resource_requests: typing.Dict[str, int] = field(default_factory=dict) machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"] worker_cpu_requests: typing.Union[int, str] = 1 worker_cpu_limits: typing.Union[int, str] = 1 @@ -50,7 +89,6 @@ class ClusterConfiguration: worker_memory_limits: typing.Union[int, str] = 2 min_memory: typing.Union[int, str] = None # Deprecating max_memory: typing.Union[int, str] = None # Deprecating - num_worker_gpus: int = 0 num_gpus: int = None # Deprecating template: str = f"{dir}/templates/base-template.yaml" appwrapper: bool = False @@ -60,6 +98,11 @@ class ClusterConfiguration: write_to_file: bool = False verify_tls: bool = True labels: dict = field(default_factory=dict) + worker_extended_resource_requests: typing.Dict[str, int] = field( + default_factory=dict + ) + extended_resource_mapping: typing.Dict[str, str] = field(default_factory=dict) + overwrite_default_resource_mapping: bool = False def __post_init__(self): if not self.verify_tls: @@ -70,8 +113,60 @@ def __post_init__(self): self._memory_to_string() self._str_mem_no_unit_add_GB() self._memory_to_resource() - self._gpu_to_resource() self._cpu_to_resource() + self._gpu_to_resource() + self._combine_extended_resource_mapping() + self._validate_extended_resource_requests(self.head_extended_resource_requests) + self._validate_extended_resource_requests( + self.worker_extended_resource_requests + ) + + def _combine_extended_resource_mapping(self): + if overwritten := set(self.extended_resource_mapping.keys()).intersection( + DEFAULT_RESOURCE_MAPPING.keys() + ): + if self.overwrite_default_resource_mapping: + warnings.warn( + f"Overwriting default resource mapping for {overwritten}", + UserWarning, + ) + else: + raise ValueError( + f"Resource mapping already exists for {overwritten}, set overwrite_default_resource_mapping to True to overwrite" + ) + self.extended_resource_mapping = { + **DEFAULT_RESOURCE_MAPPING, + **self.extended_resource_mapping, + } + + def _validate_extended_resource_requests( + self, extended_resources: typing.Dict[str, int] + ): + for k in extended_resources.keys(): + if k not in self.extended_resource_mapping.keys(): + raise ValueError( + f"extended resource '{k}' not found in extended_resource_mapping, available resources are {list(self.extended_resource_mapping.keys())}, to add more supported resources use extended_resource_mapping. i.e. extended_resource_mapping = {{'{k}': 'FOO_BAR'}}" + ) + + def _gpu_to_resource(self): + if self.head_gpus: + warnings.warn( + f"head_gpus is being deprecated, replacing with head_extended_resource_requests['nvidia.com/gpu'] = {self.head_gpus}" + ) + if "nvidia.com/gpu" in self.head_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in head_extended_resource_requests" + ) + self.head_extended_resource_requests["nvidia.com/gpu"] = self.head_gpus + if self.num_gpus: + warnings.warn( + f"num_gpus is being deprecated, replacing with worker_extended_resource_requests['nvidia.com/gpu'] = {self.num_gpus}" + ) + if "nvidia.com/gpu" in self.worker_extended_resource_requests: + raise ValueError( + "nvidia.com/gpu already exists in worker_extended_resource_requests" + ) + self.worker_extended_resource_requests["nvidia.com/gpu"] = self.num_gpus def _str_mem_no_unit_add_GB(self): if isinstance(self.head_memory, str) and self.head_memory.isdecimal(): @@ -95,14 +190,6 @@ def _memory_to_string(self): if isinstance(self.worker_memory_limits, int): self.worker_memory_limits = f"{self.worker_memory_limits}G" - def _gpu_to_resource(self): - if self.head_gpus: - warnings.warn("head_gpus is being deprecated, use num_head_gpus") - self.num_head_gpus = self.head_gpus - if self.num_gpus: - warnings.warn("num_gpus is being deprecated, use num_worker_gpus") - self.num_worker_gpus = self.num_gpus - def _cpu_to_resource(self): if self.min_cpus: warnings.warn("min_cpus is being deprecated, use worker_cpu_requests") diff --git a/src/codeflare_sdk/cluster/model.py b/src/codeflare_sdk/cluster/model.py index 2547de254..5d6e2ed2a 100644 --- a/src/codeflare_sdk/cluster/model.py +++ b/src/codeflare_sdk/cluster/model.py @@ -18,8 +18,9 @@ dataclasses to store information for Ray clusters and AppWrappers. """ -from dataclasses import dataclass +from dataclasses import dataclass, field from enum import Enum +import typing class RayClusterStatus(Enum): @@ -74,14 +75,14 @@ class RayCluster: status: RayClusterStatus head_cpus: int head_mem: str - head_gpu: int workers: int worker_mem_min: str worker_mem_max: str worker_cpu: int - worker_gpu: int namespace: str dashboard: str + worker_extended_resources: typing.Dict[str, int] = field(default_factory=dict) + head_extended_resources: typing.Dict[str, int] = field(default_factory=dict) @dataclass diff --git a/src/codeflare_sdk/templates/base-template.yaml b/src/codeflare_sdk/templates/base-template.yaml index 7b36146a7..076bd2623 100644 --- a/src/codeflare_sdk/templates/base-template.yaml +++ b/src/codeflare_sdk/templates/base-template.yaml @@ -86,11 +86,9 @@ spec: limits: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 requests: cpu: 2 memory: "8G" - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -163,11 +161,9 @@ spec: limits: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" requests: cpu: "2" memory: "12G" - nvidia.com/gpu: "1" volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index cde49ed32..1644dc15e 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -17,6 +17,7 @@ (in the cluster sub-module) for AppWrapper generation. """ +import json from typing import Optional import typing import yaml @@ -31,6 +32,7 @@ from base64 import b64encode from urllib3.util import parse_url from kubernetes.client.exceptions import ApiException +import codeflare_sdk def read_template(template): @@ -78,10 +80,13 @@ def is_kind_cluster(): return False -def update_names(cluster_yaml, cluster_name, namespace): - meta = cluster_yaml.get("metadata") - meta["name"] = cluster_name - meta["namespace"] = namespace +def update_names( + cluster_yaml: dict, + cluster: "codeflare_sdk.cluster.Cluster", +): + metadata = cluster_yaml.get("metadata") + metadata["name"] = cluster.config.name + metadata["namespace"] = cluster.config.namespace def update_image(spec, image): @@ -114,7 +119,7 @@ def update_resources( worker_cpu_limits, worker_memory_requests, worker_memory_limits, - num_worker_gpus, + custom_resources, ): container = spec.get("containers") for resource in container: @@ -122,59 +127,103 @@ def update_resources( if requests is not None: requests["cpu"] = worker_cpu_requests requests["memory"] = worker_memory_requests - requests["nvidia.com/gpu"] = num_worker_gpus limits = resource.get("resources").get("limits") if limits is not None: limits["cpu"] = worker_cpu_limits limits["memory"] = worker_memory_limits - limits["nvidia.com/gpu"] = num_worker_gpus + for k in custom_resources.keys(): + limits[k] = custom_resources[k] + requests[k] = custom_resources[k] + + +def head_worker_gpu_count_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[int, int]: + head_gpus = 0 + worker_gpus = 0 + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + head_gpus += int(cluster.config.head_extended_resource_requests[k]) + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type == "GPU": + worker_gpus += int(cluster.config.worker_extended_resource_requests[k]) + + return head_gpus, worker_gpus + + +FORBIDDEN_CUSTOM_RESOURCE_TYPES = ["GPU", "CPU", "memory"] + + +def head_worker_resources_from_cluster( + cluster: "codeflare_sdk.cluster.Cluster", +) -> typing.Tuple[dict, dict]: + to_return = {}, {} + for k in cluster.config.head_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[0][resource_type] = cluster.config.head_extended_resource_requests[ + k + ] + to_return[0].get(resource_type, 0) + + for k in cluster.config.worker_extended_resource_requests.keys(): + resource_type = cluster.config.extended_resource_mapping[k] + if resource_type in FORBIDDEN_CUSTOM_RESOURCE_TYPES: + continue + to_return[1][resource_type] = cluster.config.worker_extended_resource_requests[ + k + ] + to_return[1].get(resource_type, 0) + return to_return def update_nodes( - cluster_yaml, - appwrapper_name, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - num_head_gpus, + ray_cluster_dict: dict, + cluster: "codeflare_sdk.cluster.Cluster", ): - head = cluster_yaml.get("spec").get("headGroupSpec") - head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus)) + head = ray_cluster_dict.get("spec").get("headGroupSpec") + worker = ray_cluster_dict.get("spec").get("workerGroupSpecs")[0] + head_gpus, worker_gpus = head_worker_gpu_count_from_cluster(cluster) + head_resources, worker_resources = head_worker_resources_from_cluster(cluster) + head_resources = json.dumps(head_resources).replace('"', '\\"') + head_resources = f'"{head_resources}"' + worker_resources = json.dumps(worker_resources).replace('"', '\\"') + worker_resources = f'"{worker_resources}"' + head["rayStartParams"]["num-gpus"] = str(head_gpus) + head["rayStartParams"]["resources"] = head_resources - worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0] # Head counts as first worker - worker["replicas"] = workers - worker["minReplicas"] = workers - worker["maxReplicas"] = workers - worker["groupName"] = "small-group-" + appwrapper_name - worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus)) + worker["replicas"] = cluster.config.num_workers + worker["minReplicas"] = cluster.config.num_workers + worker["maxReplicas"] = cluster.config.num_workers + worker["groupName"] = "small-group-" + cluster.config.name + worker["rayStartParams"]["num-gpus"] = str(worker_gpus) + worker["rayStartParams"]["resources"] = worker_resources for comp in [head, worker]: spec = comp.get("template").get("spec") - update_image_pull_secrets(spec, image_pull_secrets) - update_image(spec, image) - update_env(spec, env) + update_image_pull_secrets(spec, cluster.config.image_pull_secrets) + update_image(spec, cluster.config.image) + update_env(spec, cluster.config.envs) if comp == head: # TODO: Eventually add head node configuration outside of template update_resources( - spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus + spec, + cluster.config.head_cpus, + cluster.config.head_cpus, + cluster.config.head_memory, + cluster.config.head_memory, + cluster.config.head_extended_resource_requests, ) else: update_resources( spec, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, + cluster.config.worker_cpu_requests, + cluster.config.worker_cpu_limits, + cluster.config.worker_memory_requests, + cluster.config.worker_memory_limits, + cluster.config.worker_extended_resource_requests, ) @@ -278,63 +327,30 @@ def write_user_yaml(user_yaml, output_file_name): print(f"Written to: {output_file_name}") -def generate_appwrapper( - name: str, - namespace: str, - head_cpus: int, - head_memory: int, - num_head_gpus: int, - worker_cpu_requests: int, - worker_cpu_limits: int, - worker_memory_requests: int, - worker_memory_limits: int, - num_worker_gpus: int, - workers: int, - template: str, - image: str, - appwrapper: bool, - env, - image_pull_secrets: list, - write_to_file: bool, - local_queue: Optional[str], - labels, -): - cluster_yaml = read_template(template) - appwrapper_name, cluster_name = gen_names(name) - update_names(cluster_yaml, cluster_name, namespace) - update_nodes( +def generate_appwrapper(cluster: "codeflare_sdk.cluster.Cluster"): + cluster_yaml = read_template(cluster.config.template) + appwrapper_name, _ = gen_names(cluster.config.name) + update_names( cluster_yaml, - appwrapper_name, - worker_cpu_requests, - worker_cpu_limits, - worker_memory_requests, - worker_memory_limits, - num_worker_gpus, - workers, - image, - env, - image_pull_secrets, - head_cpus, - head_memory, - num_head_gpus, + cluster, ) - augment_labels(cluster_yaml, labels) + update_nodes(cluster_yaml, cluster) + augment_labels(cluster_yaml, cluster.config.labels) notebook_annotations(cluster_yaml) - user_yaml = ( - wrap_cluster(cluster_yaml, appwrapper_name, namespace) - if appwrapper + wrap_cluster(cluster_yaml, appwrapper_name, cluster.config.namespace) + if cluster.config.appwrapper else cluster_yaml ) - add_queue_label(user_yaml, namespace, local_queue) + add_queue_label(user_yaml, cluster.config.namespace, cluster.config.local_queue) - if write_to_file: + if cluster.config.write_to_file: directory_path = os.path.expanduser("~/.codeflare/resources/") outfile = os.path.join(directory_path, appwrapper_name + ".yaml") write_user_yaml(user_yaml, outfile) return outfile else: user_yaml = yaml.dump(user_yaml) - print(f"Yaml resources loaded for {name}") + print(f"Yaml resources loaded for {cluster.config.name}") return user_yaml diff --git a/src/codeflare_sdk/utils/pretty_print.py b/src/codeflare_sdk/utils/pretty_print.py index 42ef8398b..9431ffd75 100644 --- a/src/codeflare_sdk/utils/pretty_print.py +++ b/src/codeflare_sdk/utils/pretty_print.py @@ -138,7 +138,7 @@ def print_clusters(clusters: List[RayCluster]): workers = str(cluster.workers) memory = f"{cluster.worker_mem_min}~{cluster.worker_mem_max}" cpu = str(cluster.worker_cpu) - gpu = str(cluster.worker_gpu) + gpu = str(cluster.worker_extended_resources.get("nvidia.com/gpu", 0)) #'table0' to display the cluster name, status, url, and dashboard link table0 = Table(box=None, show_header=False) diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py index 999c72e1a..8ca0bdac9 100644 --- a/tests/e2e/local_interactive_sdk_kind_test.py +++ b/tests/e2e/local_interactive_sdk_kind_test.py @@ -43,7 +43,6 @@ def run_local_interactives(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, image=ray_image, write_to_file=True, verify_tls=False, diff --git a/tests/e2e/local_interactive_sdk_oauth_test.py b/tests/e2e/local_interactive_sdk_oauth_test.py index a2d5b6123..4177413ec 100644 --- a/tests/e2e/local_interactive_sdk_oauth_test.py +++ b/tests/e2e/local_interactive_sdk_oauth_test.py @@ -48,7 +48,6 @@ def run_local_interactives(self): worker_cpu_limits=1, worker_memory_requests=4, worker_memory_limits=4, - num_worker_gpus=0, image=ray_image, verify_tls=False, ) diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py index d85397221..af5fcc1f8 100644 --- a/tests/e2e/mnist_raycluster_sdk_kind_test.py +++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py @@ -41,7 +41,6 @@ def run_mnist_raycluster_sdk_kind(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, image=ray_image, write_to_file=True, verify_tls=False, diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py index ed2f4fb1e..4e85e9f1d 100644 --- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py +++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py @@ -48,7 +48,6 @@ def run_mnist_raycluster_sdk_oauth(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, image=ray_image, write_to_file=True, verify_tls=False, diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py index 4fd7fb3fb..b34f0331f 100644 --- a/tests/e2e/start_ray_cluster.py +++ b/tests/e2e/start_ray_cluster.py @@ -19,7 +19,6 @@ worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, image=ray_image, appwrapper=True, ) diff --git a/tests/test-case-bad.yaml b/tests/test-case-bad.yaml index d4d230d48..cb2f4a0aa 100644 --- a/tests/test-case-bad.yaml +++ b/tests/test-case-bad.yaml @@ -33,6 +33,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -63,11 +64,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 rayVersion: 2.23.0 workerGroupSpecs: - groupName: small-group-unit-test-cluster @@ -76,6 +75,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case-no-kueue-no-aw.yaml b/tests/test-case-no-kueue-no-aw.yaml index dfca7951d..3ea7a22d1 100644 --- a/tests/test-case-no-kueue-no-aw.yaml +++ b/tests/test-case-no-kueue-no-aw.yaml @@ -26,6 +26,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -51,11 +52,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -94,6 +93,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case-no-mcad.yamls b/tests/test-case-no-mcad.yamls index 2d0e7e9bd..45a3dfb9f 100644 --- a/tests/test-case-no-mcad.yamls +++ b/tests/test-case-no-mcad.yamls @@ -29,6 +29,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -54,11 +55,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -97,6 +96,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-case.yaml b/tests/test-case.yaml index 00b241afb..461ed7dfc 100644 --- a/tests/test-case.yaml +++ b/tests/test-case.yaml @@ -34,6 +34,7 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: @@ -59,11 +60,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -102,6 +101,7 @@ spec: rayStartParams: block: 'true' num-gpus: '7' + resources: '"{}"' replicas: 2 template: metadata: diff --git a/tests/test-default-appwrapper.yaml b/tests/test-default-appwrapper.yaml index cc44e2340..cc27e37a1 100644 --- a/tests/test-default-appwrapper.yaml +++ b/tests/test-default-appwrapper.yaml @@ -34,9 +34,11 @@ spec: block: 'true' dashboard-host: 0.0.0.0 num-gpus: '0' + resources: '"{}"' serviceType: ClusterIP template: spec: + imagePullSecrets: [] containers: - image: quay.io/rhoai/ray:2.23.0-py39-cu121 imagePullPolicy: Always @@ -59,11 +61,9 @@ spec: limits: cpu: 2 memory: 8G - nvidia.com/gpu: 0 requests: cpu: 2 memory: 8G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -77,7 +77,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: @@ -101,6 +100,7 @@ spec: rayStartParams: block: 'true' num-gpus: '0' + resources: '"{}"' replicas: 1 template: metadata: @@ -109,6 +109,7 @@ spec: labels: key: value spec: + imagePullSecrets: [] containers: - image: quay.io/rhoai/ray:2.23.0-py39-cu121 lifecycle: @@ -123,11 +124,9 @@ spec: limits: cpu: 1 memory: 2G - nvidia.com/gpu: 0 requests: cpu: 1 memory: 2G - nvidia.com/gpu: 0 volumeMounts: - mountPath: /etc/pki/tls/certs/odh-trusted-ca-bundle.crt name: odh-trusted-ca-cert @@ -141,7 +140,6 @@ spec: - mountPath: /etc/ssl/certs/odh-ca-bundle.crt name: odh-ca-cert subPath: odh-ca-bundle.crt - imagePullSecrets: [] volumes: - configMap: items: diff --git a/tests/unit_test.py b/tests/unit_test.py index 3daba3a19..2decade20 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -260,7 +260,7 @@ def test_config_creation(): assert config.num_workers == 2 assert config.worker_cpu_requests == 3 and config.worker_cpu_limits == 4 assert config.worker_memory_requests == "5G" and config.worker_memory_limits == "6G" - assert config.num_worker_gpus == 7 + assert config.worker_extended_resource_requests == {"nvidia.com/gpu": 7} assert config.image == "quay.io/rhoai/ray:2.23.0-py39-cu121" assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml" assert config.machine_types == ["cpu.small", "gpu.large"] @@ -406,7 +406,7 @@ def test_cluster_creation_no_mcad_local_queue(mocker): worker_cpu_limits=4, worker_memory_requests=5, worker_memory_limits=6, - num_worker_gpus=7, + worker_extended_resource_requests={"nvidia.com/gpu": 7}, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], image="quay.io/rhoai/ray:2.23.0-py39-cu121", @@ -883,12 +883,10 @@ def test_ray_details(mocker, capsys): worker_mem_min="2G", worker_mem_max="2G", worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) mocker.patch( "codeflare_sdk.cluster.cluster.Cluster.status", @@ -922,7 +920,7 @@ def test_ray_details(mocker, capsys): assert ray1.worker_mem_min == ray2.worker_mem_min assert ray1.worker_mem_max == ray2.worker_mem_max assert ray1.worker_cpu == ray2.worker_cpu - assert ray1.worker_gpu == ray2.worker_gpu + assert ray1.worker_extended_resources == ray2.worker_extended_resources try: print_clusters([ray1, ray2]) print_cluster_status(ray1) @@ -1129,12 +1127,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, "volumeMounts": [ @@ -1198,7 +1194,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "scaleStrategy": {}, "template": { @@ -1249,12 +1248,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, "volumeMounts": [ @@ -1413,12 +1410,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1432,7 +1427,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "groupName": "small-group-quicktest2", "maxReplicas": 1, "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, "replicas": 1, "template": { "metadata": { @@ -1469,12 +1467,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1591,12 +1587,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1650,12 +1644,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -1786,12 +1778,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 2, "memory": "8G", - "nvidia.com/gpu": 0, }, }, } @@ -1845,12 +1835,10 @@ def get_aw_obj(group, version, namespace, plural): "limits": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, "requests": { "cpu": 1, "memory": "2G", - "nvidia.com/gpu": 0, }, }, } @@ -2002,7 +1990,7 @@ def custom_side_effect(group, version, namespace, plural, **kwargs): cluster_config.worker_memory_requests == "2G" and cluster_config.worker_memory_limits == "2G" ) - assert cluster_config.num_worker_gpus == 0 + assert cluster_config.worker_extended_resource_requests == {} assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2044,7 +2032,7 @@ def test_get_cluster(mocker): cluster_config.worker_memory_requests == "2G" and cluster_config.worker_memory_limits == "2G" ) - assert cluster_config.num_worker_gpus == 0 + assert cluster_config.worker_extended_resource_requests == {} assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2082,7 +2070,7 @@ def test_get_cluster_no_mcad(mocker): cluster_config.worker_memory_requests == "2G" and cluster_config.worker_memory_limits == "2G" ) - assert cluster_config.num_worker_gpus == 0 + assert cluster_config.worker_extended_resource_requests == {} assert ( cluster_config.image == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" @@ -2310,12 +2298,10 @@ def test_cluster_status(mocker): worker_mem_min=2, worker_mem_max=2, worker_cpu=1, - worker_gpu=0, namespace="ns", dashboard="fake-uri", head_cpus=2, head_mem=8, - head_gpu=0, ) cf = Cluster( ClusterConfiguration( @@ -2806,6 +2792,24 @@ def test_rjc_list_jobs(ray_job_client, mocker): assert job_list_jobs == jobs_list +def test_cluster_config_deprecation_conversion(mocker): + config = ClusterConfiguration( + name="test", + num_gpus=2, + head_gpus=1, + min_memory=3, + max_memory=4, + min_cpus=1, + max_cpus=2, + ) + assert config.worker_extended_resource_requests == {"nvidia.com/gpu": 2} + assert config.head_extended_resource_requests == {"nvidia.com/gpu": 1} + assert config.worker_memory_requests == "3G" + assert config.worker_memory_limits == "4G" + assert config.worker_cpu_requests == 1 + assert config.worker_cpu_limits == 2 + + # Make sure to always keep this function last def test_cleanup(): os.remove(f"{aw_dir}unit-test-no-kueue.yaml") diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py index 51c47aa61..36c7d8710 100644 --- a/tests/unit_test_support.py +++ b/tests/unit_test_support.py @@ -13,7 +13,7 @@ def createClusterConfig(): worker_cpu_limits=4, worker_memory_requests=5, worker_memory_limits=6, - num_worker_gpus=7, + worker_extended_resource_requests={"nvidia.com/gpu": 7}, appwrapper=True, machine_types=["cpu.small", "gpu.large"], image_pull_secrets=["unit-test-pull-secret"], diff --git a/tests/upgrade/raycluster_sdk_upgrade_test.py b/tests/upgrade/raycluster_sdk_upgrade_test.py index 982048111..ff98e28f6 100644 --- a/tests/upgrade/raycluster_sdk_upgrade_test.py +++ b/tests/upgrade/raycluster_sdk_upgrade_test.py @@ -54,7 +54,6 @@ def run_mnist_raycluster_sdk_oauth(self): worker_cpu_limits=1, worker_memory_requests=1, worker_memory_limits=2, - num_worker_gpus=0, image=ray_image, write_to_file=True, verify_tls=False,