From a36ebdb63c7d017c234608eb8ad4866872f7fcf1 Mon Sep 17 00:00:00 2001
From: Alex Fan
Date: Mon, 27 May 2024 12:34:32 +0100
Subject: [PATCH] Update SDK args

---
 src/codeflare_sdk/cluster/cluster.py          | 50 ++++++------
 src/codeflare_sdk/cluster/config.py           | 69 ++++++++++++----
 src/codeflare_sdk/utils/generate_yaml.py      | 72 +++++++++--------
 tests/e2e/local_interactive_sdk_kind_test.py  | 10 +--
 tests/e2e/local_interactive_sdk_oauth_test.py | 10 +--
 tests/e2e/mnist_raycluster_sdk_kind_test.py   | 10 +--
 tests/e2e/mnist_raycluster_sdk_oauth_test.py  | 10 +--
 tests/e2e/start_ray_cluster.py                | 10 +--
 tests/unit_test.py                            | 52 +++++++-----
 tests/unit_test_support.py                    | 10 +--
 tests/upgrade/raycluster_sdk_upgrade_test.py  | 10 +--
 11 files changed, 193 insertions(+), 120 deletions(-)

diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index e5bbcd86a..b823cfd54 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -137,12 +137,12 @@ def create_app_wrapper(self):
         namespace = self.config.namespace
         head_cpus = self.config.head_cpus
         head_memory = self.config.head_memory
-        head_gpus = self.config.head_gpus
-        min_cpu = self.config.min_cpus
-        max_cpu = self.config.max_cpus
-        min_memory = self.config.min_memory
-        max_memory = self.config.max_memory
-        gpu = self.config.num_gpus
+        num_head_gpus = self.config.num_head_gpus
+        worker_cpu_requests = self.config.worker_cpu_requests
+        worker_cpu_limits = self.config.worker_cpu_limits
+        worker_memory_requests = self.config.worker_memory_requests
+        worker_memory_limits = self.config.worker_memory_limits
+        num_worker_gpus = self.config.num_worker_gpus
         workers = self.config.num_workers
         template = self.config.template
         image = self.config.image
@@ -157,12 +157,12 @@ def create_app_wrapper(self):
             namespace=namespace,
             head_cpus=head_cpus,
             head_memory=head_memory,
-            head_gpus=head_gpus,
-            min_cpu=min_cpu,
-            max_cpu=max_cpu,
-            min_memory=min_memory,
-            max_memory=max_memory,
-            gpu=gpu,
+            num_head_gpus=num_head_gpus,
+            worker_cpu_requests=worker_cpu_requests,
+            worker_cpu_limits=worker_cpu_limits,
+            worker_memory_requests=worker_memory_requests,
+            worker_memory_limits=worker_memory_limits,
+            num_worker_gpus=num_worker_gpus,
             workers=workers,
             template=template,
             image=image,
@@ -318,7 +318,7 @@ def status(
             if print_to_console:
                 # overriding the number of gpus with requested
-                cluster.worker_gpu = self.config.num_gpus
+                cluster.worker_gpu = self.config.num_worker_gpus
                 pretty_print.print_cluster_status(cluster)
         elif print_to_console:
             if status == CodeFlareClusterStatus.UNKNOWN:
@@ -474,19 +474,19 @@ def from_k8_cluster_object(
             namespace=rc["metadata"]["namespace"],
             machine_types=machine_types,
             num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
-            min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                 "containers"
             ][0]["resources"]["requests"]["cpu"],
-            max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                 "containers"
             ][0]["resources"]["limits"]["cpu"],
-            min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
-                "containers"
-            ][0]["resources"]["requests"]["memory"],
-            max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
+            worker_memory_requests=rc["spec"]["workerGroupSpecs"][0]["template"][
+                "spec"
+            ]["containers"][0]["resources"]["requests"]["memory"],
+            worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
                 "containers"
             ][0]["resources"]["limits"]["memory"],
-            num_gpus=int(
+            num_worker_gpus=int(
                 rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
                     "resources"
                 ]["limits"]["nvidia.com/gpu"]
@@ -917,15 +917,15 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
         name=cluster.config.name,
         status=cluster.status(print_to_console=False)[0],
         workers=cluster.config.num_workers,
-        worker_mem_min=cluster.config.min_memory,
-        worker_mem_max=cluster.config.max_memory,
-        worker_cpu=cluster.config.min_cpus,
-        worker_gpu=cluster.config.num_gpus,
+        worker_mem_min=cluster.config.worker_memory_requests,
+        worker_mem_max=cluster.config.worker_memory_limits,
+        worker_cpu=cluster.config.worker_cpu_requests,
+        worker_gpu=cluster.config.num_worker_gpus,
         namespace=cluster.config.namespace,
         dashboard=cluster.cluster_dashboard_uri(),
         head_cpus=cluster.config.head_cpus,
         head_mem=cluster.config.head_memory,
-        head_gpu=cluster.config.head_gpus,
+        head_gpu=cluster.config.num_head_gpus,
     )
     if ray.status == CodeFlareClusterStatus.READY:
         ray.status = RayClusterStatus.READY
diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py
index 9e069c376..cb8e3d3d0 100644
--- a/src/codeflare_sdk/cluster/config.py
+++ b/src/codeflare_sdk/cluster/config.py
@@ -21,6 +21,7 @@
 from dataclasses import dataclass, field
 import pathlib
 import typing
+import warnings

 dir = pathlib.Path(__file__).parent.parent.resolve()

@@ -37,14 +38,20 @@ class ClusterConfiguration:
     head_info: list = field(default_factory=list)
     head_cpus: typing.Union[int, str] = 2
     head_memory: typing.Union[int, str] = 8
-    head_gpus: int = 0
+    head_gpus: int = None  # Deprecating
+    num_head_gpus: int = 0
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
-    min_cpus: typing.Union[int, str] = 1
-    max_cpus: typing.Union[int, str] = 1
+    worker_cpu_requests: typing.Union[int, str] = 1
+    worker_cpu_limits: typing.Union[int, str] = 1
+    min_cpus: typing.Union[int, str] = None  # Deprecating
+    max_cpus: typing.Union[int, str] = None  # Deprecating
     num_workers: int = 1
-    min_memory: typing.Union[int, str] = 2
-    max_memory: typing.Union[int, str] = 2
-    num_gpus: int = 0
+    worker_memory_requests: typing.Union[int, str] = 2
+    worker_memory_limits: typing.Union[int, str] = 2
+    min_memory: typing.Union[int, str] = None  # Deprecating
+    max_memory: typing.Union[int, str] = None  # Deprecating
+    num_worker_gpus: int = 0
+    num_gpus: int = None  # Deprecating
     template: str = f"{dir}/templates/base-template.yaml"
     appwrapper: bool = False
     envs: dict = field(default_factory=dict)
@@ -59,23 +66,57 @@ def __post_init__(self):
             print(
                 "Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
             )
+        self._memory_to_string()
         self._str_mem_no_unit_add_GB()
+        self._memory_to_resource()
+        self._gpu_to_resource()
+        self._cpu_to_resource()

     def _str_mem_no_unit_add_GB(self):
         if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
             self.head_memory = f"{self.head_memory}G"
-        if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
-            self.min_memory = f"{self.min_memory}G"
-        if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
-            self.max_memory = f"{self.max_memory}G"
+        if (
+            isinstance(self.worker_memory_requests, str)
+            and self.worker_memory_requests.isdecimal()
+        ):
+            self.worker_memory_requests = f"{self.worker_memory_requests}G"
+        if (
+            isinstance(self.worker_memory_limits, str)
+            and self.worker_memory_limits.isdecimal()
+        ):
+            self.worker_memory_limits = f"{self.worker_memory_limits}G"

     def _memory_to_string(self):
         if isinstance(self.head_memory, int):
             self.head_memory = f"{self.head_memory}G"
-        if isinstance(self.min_memory, int):
-            self.min_memory = f"{self.min_memory}G"
-        if isinstance(self.max_memory, int):
-            self.max_memory = f"{self.max_memory}G"
+        if isinstance(self.worker_memory_requests, int):
+            self.worker_memory_requests = f"{self.worker_memory_requests}G"
+        if isinstance(self.worker_memory_limits, int):
+            self.worker_memory_limits = f"{self.worker_memory_limits}G"
+
+    def _gpu_to_resource(self):
+        if self.head_gpus:
+            warnings.warn("head_gpus is being deprecated, use num_head_gpus")
+            self.num_head_gpus = self.head_gpus
+        if self.num_gpus:
+            warnings.warn("num_gpus is being deprecated, use num_worker_gpus")
+            self.num_worker_gpus = self.num_gpus
+
+    def _cpu_to_resource(self):
+        if self.min_cpus:
+            warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
+            self.worker_cpu_requests = self.min_cpus
+        if self.max_cpus:
+            warnings.warn("max_cpus is being deprecated, use worker_cpu_limits")
+            self.worker_cpu_limits = self.max_cpus
+
+    def _memory_to_resource(self):
+        if self.min_memory:
+            warnings.warn("min_memory is being deprecated, use worker_memory_requests")
+            self.worker_memory_requests = f"{self.min_memory}G"
+        if self.max_memory:
+            warnings.warn("max_memory is being deprecated, use worker_memory_limits")
+            self.worker_memory_limits = f"{self.max_memory}G"

     local_queue: str = None
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index 30edcd913..183705642 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -106,39 +106,46 @@ def update_env(spec, env):
             container["env"] = env


-def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
+def update_resources(
+    spec,
+    worker_cpu_requests,
+    worker_cpu_limits,
+    worker_memory_requests,
+    worker_memory_limits,
+    num_worker_gpus,
+):
     container = spec.get("containers")
     for resource in container:
         requests = resource.get("resources").get("requests")
         if requests is not None:
-            requests["cpu"] = min_cpu
-            requests["memory"] = min_memory
-            requests["nvidia.com/gpu"] = gpu
+            requests["cpu"] = worker_cpu_requests
+            requests["memory"] = worker_memory_requests
+            requests["nvidia.com/gpu"] = num_worker_gpus
         limits = resource.get("resources").get("limits")
         if limits is not None:
-            limits["cpu"] = max_cpu
-            limits["memory"] = max_memory
-            limits["nvidia.com/gpu"] = gpu
+            limits["cpu"] = worker_cpu_limits
+            limits["memory"] = worker_memory_limits
+            limits["nvidia.com/gpu"] = num_worker_gpus


 def update_nodes(
     cluster_yaml,
     appwrapper_name,
-    min_cpu,
-    max_cpu,
-    min_memory,
-    max_memory,
-    gpu,
+    worker_cpu_requests,
+    worker_cpu_limits,
+    worker_memory_requests,
+    worker_memory_limits,
+    num_worker_gpus,
     workers,
     image,
     env,
     image_pull_secrets,
     head_cpus,
     head_memory,
-    head_gpus,
+    num_head_gpus,
 ):
     head = cluster_yaml.get("spec").get("headGroupSpec")
-    head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
+    head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus))

     worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
     # Head counts as first worker
@@ -146,7 +153,7 @@ def update_nodes(
     worker["minReplicas"] = workers
     worker["maxReplicas"] = workers
     worker["groupName"] = "small-group-" + appwrapper_name
-    worker["rayStartParams"]["num-gpus"] = str(int(gpu))
+    worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus))

     for comp in [head, worker]:
         spec = comp.get("template").get("spec")
@@ -156,10 +163,17 @@ def update_nodes(
         if comp == head:
             # TODO: Eventually add head node configuration outside of template
             update_resources(
-                spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
+                spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
             )
         else:
-            update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
+            update_resources(
+                spec,
+                worker_cpu_requests,
+                worker_cpu_limits,
+                worker_memory_requests,
+                worker_memory_limits,
+                num_worker_gpus,
+            )


 def del_from_list_by_name(l: list, target: typing.List[str]) -> list:
@@ -265,12 +279,12 @@ def generate_appwrapper(
     namespace: str,
     head_cpus: int,
     head_memory: int,
-    head_gpus: int,
-    min_cpu: int,
-    max_cpu: int,
-    min_memory: int,
-    max_memory: int,
-    gpu: int,
+    num_head_gpus: int,
+    worker_cpu_requests: int,
+    worker_cpu_limits: int,
+    worker_memory_requests: int,
+    worker_memory_limits: int,
+    num_worker_gpus: int,
     workers: int,
     template: str,
     image: str,
@@ -287,18 +301,18 @@ def generate_appwrapper(
     update_nodes(
         cluster_yaml,
         appwrapper_name,
-        min_cpu,
-        max_cpu,
-        min_memory,
-        max_memory,
-        gpu,
+        worker_cpu_requests,
+        worker_cpu_limits,
+        worker_memory_requests,
+        worker_memory_limits,
+        num_worker_gpus,
         workers,
         image,
         env,
         image_pull_secrets,
         head_cpus,
         head_memory,
-        head_gpus,
+        num_head_gpus,
     )
     augment_labels(cluster_yaml, labels)
     notebook_annotations(cluster_yaml)
diff --git a/tests/e2e/local_interactive_sdk_kind_test.py b/tests/e2e/local_interactive_sdk_kind_test.py
index 647ac4727..999c72e1a 100644
--- a/tests/e2e/local_interactive_sdk_kind_test.py
+++ b/tests/e2e/local_interactive_sdk_kind_test.py
@@ -39,11 +39,11 @@ def run_local_interactives(self):
             num_workers=1,
             head_cpus="500m",
             head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=2,
+            num_worker_gpus=0,
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
diff --git a/tests/e2e/local_interactive_sdk_oauth_test.py b/tests/e2e/local_interactive_sdk_oauth_test.py
index c3fd1d851..a2d5b6123 100644
--- a/tests/e2e/local_interactive_sdk_oauth_test.py
+++ b/tests/e2e/local_interactive_sdk_oauth_test.py
@@ -44,11 +44,11 @@ def run_local_interactives(self):
             namespace=self.namespace,
             name=cluster_name,
             num_workers=1,
-            min_cpus=1,
-            max_cpus=1,
-            min_memory=4,
-            max_memory=4,
-            num_gpus=0,
+            worker_cpu_requests=1,
+            worker_cpu_limits=1,
+            worker_memory_requests=4,
+            worker_memory_limits=4,
+            num_worker_gpus=0,
             image=ray_image,
             verify_tls=False,
         )
diff --git a/tests/e2e/mnist_raycluster_sdk_kind_test.py b/tests/e2e/mnist_raycluster_sdk_kind_test.py
index 630bc5df4..d85397221 100644
--- a/tests/e2e/mnist_raycluster_sdk_kind_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_kind_test.py
@@ -37,11 +37,11 @@ def run_mnist_raycluster_sdk_kind(self):
             num_workers=1,
             head_cpus="500m",
             head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=2,
+            num_worker_gpus=0,
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
diff --git a/tests/e2e/mnist_raycluster_sdk_oauth_test.py b/tests/e2e/mnist_raycluster_sdk_oauth_test.py
index e489c39f8..ed2f4fb1e 100644
--- a/tests/e2e/mnist_raycluster_sdk_oauth_test.py
+++ b/tests/e2e/mnist_raycluster_sdk_oauth_test.py
@@ -44,11 +44,11 @@ def run_mnist_raycluster_sdk_oauth(self):
             num_workers=1,
             head_cpus="500m",
             head_memory=2,
-            min_cpus="500m",
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests="500m",
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=2,
+            num_worker_gpus=0,
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
diff --git a/tests/e2e/start_ray_cluster.py b/tests/e2e/start_ray_cluster.py
index 957d0c25e..4fd7fb3fb 100644
--- a/tests/e2e/start_ray_cluster.py
+++ b/tests/e2e/start_ray_cluster.py
@@ -15,11 +15,11 @@
         num_workers=1,
         head_cpus="500m",
         head_memory=2,
-        min_cpus="500m",
-        max_cpus=1,
-        min_memory=1,
-        max_memory=2,
-        num_gpus=0,
+        worker_cpu_requests="500m",
+        worker_cpu_limits=1,
+        worker_memory_requests=1,
+        worker_memory_limits=2,
+        num_worker_gpus=0,
         image=ray_image,
         appwrapper=True,
     )
diff --git a/tests/unit_test.py b/tests/unit_test.py
index e8fa61c34..8cc679a50 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -258,9 +258,9 @@ def test_config_creation():
     assert config.name == "unit-test-cluster" and config.namespace == "ns"
     assert config.num_workers == 2
-    assert config.min_cpus == 3 and config.max_cpus == 4
-    assert config.min_memory == "5G" and config.max_memory == "6G"
-    assert config.num_gpus == 7
+    assert config.worker_cpu_requests == 3 and config.worker_cpu_limits == 4
+    assert config.worker_memory_requests == "5G" and config.worker_memory_limits == "6G"
+    assert config.num_worker_gpus == 7
     assert config.image == "quay.io/rhoai/ray:2.23.0-py39-cu121"
     assert config.template == f"{parent}/src/codeflare_sdk/templates/base-template.yaml"
     assert config.machine_types == ["cpu.small", "gpu.large"]
@@ -393,11 +393,11 @@ def test_cluster_creation_no_mcad_local_queue(mocker):
         name="unit-test-cluster-ray",
         namespace="ns",
         num_workers=2,
-        min_cpus=3,
-        max_cpus=4,
-        min_memory=5,
-        max_memory=6,
-        num_gpus=7,
+        worker_cpu_requests=3,
+        worker_cpu_limits=4,
+        worker_memory_requests=5,
+        worker_memory_limits=6,
+        num_worker_gpus=7,
         machine_types=["cpu.small", "gpu.large"],
         image_pull_secrets=["unit-test-pull-secret"],
         image="quay.io/rhoai/ray:2.23.0-py39-cu121",
@@ -1985,9 +1985,15 @@ def custom_side_effect(group, version, namespace, plural, **kwargs):
         "m4.xlarge" in cluster_config.machine_types
         and "g4dn.xlarge" in cluster_config.machine_types
     )
-    assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
-    assert cluster_config.num_gpus == 0
+    assert (
+        cluster_config.worker_cpu_requests == 1
+        and cluster_config.worker_cpu_limits == 1
+    )
+    assert (
+        cluster_config.worker_memory_requests == "2G"
+        and cluster_config.worker_memory_limits == "2G"
+    )
+    assert cluster_config.num_worker_gpus == 0
     assert (
         cluster_config.image
         == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
@@ -2021,9 +2027,15 @@ def test_get_cluster(mocker):
         "m4.xlarge" in cluster_config.machine_types
         and "g4dn.xlarge" in cluster_config.machine_types
     )
-    assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
-    assert cluster_config.num_gpus == 0
+    assert (
+        cluster_config.worker_cpu_requests == 1
+        and cluster_config.worker_cpu_limits == 1
+    )
+    assert (
+        cluster_config.worker_memory_requests == "2G"
+        and cluster_config.worker_memory_limits == "2G"
+    )
+    assert cluster_config.num_worker_gpus == 0
     assert (
         cluster_config.image
         == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
@@ -2053,9 +2065,15 @@ def test_get_cluster_no_mcad(mocker):
         "m4.xlarge" in cluster_config.machine_types
         and "g4dn.xlarge" in cluster_config.machine_types
     )
-    assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
-    assert cluster_config.min_memory == "2G" and cluster_config.max_memory == "2G"
-    assert cluster_config.num_gpus == 0
+    assert (
+        cluster_config.worker_cpu_requests == 1
+        and cluster_config.worker_cpu_limits == 1
+    )
+    assert (
+        cluster_config.worker_memory_requests == "2G"
+        and cluster_config.worker_memory_limits == "2G"
+    )
+    assert cluster_config.num_worker_gpus == 0
     assert (
         cluster_config.image
         == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
diff --git a/tests/unit_test_support.py b/tests/unit_test_support.py
index 9e7a60b6b..51c47aa61 100644
--- a/tests/unit_test_support.py
+++ b/tests/unit_test_support.py
@@ -9,11 +9,11 @@ def createClusterConfig():
         name="unit-test-cluster",
         namespace="ns",
         num_workers=2,
-        min_cpus=3,
-        max_cpus=4,
-        min_memory=5,
-        max_memory=6,
-        num_gpus=7,
+        worker_cpu_requests=3,
+        worker_cpu_limits=4,
+        worker_memory_requests=5,
+        worker_memory_limits=6,
+        num_worker_gpus=7,
         appwrapper=True,
         machine_types=["cpu.small", "gpu.large"],
         image_pull_secrets=["unit-test-pull-secret"],
diff --git a/tests/upgrade/raycluster_sdk_upgrade_test.py b/tests/upgrade/raycluster_sdk_upgrade_test.py
index 235383f65..982048111 100644
--- a/tests/upgrade/raycluster_sdk_upgrade_test.py
+++ b/tests/upgrade/raycluster_sdk_upgrade_test.py
@@ -50,11 +50,11 @@ def run_mnist_raycluster_sdk_oauth(self):
             num_workers=1,
             head_cpus=1,
             head_memory=2,
-            min_cpus=1,
-            max_cpus=1,
-            min_memory=1,
-            max_memory=2,
-            num_gpus=0,
+            worker_cpu_requests=1,
+            worker_cpu_limits=1,
+            worker_memory_requests=1,
+            worker_memory_limits=2,
+            num_worker_gpus=0,
             image=ray_image,
             write_to_file=True,
             verify_tls=False,
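
Usage sketch (reviewer note, not part of the commit): a minimal example of how the
renamed arguments map onto the deprecated ones, as implemented by this patch's
ClusterConfiguration. The cluster names, namespace, and resource values below are
illustrative only.

    from codeflare_sdk.cluster.config import ClusterConfiguration

    # New-style arguments introduced by this patch.
    config = ClusterConfiguration(
        name="example-cluster",     # hypothetical name
        namespace="default",
        num_workers=2,
        worker_cpu_requests=1,      # was: min_cpus
        worker_cpu_limits=2,        # was: max_cpus
        worker_memory_requests=4,   # was: min_memory; _memory_to_string() turns ints into "4G"
        worker_memory_limits=8,     # was: max_memory; normalised to "8G"
        num_worker_gpus=0,          # was: num_gpus
        num_head_gpus=0,            # was: head_gpus
    )
    assert config.worker_memory_requests == "4G"

    # The deprecated names still work: __post_init__ copies them onto the
    # new fields and emits a UserWarning via warnings.warn().
    legacy = ClusterConfiguration(
        name="legacy-cluster",      # hypothetical name
        namespace="default",
        min_cpus=3,                 # warns: use worker_cpu_requests
        max_cpus=4,                 # warns: use worker_cpu_limits
    )
    assert legacy.worker_cpu_requests == 3 and legacy.worker_cpu_limits == 4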