Skip to content

Commit

Permalink
Update SDK args
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Fan authored and openshift-merge-bot[bot] committed Jul 2, 2024
1 parent ff3ade7 commit a36ebdb
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 120 deletions.
50 changes: 25 additions & 25 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,12 +137,12 @@ def create_app_wrapper(self):
namespace = self.config.namespace
head_cpus = self.config.head_cpus
head_memory = self.config.head_memory
head_gpus = self.config.head_gpus
min_cpu = self.config.min_cpus
max_cpu = self.config.max_cpus
min_memory = self.config.min_memory
max_memory = self.config.max_memory
gpu = self.config.num_gpus
num_head_gpus = self.config.num_head_gpus
worker_cpu_requests = self.config.worker_cpu_requests
worker_cpu_limits = self.config.worker_cpu_limits
worker_memory_requests = self.config.worker_memory_requests
worker_memory_limits = self.config.worker_memory_limits
num_worker_gpus = self.config.num_worker_gpus
workers = self.config.num_workers
template = self.config.template
image = self.config.image
Expand All @@ -157,12 +157,12 @@ def create_app_wrapper(self):
namespace=namespace,
head_cpus=head_cpus,
head_memory=head_memory,
head_gpus=head_gpus,
min_cpu=min_cpu,
max_cpu=max_cpu,
min_memory=min_memory,
max_memory=max_memory,
gpu=gpu,
num_head_gpus=num_head_gpus,
worker_cpu_requests=worker_cpu_requests,
worker_cpu_limits=worker_cpu_limits,
worker_memory_requests=worker_memory_requests,
worker_memory_limits=worker_memory_limits,
num_worker_gpus=num_worker_gpus,
workers=workers,
template=template,
image=image,
Expand Down Expand Up @@ -318,7 +318,7 @@ def status(

if print_to_console:
# overriding the number of gpus with requested
cluster.worker_gpu = self.config.num_gpus
cluster.worker_gpu = self.config.num_worker_gpus
pretty_print.print_cluster_status(cluster)
elif print_to_console:
if status == CodeFlareClusterStatus.UNKNOWN:
Expand Down Expand Up @@ -474,19 +474,19 @@ def from_k8_cluster_object(
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
num_workers=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_cpu_requests=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_cpu_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
min_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["memory"],
max_memory=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
worker_memory_requests=rc["spec"]["workerGroupSpecs"][0]["template"][
"spec"
]["containers"][0]["resources"]["requests"]["memory"],
worker_memory_limits=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["memory"],
num_gpus=int(
num_worker_gpus=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"]
Expand Down Expand Up @@ -917,15 +917,15 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
name=cluster.config.name,
status=cluster.status(print_to_console=False)[0],
workers=cluster.config.num_workers,
worker_mem_min=cluster.config.min_memory,
worker_mem_max=cluster.config.max_memory,
worker_cpu=cluster.config.min_cpus,
worker_gpu=cluster.config.num_gpus,
worker_mem_min=cluster.config.worker_memory_requests,
worker_mem_max=cluster.config.worker_memory_limits,
worker_cpu=cluster.config.worker_cpu_requests,
worker_gpu=cluster.config.num_worker_gpus,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
head_gpu=cluster.config.num_head_gpus,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
Expand Down
69 changes: 55 additions & 14 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from dataclasses import dataclass, field
import pathlib
import typing
import warnings

dir = pathlib.Path(__file__).parent.parent.resolve()

Expand All @@ -37,14 +38,20 @@ class ClusterConfiguration:
head_info: list = field(default_factory=list)
head_cpus: typing.Union[int, str] = 2
head_memory: typing.Union[int, str] = 8
head_gpus: int = 0
head_gpus: int = None # Deprecating
num_head_gpus: int = 0
machine_types: list = field(default_factory=list) # ["m4.xlarge", "g4dn.xlarge"]
min_cpus: typing.Union[int, str] = 1
max_cpus: typing.Union[int, str] = 1
worker_cpu_requests: typing.Union[int, str] = 1
worker_cpu_limits: typing.Union[int, str] = 1
min_cpus: typing.Union[int, str] = None # Deprecating
max_cpus: typing.Union[int, str] = None # Deprecating
num_workers: int = 1
min_memory: typing.Union[int, str] = 2
max_memory: typing.Union[int, str] = 2
num_gpus: int = 0
worker_memory_requests: typing.Union[int, str] = 2
worker_memory_limits: typing.Union[int, str] = 2
min_memory: typing.Union[int, str] = None # Deprecating
max_memory: typing.Union[int, str] = None # Deprecating
num_worker_gpus: int = 0
num_gpus: int = None # Deprecating
template: str = f"{dir}/templates/base-template.yaml"
appwrapper: bool = False
envs: dict = field(default_factory=dict)
Expand All @@ -59,23 +66,57 @@ def __post_init__(self):
print(
"Warning: TLS verification has been disabled - Endpoint checks will be bypassed"
)

self._memory_to_string()
self._str_mem_no_unit_add_GB()
self._memory_to_resource()
self._gpu_to_resource()
self._cpu_to_resource()

def _str_mem_no_unit_add_GB(self):
    """Append a ``G`` (gigabyte) unit to memory fields given as bare decimal strings.

    A value like ``"8"`` becomes ``"8G"``; strings that already carry a unit
    (e.g. ``"500Mi"``) fail ``str.isdecimal()`` and are left untouched, as are
    non-string values.
    """
    if isinstance(self.head_memory, str) and self.head_memory.isdecimal():
        self.head_memory = f"{self.head_memory}G"
    # NOTE(review): min_memory/max_memory are marked "Deprecating" on the
    # config fields — presumably these two conversions remain only for
    # backward compatibility with the legacy names; confirm still needed.
    if isinstance(self.min_memory, str) and self.min_memory.isdecimal():
        self.min_memory = f"{self.min_memory}G"
    if isinstance(self.max_memory, str) and self.max_memory.isdecimal():
        self.max_memory = f"{self.max_memory}G"
    if (
        isinstance(self.worker_memory_requests, str)
        and self.worker_memory_requests.isdecimal()
    ):
        self.worker_memory_requests = f"{self.worker_memory_requests}G"
    if (
        isinstance(self.worker_memory_limits, str)
        and self.worker_memory_limits.isdecimal()
    ):
        self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _memory_to_string(self):
    """Convert integer memory fields to ``"<n>G"`` strings (ints are read as gigabytes)."""
    if isinstance(self.head_memory, int):
        self.head_memory = f"{self.head_memory}G"
    # NOTE(review): min_memory/max_memory are marked "Deprecating" on the
    # config fields — presumably kept here only so legacy callers passing
    # ints still get unit strings; confirm still needed.
    if isinstance(self.min_memory, int):
        self.min_memory = f"{self.min_memory}G"
    if isinstance(self.max_memory, int):
        self.max_memory = f"{self.max_memory}G"
    if isinstance(self.worker_memory_requests, int):
        self.worker_memory_requests = f"{self.worker_memory_requests}G"
    if isinstance(self.worker_memory_limits, int):
        self.worker_memory_limits = f"{self.worker_memory_limits}G"

def _gpu_to_resource(self):
    """Copy deprecated GPU fields onto their replacements.

    ``head_gpus`` migrates to ``num_head_gpus`` and ``num_gpus`` to
    ``num_worker_gpus``; a deprecation warning is emitted for each legacy
    field that was set to a truthy value.
    """
    legacy_head = self.head_gpus
    if legacy_head:
        warnings.warn("head_gpus is being deprecated, use num_head_gpus")
        self.num_head_gpus = legacy_head
    legacy_worker = self.num_gpus
    if legacy_worker:
        warnings.warn("num_gpus is being deprecated, use num_worker_gpus")
        self.num_worker_gpus = legacy_worker

def _cpu_to_resource(self):
    """Copy deprecated CPU fields onto their replacements.

    ``min_cpus`` migrates to ``worker_cpu_requests`` and ``max_cpus`` to
    ``worker_cpu_limits``; a deprecation warning is emitted for each legacy
    field that was set to a truthy value.
    """
    legacy_requests = self.min_cpus
    if legacy_requests:
        warnings.warn("min_cpus is being deprecated, use worker_cpu_requests")
        self.worker_cpu_requests = legacy_requests
    legacy_limits = self.max_cpus
    if legacy_limits:
        warnings.warn("max_cpus is being deprecated, use worker_cpu_limits")
        self.worker_cpu_limits = legacy_limits

def _memory_to_resource(self):
    """Copy deprecated memory fields onto their replacements.

    ``min_memory`` migrates to ``worker_memory_requests`` and ``max_memory``
    to ``worker_memory_limits``, each with a deprecation warning when set.
    Integers and bare-decimal strings are interpreted as gigabytes and get a
    ``G`` suffix (unchanged behavior); strings that already carry a unit
    (e.g. ``"4G"``, ``"500Mi"``) are passed through unchanged — the previous
    unconditional ``f"{...}G"`` produced malformed values like ``"4GG"``.
    """
    if self.min_memory:
        warnings.warn("min_memory is being deprecated, use worker_memory_requests")
        if isinstance(self.min_memory, str) and not self.min_memory.isdecimal():
            # Already unit-qualified: do not append a second suffix.
            self.worker_memory_requests = self.min_memory
        else:
            self.worker_memory_requests = f"{self.min_memory}G"
    if self.max_memory:
        warnings.warn("max_memory is being deprecated, use worker_memory_limits")
        if isinstance(self.max_memory, str) and not self.max_memory.isdecimal():
            # Already unit-qualified: do not append a second suffix.
            self.worker_memory_limits = self.max_memory
        else:
            self.worker_memory_limits = f"{self.max_memory}G"

local_queue: str = None
72 changes: 43 additions & 29 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,47 +106,54 @@ def update_env(spec, env):
container["env"] = env


def update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu):
def update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
):
container = spec.get("containers")
for resource in container:
requests = resource.get("resources").get("requests")
if requests is not None:
requests["cpu"] = min_cpu
requests["memory"] = min_memory
requests["nvidia.com/gpu"] = gpu
requests["cpu"] = worker_cpu_requests
requests["memory"] = worker_memory_requests
requests["nvidia.com/gpu"] = num_worker_gpus
limits = resource.get("resources").get("limits")
if limits is not None:
limits["cpu"] = max_cpu
limits["memory"] = max_memory
limits["nvidia.com/gpu"] = gpu
limits["cpu"] = worker_cpu_limits
limits["memory"] = worker_memory_limits
limits["nvidia.com/gpu"] = num_worker_gpus


def update_nodes(
cluster_yaml,
appwrapper_name,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
workers,
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_gpus,
num_head_gpus,
):
head = cluster_yaml.get("spec").get("headGroupSpec")
head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
head["rayStartParams"]["num-gpus"] = str(int(num_head_gpus))

worker = cluster_yaml.get("spec").get("workerGroupSpecs")[0]
# Head counts as first worker
worker["replicas"] = workers
worker["minReplicas"] = workers
worker["maxReplicas"] = workers
worker["groupName"] = "small-group-" + appwrapper_name
worker["rayStartParams"]["num-gpus"] = str(int(gpu))
worker["rayStartParams"]["num-gpus"] = str(int(num_worker_gpus))

for comp in [head, worker]:
spec = comp.get("template").get("spec")
Expand All @@ -156,10 +163,17 @@ def update_nodes(
if comp == head:
# TODO: Eventually add head node configuration outside of template
update_resources(
spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
spec, head_cpus, head_cpus, head_memory, head_memory, num_head_gpus
)
else:
update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
update_resources(
spec,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
)


def del_from_list_by_name(l: list, target: typing.List[str]) -> list:
Expand Down Expand Up @@ -265,12 +279,12 @@ def generate_appwrapper(
namespace: str,
head_cpus: int,
head_memory: int,
head_gpus: int,
min_cpu: int,
max_cpu: int,
min_memory: int,
max_memory: int,
gpu: int,
num_head_gpus: int,
worker_cpu_requests: int,
worker_cpu_limits: int,
worker_memory_requests: int,
worker_memory_limits: int,
num_worker_gpus: int,
workers: int,
template: str,
image: str,
Expand All @@ -287,18 +301,18 @@ def generate_appwrapper(
update_nodes(
cluster_yaml,
appwrapper_name,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
worker_cpu_requests,
worker_cpu_limits,
worker_memory_requests,
worker_memory_limits,
num_worker_gpus,
workers,
image,
env,
image_pull_secrets,
head_cpus,
head_memory,
head_gpus,
num_head_gpus,
)
augment_labels(cluster_yaml, labels)
notebook_annotations(cluster_yaml)
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/local_interactive_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,11 @@ def run_local_interactives(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/local_interactive_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def run_local_interactives(self):
namespace=self.namespace,
name=cluster_name,
num_workers=1,
min_cpus=1,
max_cpus=1,
min_memory=4,
max_memory=4,
num_gpus=0,
worker_cpu_requests=1,
worker_cpu_limits=1,
worker_memory_requests=4,
worker_memory_limits=4,
num_worker_gpus=0,
image=ray_image,
verify_tls=False,
)
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_kind_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,11 @@ def run_mnist_raycluster_sdk_kind(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
10 changes: 5 additions & 5 deletions tests/e2e/mnist_raycluster_sdk_oauth_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,11 @@ def run_mnist_raycluster_sdk_oauth(self):
num_workers=1,
head_cpus="500m",
head_memory=2,
min_cpus="500m",
max_cpus=1,
min_memory=1,
max_memory=2,
num_gpus=0,
worker_cpu_requests="500m",
worker_cpu_limits=1,
worker_memory_requests=1,
worker_memory_limits=2,
num_worker_gpus=0,
image=ray_image,
write_to_file=True,
verify_tls=False,
Expand Down
Loading

0 comments on commit a36ebdb

Please sign in to comment.