Skip to content

Commit

Permalink
Minor fixes and unit test additions
Browse files Browse the repository at this point in the history
  • Loading branch information
MichaelClifford committed Sep 24, 2023
1 parent 7d6e294 commit 7f5e11b
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 7 deletions.
12 changes: 12 additions & 0 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
worker_gpu=0, # currently hard to detect how many GPUs there are; can be overridden with what the user asked for
namespace=rc["metadata"]["namespace"],
dashboard=ray_route,
head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["cpu"],
head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"],
head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
)


Expand Down Expand Up @@ -644,6 +653,9 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
worker_gpu=cluster.config.num_gpus,
namespace=cluster.config.namespace,
dashboard=cluster.cluster_dashboard_uri(),
head_cpus=cluster.config.head_cpus,
head_mem=cluster.config.head_memory,
head_gpu=cluster.config.head_gpus,
)
if ray.status == CodeFlareClusterStatus.READY:
ray.status = RayClusterStatus.READY
Expand Down
36 changes: 29 additions & 7 deletions src/codeflare_sdk/utils/generate_yaml.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,17 @@ def update_priority(yaml, item, dispatch_priority, priority_val):


def update_custompodresources(
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus):
item,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
workers,
head_cpus,
head_memory,
head_gpus,
):
if "custompodresources" in item.keys():
custompodresources = item.get("custompodresources")
for i in range(len(custompodresources)):
Expand All @@ -120,8 +130,8 @@ def update_custompodresources(
resource["limits"]["memory"] = str(head_memory) + "G"
resource["requests"]["nvidia.com/gpu"] = head_gpus
resource["limits"]["nvidia.com/gpu"] = head_gpus
else:

else:
for k, v in resource.items():
if k == "replicas" and i == 1:
resource[k] = workers
Expand Down Expand Up @@ -217,8 +227,8 @@ def update_nodes(
):
if "generictemplate" in item.keys():
head = item.get("generictemplate").get("spec").get("headGroupSpec")
head["rayStartParams"]["num_gpus"] = str(int(head_gpus))
head["rayStartParams"]["num-gpus"] = str(int(head_gpus))

worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
# Head counts as first worker
worker["replicas"] = workers
Expand All @@ -235,7 +245,9 @@ def update_nodes(
update_env(spec, env)
if comp == head:
# TODO: Eventually add head node configuration outside of template
update_resources(spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus)
update_resources(
spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
)
else:
update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)

Expand Down Expand Up @@ -388,7 +400,17 @@ def generate_appwrapper(
update_labels(user_yaml, instascale, instance_types)
update_priority(user_yaml, item, dispatch_priority, priority_val)
update_custompodresources(
item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)
item,
min_cpu,
max_cpu,
min_memory,
max_memory,
gpu,
workers,
head_cpus,
head_memory,
head_gpus,
)
update_nodes(
item,
appwrapper_name,
Expand Down
6 changes: 6 additions & 0 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,9 @@ def test_ray_details(mocker, capsys):
worker_gpu=0,
namespace="ns",
dashboard="fake-uri",
head_cpus=2,
head_mem=8,
head_gpu=0,
)
mocker.patch(
"codeflare_sdk.cluster.cluster.Cluster.status",
Expand Down Expand Up @@ -1685,6 +1688,9 @@ def test_cluster_status(mocker):
worker_gpu=0,
namespace="ns",
dashboard="fake-uri",
head_cpus=2,
head_mem=8,
head_gpu=0,
)
cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
Expand Down

0 comments on commit 7f5e11b

Please sign in to comment.