diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py
index 84f73198e..99b11582d 100644
--- a/src/codeflare_sdk/cluster/cluster.py
+++ b/src/codeflare_sdk/cluster/cluster.py
@@ -614,6 +614,15 @@ def _map_to_ray_cluster(rc) -> Optional[RayCluster]:
         worker_gpu=0,  # hard to detect currently how many gpus, can override it with what the user asked for
         namespace=rc["metadata"]["namespace"],
         dashboard=ray_route,
+        head_cpus=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["cpu"],
+        head_mem=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["memory"],
+        head_gpu=rc["spec"]["headGroupSpec"]["template"]["spec"]["containers"][0][
+            "resources"
+        ]["limits"]["nvidia.com/gpu"],
     )
@@ -644,6 +653,9 @@ def _copy_to_ray(cluster: Cluster) -> RayCluster:
         worker_gpu=cluster.config.num_gpus,
         namespace=cluster.config.namespace,
         dashboard=cluster.cluster_dashboard_uri(),
+        head_cpus=cluster.config.head_cpus,
+        head_mem=cluster.config.head_memory,
+        head_gpu=cluster.config.head_gpus,
     )
     if ray.status == CodeFlareClusterStatus.READY:
         ray.status = RayClusterStatus.READY
diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py
index eb45611fc..95e1c5ecb 100755
--- a/src/codeflare_sdk/utils/generate_yaml.py
+++ b/src/codeflare_sdk/utils/generate_yaml.py
@@ -107,7 +107,17 @@ def update_priority(yaml, item, dispatch_priority, priority_val):
 def update_custompodresources(
-    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus):
+    item,
+    min_cpu,
+    max_cpu,
+    min_memory,
+    max_memory,
+    gpu,
+    workers,
+    head_cpus,
+    head_memory,
+    head_gpus,
+):
     if "custompodresources" in item.keys():
         custompodresources = item.get("custompodresources")
         for i in range(len(custompodresources)):
@@ -120,8 +130,8 @@ def update_custompodresources(
                 resource["limits"]["memory"] = str(head_memory) + "G"
                 resource["requests"]["nvidia.com/gpu"] = head_gpus
                 resource["limits"]["nvidia.com/gpu"] = head_gpus
-
-            else:
+
+            else:
                 for k, v in resource.items():
                     if k == "replicas" and i == 1:
                         resource[k] = workers
@@ -217,8 +227,8 @@ def update_nodes(
 ):
     if "generictemplate" in item.keys():
         head = item.get("generictemplate").get("spec").get("headGroupSpec")
-        head["rayStartParams"]["num_gpus"] = str(int(head_gpus))
-
+        head["rayStartParams"]["num-gpus"] = str(int(head_gpus))
+
         worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
         # Head counts as first worker
         worker["replicas"] = workers
@@ -235,7 +245,9 @@ def update_nodes(
             update_env(spec, env)
             if comp == head:
                 # TODO: Eventually add head node configuration outside of template
-                update_resources(spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus)
+                update_resources(
+                    spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus
+                )
             else:
                 update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
@@ -388,7 +400,17 @@ def generate_appwrapper(
     update_labels(user_yaml, instascale, instance_types)
     update_priority(user_yaml, item, dispatch_priority, priority_val)
     update_custompodresources(
-        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)
+        item,
+        min_cpu,
+        max_cpu,
+        min_memory,
+        max_memory,
+        gpu,
+        workers,
+        head_cpus,
+        head_memory,
+        head_gpus,
+    )
     update_nodes(
         item,
         appwrapper_name,
diff --git a/tests/unit_test.py b/tests/unit_test.py
index b046b1f13..4a8e2f441 100644
--- a/tests/unit_test.py
+++ b/tests/unit_test.py
@@ -525,6 +525,9 @@ def test_ray_details(mocker, capsys):
         worker_gpu=0,
         namespace="ns",
         dashboard="fake-uri",
+        head_cpus=2,
+        head_mem=8,
+        head_gpu=0,
     )
     mocker.patch(
         "codeflare_sdk.cluster.cluster.Cluster.status",
@@ -1685,6 +1688,9 @@ def test_cluster_status(mocker):
         worker_gpu=0,
         namespace="ns",
         dashboard="fake-uri",
+        head_cpus=2,
+        head_mem=8,
+        head_gpu=0,
     )
     cf = Cluster(ClusterConfiguration(name="test", namespace="ns"))
     mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None)
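
Taken together, these changes plumb head-node sizing end to end: ClusterConfiguration's head_cpus / head_memory / head_gpus are written into the AppWrapper's custompodresources and head group template (with the rayStartParams key corrected from num_gpus to Ray's hyphenated num-gpus), and the same values are read back onto the RayCluster object as head_cpus / head_mem / head_gpu. A minimal usage sketch follows, assuming the SDK's usual import paths; the cluster name, namespace, and sizes are illustrative placeholders, not values taken from the diff:

    from codeflare_sdk.cluster.cluster import Cluster
    from codeflare_sdk.cluster.config import ClusterConfiguration

    # Request an explicitly sized head node alongside the usual worker settings.
    # Values here are placeholders; unset fields fall back to ClusterConfiguration defaults.
    config = ClusterConfiguration(
        name="demo",
        namespace="default",
        head_cpus=2,
        head_memory=8,  # suffixed with "G" by update_custompodresources
        head_gpus=0,
    )
    cluster = Cluster(config)

    # cluster.up() submits the generated AppWrapper; details() / status() later rebuild a
    # RayCluster via _copy_to_ray / _map_to_ray_cluster, where head_cpus / head_mem / head_gpu
    # reflect the head container's resource limits.
    cluster.details()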