add ability to configure head node
MichaelClifford committed Sep 23, 2023
1 parent abec0ef commit 7d6e294
Showing 4 changed files with 55 additions and 28 deletions.
6 changes: 6 additions & 0 deletions src/codeflare_sdk/cluster/cluster.py
@@ -109,6 +109,9 @@ def create_app_wrapper(self):
 
         name = self.config.name
         namespace = self.config.namespace
+        head_cpus = self.config.head_cpus
+        head_memory = self.config.head_memory
+        head_gpus = self.config.head_gpus
         min_cpu = self.config.min_cpus
         max_cpu = self.config.max_cpus
         min_memory = self.config.min_memory
@@ -126,6 +129,9 @@ def create_app_wrapper(self):
         return generate_appwrapper(
             name=name,
             namespace=namespace,
+            head_cpus=head_cpus,
+            head_memory=head_memory,
+            head_gpus=head_gpus,
             min_cpu=min_cpu,
             max_cpu=max_cpu,
             min_memory=min_memory,
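
With the new head_* fields read from the configuration and passed through create_app_wrapper, head node sizing can be requested when a cluster is defined. A minimal usage sketch: the resource values are illustrative, and the worker-side fields (num_workers, num_gpus) and the up() call are assumed from the existing SDK surface rather than shown in this diff.

from codeflare_sdk.cluster.cluster import Cluster
from codeflare_sdk.cluster.config import ClusterConfiguration

# Illustrative values only; anything not listed keeps its default.
cluster = Cluster(
    ClusterConfiguration(
        name="raytest",
        namespace="default",
        head_cpus=4,      # CPUs for the Ray head pod
        head_memory=16,   # head pod memory, in G
        head_gpus=1,      # GPUs reserved for the head pod
        num_workers=2,
        min_cpus=2,
        max_cpus=2,
        min_memory=8,
        max_memory=8,
        num_gpus=0,
    )
)
cluster.up()  # submits the generated AppWrapper
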
3 changes: 3 additions & 0 deletions src/codeflare_sdk/cluster/config.py
@@ -34,6 +34,9 @@ class ClusterConfiguration:
     name: str
     namespace: str = None
     head_info: list = field(default_factory=list)
+    head_cpus: int = 2
+    head_memory: int = 8
+    head_gpus: int = 0
     machine_types: list = field(default_factory=list)  # ["m4.xlarge", "g4dn.xlarge"]
     min_cpus: int = 1
     max_cpus: int = 1
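
The defaults keep existing configurations on the previous head node sizing (2 CPUs, 8G of memory, no GPUs). A quick sketch of what an untouched configuration now reports:

from codeflare_sdk.cluster.config import ClusterConfiguration

config = ClusterConfiguration(name="raytest", namespace="default")
# The new fields fall back to the defaults declared above.
print(config.head_cpus, config.head_memory, config.head_gpus)  # 2 8 0
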
3 changes: 3 additions & 0 deletions src/codeflare_sdk/cluster/model.py
@@ -69,6 +69,9 @@ class RayCluster:
 
     name: str
     status: RayClusterStatus
+    head_cpus: int
+    head_mem: str
+    head_gpu: int
     workers: int
     worker_mem_min: str
     worker_mem_max: str
71 changes: 43 additions & 28 deletions src/codeflare_sdk/utils/generate_yaml.py
@@ -107,35 +107,41 @@ def update_priority(yaml, item, dispatch_priority, priority_val):
 
 
 def update_custompodresources(
-    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-):
+    item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus):
     if "custompodresources" in item.keys():
         custompodresources = item.get("custompodresources")
         for i in range(len(custompodresources)):
+            resource = custompodresources[i]
             if i == 0:
                 # Leave head node resources as template default
-                continue
-            resource = custompodresources[i]
-            for k, v in resource.items():
-                if k == "replicas" and i == 1:
-                    resource[k] = workers
-                if k == "requests" or k == "limits":
-                    for spec, _ in v.items():
-                        if spec == "cpu":
-                            if k == "limits":
-                                resource[k][spec] = max_cpu
-                            else:
-                                resource[k][spec] = min_cpu
-                        if spec == "memory":
-                            if k == "limits":
-                                resource[k][spec] = str(max_memory) + "G"
-                            else:
-                                resource[k][spec] = str(min_memory) + "G"
-                        if spec == "nvidia.com/gpu":
-                            if i == 0:
-                                resource[k][spec] = 0
-                            else:
-                                resource[k][spec] = gpu
+                resource["requests"]["cpu"] = head_cpus
+                resource["limits"]["cpu"] = head_cpus
+                resource["requests"]["memory"] = str(head_memory) + "G"
+                resource["limits"]["memory"] = str(head_memory) + "G"
+                resource["requests"]["nvidia.com/gpu"] = head_gpus
+                resource["limits"]["nvidia.com/gpu"] = head_gpus
+
+            else:
+                for k, v in resource.items():
+                    if k == "replicas" and i == 1:
+                        resource[k] = workers
+                    if k == "requests" or k == "limits":
+                        for spec, _ in v.items():
+                            if spec == "cpu":
+                                if k == "limits":
+                                    resource[k][spec] = max_cpu
+                                else:
+                                    resource[k][spec] = min_cpu
+                            if spec == "memory":
+                                if k == "limits":
+                                    resource[k][spec] = str(max_memory) + "G"
+                                else:
+                                    resource[k][spec] = str(min_memory) + "G"
+                            if spec == "nvidia.com/gpu":
+                                if i == 0:
+                                    resource[k][spec] = 0
+                                else:
+                                    resource[k][spec] = gpu
     else:
         sys.exit("Error: malformed template")
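
The head entry of custompodresources (index 0) was previously left at the template defaults; it now receives explicit requests and limits from the head_* arguments. A standalone sketch of that branch, with the surrounding AppWrapper structure elided and the input shape assumed from the template:

def patch_head_podresources(custompodresources, head_cpus, head_memory, head_gpus):
    # Mirror of the new i == 0 branch: pin requests and limits for the head pod.
    head_resource = custompodresources[0]
    for section in ("requests", "limits"):
        head_resource[section]["cpu"] = head_cpus
        head_resource[section]["memory"] = str(head_memory) + "G"
        head_resource[section]["nvidia.com/gpu"] = head_gpus
    return custompodresources

# Input shaped like the template's custompodresources list (head entry first).
resources = [
    {
        "replicas": 1,
        "requests": {"cpu": 2, "memory": "8G", "nvidia.com/gpu": 0},
        "limits": {"cpu": 2, "memory": "8G", "nvidia.com/gpu": 0},
    }
]
print(patch_head_podresources(resources, head_cpus=4, head_memory=16, head_gpus=1))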

@@ -205,11 +211,15 @@ def update_nodes(
     instascale,
     env,
     image_pull_secrets,
+    head_cpus,
+    head_memory,
+    head_gpus,
 ):
     if "generictemplate" in item.keys():
         head = item.get("generictemplate").get("spec").get("headGroupSpec")
+        head["rayStartParams"]["num_gpus"] = str(int(head_gpus))
+
         worker = item.get("generictemplate").get("spec").get("workerGroupSpecs")[0]
-
         # Head counts as first worker
         worker["replicas"] = workers
         worker["minReplicas"] = workers
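
The head group's rayStartParams now advertises the requested GPU count to Ray, stored as a string under the num_gpus key used in the diff. A toy sketch of that assignment; the generictemplate dict here is a stand-in for the real template:

generictemplate = {"spec": {"headGroupSpec": {"rayStartParams": {"num_gpus": "0"}}}}
head_gpus = 1

head = generictemplate["spec"]["headGroupSpec"]
head["rayStartParams"]["num_gpus"] = str(int(head_gpus))
print(head["rayStartParams"])  # {'num_gpus': '1'}
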
@@ -225,7 +235,7 @@ def update_nodes(
             update_env(spec, env)
             if comp == head:
                 # TODO: Eventually add head node configuration outside of template
-                continue
+                update_resources(spec, head_cpus, head_cpus, head_memory, head_memory, head_gpus)
             else:
                 update_resources(spec, min_cpu, max_cpu, min_memory, max_memory, gpu)
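
Instead of skipping the head pod, the loop now sizes it with update_resources, passing head_cpus and head_memory as both the minimum and the maximum so requests equal limits. A simplified stand-in for what that call does to the head pod spec (not the actual update_resources implementation):

def set_container_resources(spec, cpus, memory, gpus):
    # Requests and limits are pinned to the same values for the head node.
    resources = {
        "cpu": cpus,
        "memory": str(memory) + "G",
        "nvidia.com/gpu": gpus,
    }
    spec["containers"][0]["resources"] = {
        "requests": dict(resources),
        "limits": dict(resources),
    }

head_spec = {"containers": [{"name": "ray-head"}]}
set_container_resources(head_spec, cpus=2, memory=8, gpus=0)
print(head_spec["containers"][0]["resources"])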

@@ -350,6 +360,9 @@ def write_user_appwrapper(user_yaml, output_file_name):
 def generate_appwrapper(
     name: str,
     namespace: str,
+    head_cpus: int,
+    head_memory: int,
+    head_gpus: int,
     min_cpu: int,
     max_cpu: int,
     min_memory: int,
@@ -375,8 +388,7 @@ def generate_appwrapper(
     update_labels(user_yaml, instascale, instance_types)
     update_priority(user_yaml, item, dispatch_priority, priority_val)
     update_custompodresources(
-        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers
-    )
+        item, min_cpu, max_cpu, min_memory, max_memory, gpu, workers, head_cpus, head_memory, head_gpus)
     update_nodes(
         item,
         appwrapper_name,
@@ -390,6 +402,9 @@ def generate_appwrapper(
         instascale,
         env,
         image_pull_secrets,
+        head_cpus,
+        head_memory,
+        head_gpus,
     )
     update_dashboard_route(route_item, cluster_name, namespace)
     if local_interactive: