Skip to content

Commit

Permalink
Get cluster (#189)
Browse files Browse the repository at this point in the history
* Add: get_cluster function to get cluster with specified name and namespace

* Test: make unit tests for get_cluster function
  • Loading branch information
carsonmh authored and Maxusmusti committed Jul 13, 2023
1 parent 9647455 commit 89ac249
Show file tree
Hide file tree
Showing 2 changed files with 93 additions and 3 deletions.
64 changes: 64 additions & 0 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,49 @@ def torchx_config(
to_return["requirements"] = requirements
return to_return

def from_k8_cluster_object(rc):
machine_types = (
rc["metadata"]["labels"]["orderedinstance"].split("_")
if "orderedinstance" in rc["metadata"]["labels"]
else []
)
local_interactive = (
"volumeMounts"
in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0]
)
cluster_config = ClusterConfiguration(
name=rc["metadata"]["name"],
namespace=rc["metadata"]["namespace"],
machine_types=machine_types,
min_worker=rc["spec"]["workerGroupSpecs"][0]["minReplicas"],
max_worker=rc["spec"]["workerGroupSpecs"][0]["maxReplicas"],
min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["requests"]["cpu"],
max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][
"containers"
][0]["resources"]["limits"]["cpu"],
min_memory=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["requests"]["memory"][:-1]
),
max_memory=int(
rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["memory"][:-1]
),
gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][
"resources"
]["limits"]["nvidia.com/gpu"],
instascale=True if machine_types else False,
image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][
0
]["image"],
local_interactive=local_interactive,
)
return Cluster(cluster_config)


def list_all_clusters(namespace: str, print_to_console: bool = True):
"""
Expand Down Expand Up @@ -339,6 +382,27 @@ def get_current_namespace(): # pragma: no cover
return "default"


def get_cluster(cluster_name: str, namespace: str = "default"):
try:
config.load_kube_config()
api_instance = client.CustomObjectsApi()
rcs = api_instance.list_namespaced_custom_object(
group="ray.io",
version="v1alpha1",
namespace=namespace,
plural="rayclusters",
)
except Exception as e:
return _kube_api_error_handling(e)

for rc in rcs["items"]:
if rc["metadata"]["name"] == cluster_name:
return Cluster.from_k8_cluster_object(rc)
raise FileNotFoundError(
f"Cluster {cluster_name} is not found in {namespace} namespace"
)


# private methods


Expand Down
32 changes: 29 additions & 3 deletions tests/unit_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
list_all_clusters,
list_all_queued,
_copy_to_ray,
get_cluster,
_app_wrapper_status,
_ray_cluster_status,
)
Expand Down Expand Up @@ -614,6 +615,7 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"appwrapper.mcad.ibm.com": "quicktest",
"controller-tools.k8s.io": "1.0",
"resourceName": "quicktest",
"orderedinstance": "m4.xlarge_g4dn.xlarge",
},
"managedFields": [
{
Expand Down Expand Up @@ -791,10 +793,10 @@ def get_ray_obj(group, version, namespace, plural, cls=None):
"workerGroupSpecs": [
{
"groupName": "small-group-quicktest",
"maxReplicas": 1,
"minReplicas": 1,
"maxReplicas": 2,
"minReplicas": 2,
"rayStartParams": {"block": "true", "num-gpus": "0"},
"replicas": 1,
"replicas": 2,
"template": {
"metadata": {
"annotations": {"key": "value"},
Expand Down Expand Up @@ -1529,6 +1531,30 @@ def get_aw_obj(group, version, namespace, plural):
return api_obj1


def test_get_cluster(mocker):
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
mocker.patch(
"kubernetes.client.CustomObjectsApi.list_namespaced_custom_object",
side_effect=get_ray_obj,
)
cluster = get_cluster("quicktest")
cluster_config = cluster.config
assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns"
assert (
"m4.xlarge" in cluster_config.machine_types
and "g4dn.xlarge" in cluster_config.machine_types
)
assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1
assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2
assert cluster_config.gpu == 0
assert cluster_config.instascale
assert (
cluster_config.image
== "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
)
assert cluster_config.min_worker == 2 and cluster_config.max_worker == 2


def test_list_clusters(mocker, capsys):
mocker.patch("kubernetes.config.load_kube_config", return_value="ignore")
mocker.patch(
Expand Down

0 comments on commit 89ac249

Please sign in to comment.