From d7ae866670003c7e2529b050157b3106f5a1b0a2 Mon Sep 17 00:00:00 2001 From: Mustafa Eyceoz Date: Thu, 20 Jul 2023 15:28:08 -0400 Subject: [PATCH] K8s Support for SDK (#146) * First pass on just the basics * Added namespace retrieval and dashboard route access via kubernetes * Added exception handling * Remove unnecessary comment * Change AW fs loading to k8s and begin converting unit tests * Finished unit test update * Update requirements * Get cluster (#189) * Add: get_cluster function to get cluster with specified name and namespace * Test: make unit tests for get_cluster function * Fix: unit test failing because of ray cluster obj changed (#208) * Remove oc client and add helper functions (#187) * Remove oc client and add helper functions * Updates for error checking * make sure tests run without oc login * Removing CLI appwrapper generation * Updated import --------- Co-authored-by: Mustafa Eyceoz * Remove unused import * Update authentication for K8s (#237) * Updated authentication for Kubernetes * Updated template name and comment * Updated login functionality * Altered config_check() function * Altered comments and changed config_check() function * Added logic for handling current namespace when a user authenticates via kube client * Changed formatting * Made handler functions generic and altered get_current_namespace() functionality * Changed error message for cluster configuration * Removed default values for token + server * Added check for correct credentials * Changed how using certs works with certifi.where * Added unit tests for new authentication methods * Fixed formatting and updated .gitignore to include test created files * Fixed .gitignore * Updated unit authentication tests --------- Co-authored-by: Carson Harrell <64709520+carsonmh@users.noreply.github.com> Co-authored-by: ted chang Co-authored-by: Mark Campbell --- .gitignore | 2 + .../interactive/local_interactive.ipynb | 88 +- pyproject.toml | 3 +- requirements.txt | 2 + src/codeflare_sdk/cluster/auth.py | 151 +- src/codeflare_sdk/cluster/awload.py | 67 +- src/codeflare_sdk/cluster/cluster.py | 387 ++-- src/codeflare_sdk/cluster/config.py | 2 - src/codeflare_sdk/job/jobs.py | 4 +- src/codeflare_sdk/utils/generate_cert.py | 5 +- src/codeflare_sdk/utils/generate_yaml.py | 147 +- src/codeflare_sdk/utils/kube_api_helpers.py | 44 + tests/test-case-cmd.yaml | 173 -- tests/unit_test.py | 1966 +++++++++-------- 14 files changed, 1525 insertions(+), 1516 deletions(-) create mode 100644 src/codeflare_sdk/utils/kube_api_helpers.py delete mode 100644 tests/test-case-cmd.yaml diff --git a/.gitignore b/.gitignore index eef1052fe..fbb31b2b9 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,5 @@ Pipfile.lock poetry.lock .venv* build/ +tls-cluster-namespace +quicktest.yaml diff --git a/demo-notebooks/interactive/local_interactive.ipynb b/demo-notebooks/interactive/local_interactive.ipynb index 88a6ccd58..d70c00df7 100644 --- a/demo-notebooks/interactive/local_interactive.ipynb +++ b/demo-notebooks/interactive/local_interactive.ipynb @@ -32,20 +32,12 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "4364ac2e-dd10-4d30-ba66-12708daefb3f", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Written to: hfgputest-1.yaml\n" - ] - } - ], + "outputs": [], "source": [ "# Create our cluster and submit appwrapper\n", "namespace = \"default\"\n", @@ -89,7 +81,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "12eef53c", "metadata": {}, @@ -99,38 +90,21 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "cf1b749e-2335-42c2-b673-26768ec9895d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n" - ] - } - ], + "outputs": [], "source": [ - "import openshift as oc\n", "from codeflare_sdk.utils import generate_cert\n", "\n", "if local_interactive:\n", " generate_cert.generate_tls_cert(cluster_name, namespace)\n", - " generate_cert.export_env(cluster_name, namespace)\n", - "\n", - "with oc.project(namespace):\n", - " routes=oc.selector(\"route\").objects()\n", - " rayclient_url=\"\"\n", - " for r in routes:\n", - " if \"rayclient\" in r.name():\n", - " rayclient_url=r.model.spec.host\n", - "print(rayclient_url)" + " generate_cert.export_env(cluster_name, namespace)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7", "metadata": { "scrolled": true, @@ -141,15 +115,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", - "2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", - "2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", - "2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", - "2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n", - "2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", - "2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" + "2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n", + "2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n", + "2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n", + "2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n", + "2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n", + "2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n", + "2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n" ] }, { @@ -190,7 +164,7 @@ " \n", " \n", " Dashboard:\n", - " http://10.254.12.141:8265\n", + " http://10.254.20.41:8265\n", "\n", "\n", " \n", @@ -198,10 +172,10 @@ "\n" ], "text/plain": [ - "ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" + "ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=)" ] }, - "execution_count": 12, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -210,12 +184,12 @@ "import ray\n", "\n", "ray.shutdown()\n", - "ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")" + "ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "id": "3436eb4a-217c-4109-a3c3-309fda7e2442", "metadata": {}, "outputs": [], @@ -239,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "id": "5cca1874-2be3-4631-ae48-9adfa45e3af3", "metadata": { "scrolled": true, @@ -250,8 +224,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", - "2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" + "2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n", + "2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n" ] } ], @@ -261,7 +235,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 9, "id": "01172c29-e8bf-41ef-8db5-eccb07906111", "metadata": {}, "outputs": [ @@ -269,8 +243,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", - "2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" + "2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n", + "2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n" ] }, { @@ -279,7 +253,7 @@ "1789.4644387076714" ] }, - "execution_count": 15, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -290,7 +264,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 10, "id": "9e79b547-a457-4232-b77d-19147067b972", "metadata": {}, "outputs": [ @@ -298,10 +272,10 @@ "name": "stderr", "output_type": "stream", "text": [ - "2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", + "2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n", "}\n", "\n", - "2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" + "2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n" ] } ], @@ -312,7 +286,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 11, "id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2", "metadata": {}, "outputs": [], diff --git a/pyproject.toml b/pyproject.toml index ffbce20f1..e4224fd99 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,10 @@ python = "^3.7" openshift-client = "1.0.18" rich = "^12.5" ray = {version = "2.5.0", extras = ["default"]} -kubernetes = "25.3.0" +kubernetes = ">= 25.3.0, < 27" codeflare-torchx = "0.6.0.dev0" cryptography = "40.0.2" +executing = "1.2.0" [tool.poetry.group.docs] optional = true diff --git a/requirements.txt b/requirements.txt index c654bf782..2a48812aa 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,5 @@ ray[default]==2.5.0 kubernetes>=25.3.0,<27 codeflare-torchx==0.6.0.dev0 pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000 +cryptography==40.0.2 +executing==1.2.0 diff --git a/src/codeflare_sdk/cluster/auth.py b/src/codeflare_sdk/cluster/auth.py index 33ad8cf7d..85db3d61d 100644 --- a/src/codeflare_sdk/cluster/auth.py +++ b/src/codeflare_sdk/cluster/auth.py @@ -20,8 +20,12 @@ """ import abc -import openshift as oc -from openshift import OpenShiftPythonException +from kubernetes import client, config + +global api_client +api_client = None +global config_path +config_path = None class Authentication(metaclass=abc.ABCMeta): @@ -43,80 +47,131 @@ def logout(self): pass +class KubeConfiguration(metaclass=abc.ABCMeta): + """ + An abstract class that defines the method for loading a user defined config file using the `load_kube_config()` function + """ + + def load_kube_config(self): + """ + Method for setting your Kubernetes configuration to a certain file + """ + pass + + def logout(self): + """ + Method for logging out of the remote cluster + """ + pass + + class TokenAuthentication(Authentication): """ - `TokenAuthentication` is a subclass of `Authentication`. It can be used to authenticate to an OpenShift + `TokenAuthentication` is a subclass of `Authentication`. It can be used to authenticate to a Kubernetes cluster when the user has an API token and the API server address. """ - def __init__(self, token: str = None, server: str = None, skip_tls: bool = False): + def __init__( + self, + token: str, + server: str, + skip_tls: bool = False, + ca_cert_path: str = None, + ): """ Initialize a TokenAuthentication object that requires a value for `token`, the API Token - and `server`, the API server address for authenticating to an OpenShift cluster. + and `server`, the API server address for authenticating to a Kubernetes cluster. """ self.token = token self.server = server self.skip_tls = skip_tls + self.ca_cert_path = ca_cert_path def login(self) -> str: """ - This function is used to login to an OpenShift cluster using the user's API token and API server address. - Depending on the cluster, a user can choose to login in with "--insecure-skip-tls-verify` by setting `skip_tls` - to `True`. + This function is used to log in to a Kubernetes cluster using the user's API token and API server address. + Depending on the cluster, a user can choose to login in with `--insecure-skip-tls-verify` by setting `skip_tls` + to `True` or `--certificate-authority` by setting `skip_tls` to False and providing a path to a ca bundle with `ca_cert_path`. """ - args = [f"--token={self.token}", f"--server={self.server}"] - if self.skip_tls: - args.append("--insecure-skip-tls-verify") + global config_path + global api_client try: - response = oc.invoke("login", args) - except OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "The server uses a certificate signed by unknown authority" in error_msg: - return "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication" - elif "invalid" in error_msg: - raise PermissionError(error_msg) + configuration = client.Configuration() + configuration.api_key_prefix["authorization"] = "Bearer" + configuration.host = self.server + configuration.api_key["authorization"] = self.token + if self.skip_tls == False and self.ca_cert_path == None: + configuration.verify_ssl = True + elif self.skip_tls == False: + configuration.ssl_ca_cert = self.ca_cert_path else: - return error_msg - return response.out() + configuration.verify_ssl = False + api_client = client.ApiClient(configuration) + client.AuthenticationApi(api_client).get_api_group() + config_path = None + return "Logged into %s" % self.server + except client.ApiException: # pragma: no cover + api_client = None + print("Authentication Error please provide the correct token + server") def logout(self) -> str: """ - This function is used to logout of an OpenShift cluster. + This function is used to logout of a Kubernetes cluster. """ - args = [f"--token={self.token}", f"--server={self.server}"] - response = oc.invoke("logout", args) - return response.out() + global config_path + config_path = None + global api_client + api_client = None + return "Successfully logged out of %s" % self.server -class PasswordUserAuthentication(Authentication): +class KubeConfigFileAuthentication(KubeConfiguration): """ - `PasswordUserAuthentication` is a subclass of `Authentication`. It can be used to authenticate to an OpenShift - cluster when the user has a username and password. + A class that defines the necessary methods for passing a user's own Kubernetes config file. + Specifically this class defines the `load_kube_config()` and `config_check()` functions. """ - def __init__( - self, - username: str = None, - password: str = None, - ): - """ - Initialize a PasswordUserAuthentication object that requires a value for `username` - and `password` for authenticating to an OpenShift cluster. - """ - self.username = username - self.password = password + def __init__(self, kube_config_path: str = None): + self.kube_config_path = kube_config_path - def login(self) -> str: + def load_kube_config(self): """ - This function is used to login to an OpenShift cluster using the user's `username` and `password`. + Function for loading a user's own predefined Kubernetes config file. """ - response = oc.login(self.username, self.password) - return response.out() + global config_path + global api_client + try: + if self.kube_config_path == None: + return "Please specify a config file path" + config_path = self.kube_config_path + api_client = None + config.load_kube_config(config_path) + response = "Loaded user config file at path %s" % self.kube_config_path + except config.ConfigException: # pragma: no cover + config_path = None + raise Exception("Please specify a config file path") + return response + + +def config_check() -> str: + """ + Function for loading the config file at the default config location ~/.kube/config if the user has not + specified their own config file or has logged in with their token and server. + """ + global config_path + global api_client + if config_path == None and api_client == None: + config.load_kube_config() + if config_path != None and api_client == None: + return config_path - def logout(self) -> str: - """ - This function is used to logout of an OpenShift cluster. - """ - response = oc.invoke("logout") - return response.out() + +def api_config_handler() -> str: + """ + This function is used to load the api client if the user has logged in + """ + if api_client != None and config_path == None: + return api_client + else: + return None diff --git a/src/codeflare_sdk/cluster/awload.py b/src/codeflare_sdk/cluster/awload.py index 5621d6734..12544ebac 100644 --- a/src/codeflare_sdk/cluster/awload.py +++ b/src/codeflare_sdk/cluster/awload.py @@ -20,9 +20,12 @@ from os.path import isfile import errno import os -import openshift as oc import yaml +from kubernetes import client, config +from ..utils.kube_api_helpers import _kube_api_error_handling +from .auth import config_check, api_config_handler + class AWManager: """ @@ -40,10 +43,10 @@ def __init__(self, filename: str) -> None: self.filename = filename try: with open(self.filename) as f: - awyaml = yaml.load(f, Loader=yaml.FullLoader) - assert awyaml["kind"] == "AppWrapper" - self.name = awyaml["metadata"]["name"] - self.namespace = awyaml["metadata"]["namespace"] + self.awyaml = yaml.load(f, Loader=yaml.FullLoader) + assert self.awyaml["kind"] == "AppWrapper" + self.name = self.awyaml["metadata"]["name"] + self.namespace = self.awyaml["metadata"]["namespace"] except: raise ValueError( f"{filename } is not a correctly formatted AppWrapper yaml" @@ -55,19 +58,17 @@ def submit(self) -> None: Attempts to create the AppWrapper custom resource using the yaml file """ try: - with oc.project(self.namespace): - oc.invoke("create", ["-f", self.filename]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg or "Forbidden" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "AlreadyExists" in error_msg: - raise FileExistsError( - f"An AppWrapper of the name {self.name} already exists in namespace {self.namespace}" - ) - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + body=self.awyaml, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = True print(f"AppWrapper {self.filename} submitted!") @@ -82,25 +83,17 @@ def remove(self) -> None: return try: - with oc.project(self.namespace): - oc.invoke("delete", ["AppWrapper", self.name]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - elif "not found" in error_msg: - self.submitted = False - print("AppWrapper not found, was deleted in another manner") - return - else: - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=self.namespace, + plural="appwrappers", + name=self.name, + ) + except Exception as e: + return _kube_api_error_handling(e) self.submitted = False print(f"AppWrapper {self.name} removed!") diff --git a/src/codeflare_sdk/cluster/cluster.py b/src/codeflare_sdk/cluster/cluster.py index c09e981b3..ff92bfcf0 100644 --- a/src/codeflare_sdk/cluster/cluster.py +++ b/src/codeflare_sdk/cluster/cluster.py @@ -18,15 +18,15 @@ cluster setup queue, a list of all existing clusters, and the user's working namespace. """ -from os import stat from time import sleep from typing import List, Optional, Tuple, Dict -import openshift as oc from ray.job_submission import JobSubmissionClient +from .auth import config_check, api_config_handler from ..utils import pretty_print from ..utils.generate_yaml import generate_appwrapper +from ..utils.kube_api_helpers import _kube_api_error_handling from .config import ClusterConfiguration from .model import ( AppWrapper, @@ -35,6 +35,9 @@ RayCluster, RayClusterStatus, ) +from kubernetes import client, config +import yaml +import os class Cluster: @@ -65,8 +68,10 @@ def create_app_wrapper(self): """ if self.config.namespace is None: - self.config.namespace = oc.get_project_name() - if type(self.config.namespace) is not str: + self.config.namespace = get_current_namespace() + if self.config.namespace is None: + print("Please specify with namespace=") + elif type(self.config.namespace) is not str: raise TypeError( f"Namespace {self.config.namespace} is of type {type(self.config.namespace)}. Check your Kubernetes Authentication." ) @@ -112,15 +117,19 @@ def up(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("apply", ["-f", self.app_wrapper_yaml]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if "Unauthorized" in error_msg: - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + with open(self.app_wrapper_yaml) as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + api_instance.create_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + body=aw, + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) def down(self): """ @@ -129,23 +138,17 @@ def down(self): """ namespace = self.config.namespace try: - with oc.project(namespace): - oc.invoke("delete", ["AppWrapper", self.app_wrapper_name]) - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "AppWrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you run auth.login()/cluster.up() yet?" - ) - elif "not found" in error_msg: - print("Cluster not found, have you run cluster.up() yet?") - else: - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + api_instance.delete_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + name=self.app_wrapper_name, + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) def status( self, print_to_console: bool = True @@ -247,16 +250,21 @@ def cluster_dashboard_uri(self) -> str: Returns a string containing the cluster's dashboard URI. """ try: - with oc.project(self.config.namespace): - route = oc.invoke( - "get", ["route", "-o", "jsonpath='{$.items[*].spec.host}'"] - ) - route = route.out().split(" ") - route = [x for x in route if f"ray-dashboard-{self.config.name}" in x] - route = route[0].strip().strip("'") - return f"http://{route}" - except: - return "Dashboard route not available yet, have you run cluster.up()?" + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=self.config.namespace, + plural="routes", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{self.config.name}": + return f"http://{route['spec']['host']}" + return "Dashboard route not available yet, have you run cluster.up()?" def list_jobs(self) -> List: """ @@ -296,6 +304,56 @@ def torchx_config( to_return["requirements"] = requirements return to_return + def from_k8_cluster_object(rc): + machine_types = ( + rc["metadata"]["labels"]["orderedinstance"].split("_") + if "orderedinstance" in rc["metadata"]["labels"] + else [] + ) + local_interactive = ( + "volumeMounts" + in rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0] + ) + cluster_config = ClusterConfiguration( + name=rc["metadata"]["name"], + namespace=rc["metadata"]["namespace"], + machine_types=machine_types, + min_worker=rc["spec"]["workerGroupSpecs"][0]["minReplicas"], + max_worker=rc["spec"]["workerGroupSpecs"][0]["maxReplicas"], + min_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["cpu"], + max_cpus=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["cpu"], + min_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["requests"]["memory"][:-1] + ), + max_memory=int( + rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["memory"][:-1] + ), + gpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][0][ + "resources" + ]["limits"]["nvidia.com/gpu"], + instascale=True if machine_types else False, + image=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["image"], + local_interactive=local_interactive, + ) + return Cluster(cluster_config) + + def local_client_url(self): + if self.config.local_interactive == True: + ingress_domain = _get_ingress_domain() + return f"ray://rayclient-{self.config.name}-{self.config.namespace}.{ingress_domain}" + else: + return "None" + def list_all_clusters(namespace: str, print_to_console: bool = True): """ @@ -320,78 +378,120 @@ def list_all_queued(namespace: str, print_to_console: bool = True): return app_wrappers +def get_current_namespace(): # pragma: no cover + if api_config_handler() != None: + if os.path.isfile("/var/run/secrets/kubernetes.io/serviceaccount/namespace"): + try: + file = open( + "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r" + ) + active_context = file.readline().strip("\n") + return active_context + except Exception as e: + print("Unable to find current namespace") + return None + else: + print("Unable to find current namespace") + return None + else: + try: + _, active_context = config.list_kube_config_contexts(config_check()) + except Exception as e: + return _kube_api_error_handling(e) + try: + return active_context["context"]["namespace"] + except KeyError: + return None + + +def get_cluster(cluster_name: str, namespace: str = "default"): + try: + config.load_kube_config() + api_instance = client.CustomObjectsApi() + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: + return _kube_api_error_handling(e) + + for rc in rcs["items"]: + if rc["metadata"]["name"] == cluster_name: + return Cluster.from_k8_cluster_object(rc) + raise FileNotFoundError( + f"Cluster {cluster_name} is not found in {namespace} namespace" + ) + + # private methods +def _get_ingress_domain(): + try: + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + return ingress["spec"]["domain"] def _app_wrapper_status(name, namespace="default") -> Optional[AppWrapper]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"appwrapper/{name}").object() - except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "appwrapper"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp - - if cluster: - return _map_to_app_wrapper(cluster) - - return cluster + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for aw in aws["items"]: + if aw["metadata"]["name"] == name: + return _map_to_app_wrapper(aw) + return None def _ray_cluster_status(name, namespace="default") -> Optional[RayCluster]: - cluster = None try: - with oc.project(namespace), oc.timeout(10 * 60): - cluster = oc.selector(f"rayclusters/{name}").object() - except oc.OpenShiftPythonException as osp: # pragma: no cover - msg = osp.msg - if "Expected a single object, but selected 0" in msg: - return cluster - error_msg = osp.result.err() - if not ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise osp - - if cluster: - return _map_to_ray_cluster(cluster) - - return cluster + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + + for rc in rcs["items"]: + if rc["metadata"]["name"] == name: + return _map_to_ray_cluster(rc) + return None def _get_ray_clusters(namespace="default") -> List[RayCluster]: list_of_clusters = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - ray_clusters = oc.selector("rayclusters").objects() - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "rayclusters"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + rcs = api_instance.list_namespaced_custom_object( + group="ray.io", + version="v1alpha1", + namespace=namespace, + plural="rayclusters", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) - for cluster in ray_clusters: - list_of_clusters.append(_map_to_ray_cluster(cluster)) + for rc in rcs["items"]: + list_of_clusters.append(_map_to_ray_cluster(rc)) return list_of_clusters @@ -401,23 +501,18 @@ def _get_app_wrappers( list_of_app_wrappers = [] try: - with oc.project(namespace), oc.timeout(10 * 60): - app_wrappers = oc.selector("appwrappers").objects() - except oc.OpenShiftPythonException as osp: # pragma: no cover - error_msg = osp.result.err() - if ( - 'the server doesn\'t have a resource type "appwrappers"' in error_msg - or "forbidden" in error_msg - or "Unauthorized" in error_msg - or "Missing or incomplete configuration" in error_msg - ): - raise PermissionError( - "Action not permitted, have you put in correct/up-to-date auth credentials?" - ) - else: - raise osp + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + aws = api_instance.list_namespaced_custom_object( + group="mcad.ibm.com", + version="v1beta1", + namespace=namespace, + plural="appwrappers", + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) - for item in app_wrappers: + for item in aws["items"]: app_wrapper = _map_to_app_wrapper(item) if filter and app_wrapper.status in filter: list_of_app_wrappers.append(app_wrapper) @@ -427,48 +522,52 @@ def _get_app_wrappers( return list_of_app_wrappers -def _map_to_ray_cluster(cluster) -> Optional[RayCluster]: - cluster_model = cluster.model - if type(cluster_model.status.state) == oc.model.MissingModel: - status = RayClusterStatus.UNKNOWN +def _map_to_ray_cluster(rc) -> Optional[RayCluster]: + if "state" in rc["status"]: + status = RayClusterStatus(rc["status"]["state"].lower()) else: - status = RayClusterStatus(cluster_model.status.state.lower()) + status = RayClusterStatus.UNKNOWN - with oc.project(cluster.namespace()), oc.timeout(10 * 60): - route = ( - oc.selector(f"route/ray-dashboard-{cluster.name()}") - .object() - .model.spec.host - ) + config_check() + api_instance = client.CustomObjectsApi(api_config_handler()) + routes = api_instance.list_namespaced_custom_object( + group="route.openshift.io", + version="v1", + namespace=rc["metadata"]["namespace"], + plural="routes", + ) + ray_route = None + for route in routes["items"]: + if route["metadata"]["name"] == f"ray-dashboard-{rc['metadata']['name']}": + ray_route = route["spec"]["host"] return RayCluster( - name=cluster.name(), + name=rc["metadata"]["name"], status=status, # for now we are not using autoscaling so same replicas is fine - min_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - max_workers=cluster_model.spec.workerGroupSpecs[0].replicas, - worker_mem_max=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.memory, - worker_mem_min=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.requests.memory, - worker_cpu=cluster_model.spec.workerGroupSpecs[0] - .template.spec.containers[0] - .resources.limits.cpu, + min_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + max_workers=rc["spec"]["workerGroupSpecs"][0]["replicas"], + worker_mem_max=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["limits"]["memory"], + worker_mem_min=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"][ + "containers" + ][0]["resources"]["requests"]["memory"], + worker_cpu=rc["spec"]["workerGroupSpecs"][0]["template"]["spec"]["containers"][ + 0 + ]["resources"]["limits"]["cpu"], worker_gpu=0, # hard to detect currently how many gpus, can override it with what the user asked for - namespace=cluster.namespace(), - dashboard=route, + namespace=rc["metadata"]["namespace"], + dashboard=ray_route, ) -def _map_to_app_wrapper(cluster) -> AppWrapper: - cluster_model = cluster.model +def _map_to_app_wrapper(aw) -> AppWrapper: return AppWrapper( - name=cluster.name(), - status=AppWrapperStatus(cluster_model.status.state.lower()), - can_run=cluster_model.status.canrun, - job_state=cluster_model.status.queuejobstate, + name=aw["metadata"]["name"], + status=AppWrapperStatus(aw["status"]["state"].lower()), + can_run=aw["status"]["canrun"], + job_state=aw["status"]["queuejobstate"], ) diff --git a/src/codeflare_sdk/cluster/config.py b/src/codeflare_sdk/cluster/config.py index f24425635..31f70d6b9 100644 --- a/src/codeflare_sdk/cluster/config.py +++ b/src/codeflare_sdk/cluster/config.py @@ -19,9 +19,7 @@ """ from dataclasses import dataclass, field -from .auth import Authentication import pathlib -import openshift dir = pathlib.Path(__file__).parent.parent.resolve() diff --git a/src/codeflare_sdk/job/jobs.py b/src/codeflare_sdk/job/jobs.py index 6b5ce0a53..b1db70d54 100644 --- a/src/codeflare_sdk/job/jobs.py +++ b/src/codeflare_sdk/job/jobs.py @@ -17,13 +17,13 @@ from typing import TYPE_CHECKING, Optional, Dict, List from pathlib import Path -import openshift as oc from torchx.components.dist import ddp from torchx.runner import get_runner from torchx.specs import AppHandle, parse_app_handle, AppDryRunInfo if TYPE_CHECKING: from ..cluster.cluster import Cluster +from ..cluster.cluster import get_current_namespace all_jobs: List["Job"] = [] torchx_runner = get_runner() @@ -124,7 +124,7 @@ def _missing_spec(self, spec: str): def _dry_run_no_cluster(self): if self.scheduler_args is not None: if self.scheduler_args.get("namespace") is None: - self.scheduler_args["namespace"] = oc.get_project_name() + self.scheduler_args["namespace"] = get_current_namespace() return torchx_runner.dryrun( app=ddp( *self.script_args, diff --git a/src/codeflare_sdk/utils/generate_cert.py b/src/codeflare_sdk/utils/generate_cert.py index 2d73621b8..04b04d3e0 100644 --- a/src/codeflare_sdk/utils/generate_cert.py +++ b/src/codeflare_sdk/utils/generate_cert.py @@ -19,6 +19,7 @@ from cryptography import x509 from cryptography.x509.oid import NameOID import datetime +from ..cluster.auth import config_check, api_config_handler from kubernetes import client, config @@ -82,8 +83,8 @@ def generate_tls_cert(cluster_name, namespace, days=30): # Similar to: # oc get secret ca-secret- -o template='{{index .data "ca.key"}}' # oc get secret ca-secret- -o template='{{index .data "ca.crt"}}'|base64 -d > ${TLSDIR}/ca.crt - config.load_kube_config() - v1 = client.CoreV1Api() + config_check() + v1 = client.CoreV1Api(api_config_handler()) secret = v1.read_namespaced_secret(f"ca-secret-{cluster_name}", namespace).data ca_cert = secret.get("ca.crt") ca_key = secret.get("ca.key") diff --git a/src/codeflare_sdk/utils/generate_yaml.py b/src/codeflare_sdk/utils/generate_yaml.py index 9538a1e8e..c4361abae 100755 --- a/src/codeflare_sdk/utils/generate_yaml.py +++ b/src/codeflare_sdk/utils/generate_yaml.py @@ -21,7 +21,8 @@ import sys import argparse import uuid -import openshift as oc +from kubernetes import client, config +from .kube_api_helpers import _kube_api_error_handling def read_template(template): @@ -248,12 +249,16 @@ def enable_local_interactive(resources, cluster_name, namespace): ][0].get("command")[2] command = command.replace("deployment-name", cluster_name) - - server_name = ( - oc.whoami("--show-server").split(":")[1].split("//")[1].replace("api", "apps") - ) - - command = command.replace("server-name", server_name) + try: + config.load_kube_config() + api_client = client.CustomObjectsApi() + ingress = api_client.get_cluster_custom_object( + "config.openshift.io", "v1", "ingresses", "cluster" + ) + except Exception as e: # pragma: no cover + return _kube_api_error_handling(e) + domain = ingress["spec"]["domain"] + command = command.replace("server-name", domain) item["generictemplate"]["spec"]["headGroupSpec"]["template"]["spec"][ "initContainers" @@ -338,131 +343,3 @@ def generate_appwrapper( outfile = appwrapper_name + ".yaml" write_user_appwrapper(user_yaml, outfile) return outfile - - -def main(): # pragma: no cover - parser = argparse.ArgumentParser(description="Generate user AppWrapper") - parser.add_argument( - "--name", - required=False, - default="", - help="User selected name for AppWrapper and Ray Cluster (auto-generated if not provided)", - ) - parser.add_argument( - "--min-cpu", - type=int, - required=True, - help="min number of CPU(s) in a worker required for running job", - ) - parser.add_argument( - "--max-cpu", - type=int, - required=True, - help="max number of CPU(s) in a worker required for running job", - ) - parser.add_argument( - "--min-memory", - type=int, - required=True, - help="min RAM required in a worker for running job, in GB", - ) - parser.add_argument( - "--max-memory", - type=int, - required=True, - help="max RAM required in a worker for running job, in GB", - ) - parser.add_argument( - "--gpu", - type=int, - required=True, - help="GPU(s) required in a worker for running job", - ) - parser.add_argument( - "--workers", - type=int, - required=True, - help="How many workers are required in the cluster", - ) - parser.add_argument( - "--template", required=True, help="Template AppWrapper yaml file" - ) - parser.add_argument( - "--image", - required=False, - default="rayproject/ray:latest", - help="Ray image to be used (defaults to rayproject/ray:latest)", - ) - parser.add_argument( - "--instascale", - default=False, - required=False, - action="store_true", - help="Indicates that instascale is installed on the cluster", - ) - parser.add_argument( - "--instance-types", - type=str, - nargs="+", - default=[], - required=False, - help="Head,worker instance types (space separated)", - ) - parser.add_argument( - "--namespace", - required=False, - default="default", - help="Set the kubernetes namespace you want to deploy your cluster to. Default. If left blank, uses the 'default' namespace", - ) - parser.add_argument( - "--local-interactive", - required=False, - default=False, - help="Enable local interactive mode", - ) - parser.add_argument( - "--image-pull-secrets", - required=False, - default=[], - help="Set image pull secrets for private registries", - ) - - args = parser.parse_args() - name = args.name - min_cpu = args.min_cpu - max_cpu = args.max_cpu - min_memory = args.min_memory - max_memory = args.max_memory - gpu = args.gpu - workers = args.workers - template = args.template - image = args.image - instascale = args.instascale - instance_types = args.instance_types - namespace = args.namespace - local_interactive = args.local_interactive - env = {} - image_pull_secrets = args.image_pull_secrets - - outfile = generate_appwrapper( - name, - namespace, - min_cpu, - max_cpu, - min_memory, - max_memory, - gpu, - workers, - template, - image, - instascale, - instance_types, - local_interactive, - env, - image_pull_secrets, - ) - return outfile - - -if __name__ == "__main__": # pragma: no cover - main() diff --git a/src/codeflare_sdk/utils/kube_api_helpers.py b/src/codeflare_sdk/utils/kube_api_helpers.py new file mode 100644 index 000000000..58358a053 --- /dev/null +++ b/src/codeflare_sdk/utils/kube_api_helpers.py @@ -0,0 +1,44 @@ +# Copyright 2022 IBM, Red Hat +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This sub-module exists primarily to be used internally for any Kubernetes +API error handling or wrapping. +""" + +import executing +from kubernetes import client, config + + +# private methods +def _kube_api_error_handling(e: Exception): # pragma: no cover + perm_msg = ( + "Action not permitted, have you put in correct/up-to-date auth credentials?" + ) + nf_msg = "No instances found, nothing to be done." + exists_msg = "Resource with this name already exists." + if type(e) == config.ConfigException: + raise PermissionError(perm_msg) + if type(e) == executing.executing.NotOneValueFound: + print(nf_msg) + return + if type(e) == client.ApiException: + if e.reason == "Not Found": + print(nf_msg) + return + elif e.reason == "Unauthorized" or e.reason == "Forbidden": + raise PermissionError(perm_msg) + elif e.reason == "Conflict": + raise FileExistsError(exists_msg) + raise e diff --git a/tests/test-case-cmd.yaml b/tests/test-case-cmd.yaml deleted file mode 100644 index ea235ec9a..000000000 --- a/tests/test-case-cmd.yaml +++ /dev/null @@ -1,173 +0,0 @@ -apiVersion: mcad.ibm.com/v1beta1 -kind: AppWrapper -metadata: - name: unit-cmd-cluster - namespace: default -spec: - priority: 9 - resources: - GenericItems: - - custompodresources: - - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - replicas: 1 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - replicas: 2 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - generictemplate: - apiVersion: ray.io/v1alpha1 - kind: RayCluster - metadata: - labels: - appwrapper.mcad.ibm.com: unit-cmd-cluster - controller-tools.k8s.io: '1.0' - name: unit-cmd-cluster - namespace: default - spec: - autoscalerOptions: - idleTimeoutSeconds: 60 - imagePullPolicy: Always - resources: - limits: - cpu: 500m - memory: 512Mi - requests: - cpu: 500m - memory: 512Mi - upscalingMode: Default - enableInTreeAutoscaling: false - headGroupSpec: - rayStartParams: - block: 'true' - dashboard-host: 0.0.0.0 - num-gpus: '0' - serviceType: ClusterIP - template: - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: rayproject/ray:latest - imagePullPolicy: Always - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: ray-head - ports: - - containerPort: 6379 - name: gcs - - containerPort: 8265 - name: dashboard - - containerPort: 10001 - name: client - resources: - limits: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - requests: - cpu: 2 - memory: 8G - nvidia.com/gpu: 0 - imagePullSecrets: [] - rayVersion: 2.1.0 - workerGroupSpecs: - - groupName: small-group-unit-cmd-cluster - maxReplicas: 2 - minReplicas: 2 - rayStartParams: - block: 'true' - num-gpus: '1' - replicas: 2 - template: - metadata: - annotations: - key: value - labels: - key: value - spec: - containers: - - env: - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: RAY_USE_TLS - value: '0' - - name: RAY_TLS_SERVER_CERT - value: /home/ray/workspace/tls/server.crt - - name: RAY_TLS_SERVER_KEY - value: /home/ray/workspace/tls/server.key - - name: RAY_TLS_CA_CERT - value: /home/ray/workspace/tls/ca.crt - image: rayproject/ray:latest - lifecycle: - preStop: - exec: - command: - - /bin/sh - - -c - - ray stop - name: machine-learning - resources: - limits: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - requests: - cpu: 1 - memory: 2G - nvidia.com/gpu: 1 - imagePullSecrets: [] - initContainers: - - command: - - sh - - -c - - until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; - do echo waiting for myservice; sleep 2; done - image: busybox:1.28 - name: init-myservice - replicas: 1 - - generictemplate: - apiVersion: route.openshift.io/v1 - kind: Route - metadata: - labels: - odh-ray-cluster-service: unit-cmd-cluster-head-svc - name: ray-dashboard-unit-cmd-cluster - namespace: default - spec: - port: - targetPort: dashboard - to: - kind: Service - name: unit-cmd-cluster-head-svc - replica: 1 - Items: [] diff --git a/tests/unit_test.py b/tests/unit_test.py index d1ea3e75e..21c1adf24 100644 --- a/tests/unit_test.py +++ b/tests/unit_test.py @@ -21,7 +21,7 @@ parent = Path(__file__).resolve().parents[1] sys.path.append(str(parent) + "/src") -from kubernetes import client +from kubernetes import client, config from codeflare_sdk.cluster.awload import AWManager from codeflare_sdk.cluster.cluster import ( Cluster, @@ -29,11 +29,14 @@ list_all_clusters, list_all_queued, _copy_to_ray, + get_cluster, + _app_wrapper_status, + _ray_cluster_status, ) from codeflare_sdk.cluster.auth import ( TokenAuthentication, - PasswordUserAuthentication, Authentication, + KubeConfigFileAuthentication, ) from codeflare_sdk.utils.pretty_print import ( print_no_resources_found, @@ -62,7 +65,6 @@ ) import openshift -from openshift import OpenShiftPythonException from openshift.selector import Selector import ray from torchx.specs import AppDryRunInfo, AppDef @@ -70,6 +72,7 @@ from torchx.schedulers.ray_scheduler import RayJob from torchx.schedulers.kubernetes_mcad_scheduler import KubernetesMCADJob import pytest +import yaml # For mocking openshift client results @@ -85,120 +88,79 @@ def att_side_effect(self): return self.high_level_operation -def att_side_effect_tls(self): - if "--insecure-skip-tls-verify" in self.high_level_operation[1]: - return self.high_level_operation - else: - raise OpenShiftPythonException( - "The server uses a certificate signed by unknown authority" - ) - - def test_token_auth_creation(): try: - token_auth = TokenAuthentication() - assert token_auth.token == None - assert token_auth.server == None - assert token_auth.skip_tls == False - - token_auth = TokenAuthentication("token") - assert token_auth.token == "token" - assert token_auth.server == None - assert token_auth.skip_tls == False - - token_auth = TokenAuthentication("token", "server") + token_auth = TokenAuthentication(token="token", server="server") assert token_auth.token == "token" assert token_auth.server == "server" assert token_auth.skip_tls == False + assert token_auth.ca_cert_path == None - token_auth = TokenAuthentication("token", server="server") + token_auth = TokenAuthentication(token="token", server="server", skip_tls=True) assert token_auth.token == "token" assert token_auth.server == "server" - assert token_auth.skip_tls == False + assert token_auth.skip_tls == True + assert token_auth.ca_cert_path == None - token_auth = TokenAuthentication(token="token", server="server") + token_auth = TokenAuthentication(token="token", server="server", skip_tls=False) assert token_auth.token == "token" assert token_auth.server == "server" assert token_auth.skip_tls == False + assert token_auth.ca_cert_path == None - token_auth = TokenAuthentication(token="token", server="server", skip_tls=True) + token_auth = TokenAuthentication( + token="token", server="server", skip_tls=False, ca_cert_path="path/to/cert" + ) assert token_auth.token == "token" assert token_auth.server == "server" - assert token_auth.skip_tls == True + assert token_auth.skip_tls == False + assert token_auth.ca_cert_path == "path/to/cert" except Exception: assert 0 == 1 def test_token_auth_login_logout(mocker): - mocker.patch("openshift.invoke", side_effect=arg_side_effect) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: att_side_effect(fake_res) + mocker.patch.object(client, "ApiClient") - token_auth = TokenAuthentication(token="testtoken", server="testserver:6443") - assert token_auth.login() == ( - "login", - ["--token=testtoken", "--server=testserver:6443"], - ) - assert token_auth.logout() == ( - "logout", - ["--token=testtoken", "--server=testserver:6443"], + token_auth = TokenAuthentication( + token="testtoken", server="testserver:6443", skip_tls=False, ca_cert_path=None ) + assert token_auth.login() == ("Logged into testserver:6443") + assert token_auth.logout() == ("Successfully logged out of testserver:6443") def test_token_auth_login_tls(mocker): - mocker.patch("openshift.invoke", side_effect=arg_side_effect) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: att_side_effect_tls(fake_res) - - # FIXME - Pytest mocker not allowing caught exception - # token_auth = TokenAuthentication(token="testtoken", server="testserver") - # assert token_auth.login() == "Error: certificate auth failure, please set `skip_tls=True` in TokenAuthentication" + mocker.patch.object(client, "ApiClient") token_auth = TokenAuthentication( - token="testtoken", server="testserver:6443", skip_tls=True + token="testtoken", server="testserver:6443", skip_tls=True, ca_cert_path=None ) - assert token_auth.login() == ( - "login", - ["--token=testtoken", "--server=testserver:6443", "--insecure-skip-tls-verify"], + assert token_auth.login() == ("Logged into testserver:6443") + token_auth = TokenAuthentication( + token="testtoken", server="testserver:6443", skip_tls=False, ca_cert_path=None ) + assert token_auth.login() == ("Logged into testserver:6443") + token_auth = TokenAuthentication( + token="testtoken", + server="testserver:6443", + skip_tls=False, + ca_cert_path="path/to/cert", + ) + assert token_auth.login() == ("Logged into testserver:6443") -def test_passwd_auth_creation(): - try: - passwd_auth = PasswordUserAuthentication() - assert passwd_auth.username == None - assert passwd_auth.password == None - - passwd_auth = PasswordUserAuthentication("user") - assert passwd_auth.username == "user" - assert passwd_auth.password == None - - passwd_auth = PasswordUserAuthentication("user", "passwd") - assert passwd_auth.username == "user" - assert passwd_auth.password == "passwd" - - passwd_auth = PasswordUserAuthentication("user", password="passwd") - assert passwd_auth.username == "user" - assert passwd_auth.password == "passwd" - - passwd_auth = PasswordUserAuthentication(username="user", password="passwd") - assert passwd_auth.username == "user" - assert passwd_auth.password == "passwd" - - except Exception: - assert 0 == 1 - - -def test_passwd_auth_login_logout(mocker): - mocker.patch("openshift.invoke", side_effect=arg_side_effect) - mocker.patch("openshift.login", side_effect=arg_side_effect) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: att_side_effect(fake_res) +def test_load_kube_config(mocker): + mocker.patch.object(config, "load_kube_config") + kube_config_auth = KubeConfigFileAuthentication( + kube_config_path="/path/to/your/config" + ) + response = kube_config_auth.load_kube_config() - token_auth = PasswordUserAuthentication(username="user", password="passwd") - assert token_auth.login() == ("user", "passwd") - assert token_auth.logout() == ("logout",) + assert ( + response + == "Loaded user config file at path %s" % kube_config_auth.kube_config_path + ) def test_auth_coverage(): @@ -248,7 +210,7 @@ def test_cluster_creation(): def test_default_cluster_creation(mocker): mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.cluster.cluster.get_current_namespace", return_value="opendatahub", ) default_config = ClusterConfiguration( @@ -263,38 +225,103 @@ def test_default_cluster_creation(mocker): return cluster -def arg_check_apply_effect(*args): - assert args[0] == "apply" - assert args[1] == ["-f", "unit-test-cluster.yaml"] +def arg_check_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("unit-test-cluster.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_del_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "unit-test-cluster"] +def arg_check_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "unit-test-cluster" + assert args == tuple() def test_cluster_up_down(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.login", return_value="ignore" + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_apply_effect, ) mocker.patch( - "codeflare_sdk.cluster.auth.TokenAuthentication.logout", return_value="ignore" + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_del_effect, ) - mocker.patch("openshift.invoke", side_effect=arg_check_apply_effect) cluster = test_cluster_creation() cluster.up() - mocker.patch("openshift.invoke", side_effect=arg_check_del_effect) cluster.down() -def out_route(self): - return "ray-dashboard-raycluster-autoscaler-ns.apps.cluster.awsroute.org ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" +def aw_status_fields(group, version, namespace, plural, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "test-ns" + assert plural == "appwrappers" + assert args == tuple() + return {"items": []} + + +def test_aw_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=aw_status_fields, + ) + aw = _app_wrapper_status("test-aw", "test-ns") + assert aw == None + + +def rc_status_fields(group, version, namespace, plural, *args): + assert group == "ray.io" + assert version == "v1alpha1" + assert namespace == "test-ns" + assert plural == "rayclusters" + assert args == tuple() + return {"items": []} + + +def test_rc_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=rc_status_fields, + ) + rc = _ray_cluster_status("test-rc", "test-ns") + assert rc == None + + +def uri_retreival(group, version, namespace, plural, *args): + assert group == "route.openshift.io" + assert version == "v1" + assert namespace == "ns" + assert plural == "routes" + assert args == tuple() + return { + "items": [ + { + "metadata": {"name": "ray-dashboard-unit-test-cluster"}, + "spec": { + "host": "ray-dashboard-unit-test-cluster-ns.apps.cluster.awsroute.org" + }, + } + ] + } def test_cluster_uris(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() assert cluster.cluster_uri() == "ray://unit-test-cluster-head-svc.ns.svc:10001" @@ -309,14 +336,40 @@ def test_cluster_uris(mocker): ) +def test_local_client_url(mocker): + mocker.patch( + "kubernetes.client.CustomObjectsApi.get_cluster_custom_object", + return_value={"spec": {"domain": ""}}, + ) + mocker.patch( + "codeflare_sdk.cluster.cluster._get_ingress_domain", + return_value="apps.cluster.awsroute.org", + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.create_app_wrapper", + return_value="unit-test-cluster-localinter.yaml", + ) + + cluster_config = ClusterConfiguration( + name="unit-test-cluster-localinter", namespace="ns", local_interactive=True + ) + cluster = Cluster(cluster_config) + assert ( + cluster.local_client_url() + == "ray://rayclient-unit-test-cluster-localinter-ns.apps.cluster.awsroute.org" + ) + + def ray_addr(self, *args): return self._address def test_ray_job_wrapping(mocker): - mocker.patch("openshift.invoke", return_value=fake_res) - mock_res = mocker.patch.object(openshift.Result, "out") - mock_res.side_effect = lambda: out_route(fake_res) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=uri_retreival, + ) cluster = test_cluster_creation() mocker.patch( @@ -402,7 +455,7 @@ def test_print_appwrappers(capsys): ) -def test_ray_details(capsys): +def test_ray_details(mocker, capsys): ray1 = RayCluster( name="raytest1", status=RayClusterStatus.READY, @@ -415,6 +468,14 @@ def test_ray_details(capsys): namespace="ns", dashboard="fake-uri", ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.status", + return_value=(False, CodeFlareClusterStatus.UNKNOWN), + ) + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cf = Cluster(ClusterConfiguration(name="raytest2", namespace="ns")) captured = capsys.readouterr() ray2 = _copy_to_ray(cf) @@ -519,223 +580,151 @@ def act_side_effect_list(self): return [self] -def get_selector(*args): - selector = Selector({"operation": "selector", "status": 0, "actions": []}) - return selector - - -def get_obj_none(): - return [] - - -def get_ray_obj(cls=None): - api_obj = openshift.apiobject.APIObject( - { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 1, - "labels": { - "appwrapper.mcad.ibm.com": "quicktest", - "controller-tools.k8s.io": "1.0", - "resourceName": "quicktest", - }, - "managedFields": [ - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:labels": { - ".": {}, - "f:appwrapper.mcad.ibm.com": {}, - "f:controller-tools.k8s.io": {}, - "f:resourceName": {}, - }, - "f:ownerReferences": { - ".": {}, - 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, - }, - }, - "f:spec": { - ".": {}, - "f:autoscalerOptions": { - ".": {}, - "f:idleTimeoutSeconds": {}, - "f:imagePullPolicy": {}, - "f:resources": { +def get_obj_none(group, version, namespace, plural): + return {"items": []} + + +def get_ray_obj(group, version, namespace, plural, cls=None): + api_obj = { + "items": [ + { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 1, + "labels": { + "appwrapper.mcad.ibm.com": "quicktest", + "controller-tools.k8s.io": "1.0", + "resourceName": "quicktest", + "orderedinstance": "m4.xlarge_g4dn.xlarge", + }, + "managedFields": [ + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:labels": { ".": {}, - "f:limits": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, - "f:requests": { - ".": {}, - "f:cpu": {}, - "f:memory": {}, - }, + "f:appwrapper.mcad.ibm.com": {}, + "f:controller-tools.k8s.io": {}, + "f:resourceName": {}, + }, + "f:ownerReferences": { + ".": {}, + 'k:{"uid":"6334fc1b-471e-4876-8e7b-0b2277679235"}': {}, }, - "f:upscalingMode": {}, }, - "f:enableInTreeAutoscaling": {}, - "f:headGroupSpec": { + "f:spec": { ".": {}, - "f:rayStartParams": { + "f:autoscalerOptions": { ".": {}, - "f:block": {}, - "f:dashboard-host": {}, - "f:num-gpus": {}, + "f:idleTimeoutSeconds": {}, + "f:imagePullPolicy": {}, + "f:resources": { + ".": {}, + "f:limits": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + "f:requests": { + ".": {}, + "f:cpu": {}, + "f:memory": {}, + }, + }, + "f:upscalingMode": {}, }, - "f:serviceType": {}, - "f:template": { + "f:enableInTreeAutoscaling": {}, + "f:headGroupSpec": { ".": {}, - "f:spec": {".": {}, "f:containers": {}}, + "f:rayStartParams": { + ".": {}, + "f:block": {}, + "f:dashboard-host": {}, + "f:num-gpus": {}, + }, + "f:serviceType": {}, + "f:template": { + ".": {}, + "f:spec": {".": {}, "f:containers": {}}, + }, }, + "f:rayVersion": {}, + "f:workerGroupSpecs": {}, }, - "f:rayVersion": {}, - "f:workerGroupSpecs": {}, }, + "manager": "mcad-controller", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "mcad-controller", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - { - "apiVersion": "ray.io/v1alpha1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:status": { - ".": {}, - "f:availableWorkerReplicas": {}, - "f:desiredWorkerReplicas": {}, - "f:endpoints": { + { + "apiVersion": "ray.io/v1alpha1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:status": { ".": {}, - "f:client": {}, - "f:dashboard": {}, - "f:gcs": {}, - }, - "f:lastUpdateTime": {}, - "f:maxWorkerReplicas": {}, - "f:minWorkerReplicas": {}, - "f:state": {}, - } - }, - "manager": "manager", - "operation": "Update", - "subresource": "status", - "time": "2023-02-22T16:26:16Z", - }, - ], - "name": "quicktest", - "namespace": "ns", - "ownerReferences": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "blockOwnerDeletion": True, - "controller": True, - "kind": "AppWrapper", - "name": "quicktest", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - } - ], - "resourceVersion": "9482407", - "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", - }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": {"cpu": "500m", "memory": "512Mi"}, - "requests": {"cpu": "500m", "memory": "512Mi"}, - }, - "upscalingMode": "Default", - }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", - }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": ["/bin/sh", "-c", "ray stop"] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - "protocol": "TCP", - }, - { - "containerPort": 8265, - "name": "dashboard", - "protocol": "TCP", - }, - { - "containerPort": 10001, - "name": "client", - "protocol": "TCP", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, + "f:availableWorkerReplicas": {}, + "f:desiredWorkerReplicas": {}, + "f:endpoints": { + ".": {}, + "f:client": {}, + "f:dashboard": {}, + "f:gcs": {}, }, + "f:lastUpdateTime": {}, + "f:maxWorkerReplicas": {}, + "f:minWorkerReplicas": {}, + "f:state": {}, } - ] + }, + "manager": "manager", + "operation": "Update", + "subresource": "status", + "time": "2023-02-22T16:26:16Z", + }, + ], + "name": "quicktest", + "namespace": "ns", + "ownerReferences": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "blockOwnerDeletion": True, + "controller": True, + "kind": "AppWrapper", + "name": "quicktest", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", } - }, + ], + "resourceVersion": "9482407", + "uid": "44d45d1f-26c8-43e7-841f-831dbd8c1285", }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, - "rayStartParams": {"block": "true", "num-gpus": "0"}, - "replicas": 1, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": {"cpu": "500m", "memory": "512Mi"}, + "requests": {"cpu": "500m", "memory": "512Mi"}, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { + "rayStartParams": { + "block": "true", + "dashboard-host": "0.0.0.0", + "num-gpus": "0", + }, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -747,262 +736,271 @@ def get_ray_obj(cls=None): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + "protocol": "TCP", + }, + { + "containerPort": 8265, + "name": "dashboard", + "protocol": "TCP", + }, + { + "containerPort": 10001, + "name": "client", + "protocol": "TCP", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, - }, - } - ], - }, - "status": { - "availableWorkerReplicas": 2, - "desiredWorkerReplicas": 1, - "endpoints": {"client": "10001", "dashboard": "8265", "gcs": "6379"}, - "lastUpdateTime": "2023-02-22T16:26:16Z", - "maxWorkerReplicas": 1, - "minWorkerReplicas": 1, - "state": "ready", - }, - } - ) - return [api_obj] - - -def get_aw_obj(): - api_obj1 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, + ] + } }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { - ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": {"block": "true", "num-gpus": "0"}, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, }, - }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + } + ], + }, + "status": { + "availableWorkerReplicas": 2, + "desiredWorkerReplicas": 1, + "endpoints": { + "client": "10001", + "dashboard": "8265", + "gcs": "6379", }, - ], - "name": "quicktest1", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ + "lastUpdateTime": "2023-02-22T16:26:16Z", + "maxWorkerReplicas": 1, + "minWorkerReplicas": 1, + "state": "ready", + }, + } + ] + } + return api_obj + + +def get_aw_obj(group, version, namespace, plural): + api_obj1 = { + "items": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest1","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest1","controller-tools.k8s.io":"1.0"},"name":"quicktest1","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' + }, + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "f:status": { + ".": {}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest1", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest1", - "namespace": "ns", + }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, + }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest1", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest1", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest1", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1014,317 +1012,318 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Running", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Running", - "systempriority": 9, }, - } - ) - api_obj2 = openshift.apiobject.APIObject( - { - "apiVersion": "mcad.ibm.com/v1beta1", - "kind": "AppWrapper", - "metadata": { - "annotations": { - "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"quay.io/project-codeflare/ray:2.5.0-py38-cu116","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' - }, - "creationTimestamp": "2023-02-22T16:26:07Z", - "generation": 4, - "managedFields": [ - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:spec": { - "f:resources": {"f:GenericItems": {}, "f:metadata": {}}, - "f:schedulingSpec": {}, - "f:service": {".": {}, "f:spec": {}}, - }, - "f:status": { - ".": {}, - "f:canrun": {}, - "f:conditions": {}, - "f:controllerfirsttimestamp": {}, - "f:filterignore": {}, - "f:queuejobstate": {}, - "f:sender": {}, - "f:state": {}, - "f:systempriority": {}, - }, - }, - "manager": "Go-http-client", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", + { + "apiVersion": "mcad.ibm.com/v1beta1", + "kind": "AppWrapper", + "metadata": { + "annotations": { + "kubectl.kubernetes.io/last-applied-configuration": '{"apiVersion":"mcad.ibm.com/v1beta1","kind":"AppWrapper","metadata":{"annotations":{},"name":"quicktest2","namespace":"ns"},"spec":{"priority":9,"resources":{"GenericItems":[{"custompodresources":[{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}},{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"replicas":1,"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}],"generictemplate":{"apiVersion":"ray.io/v1alpha1","kind":"RayCluster","metadata":{"labels":{"appwrapper.mcad.ibm.com":"quicktest2","controller-tools.k8s.io":"1.0"},"name":"quicktest2","namespace":"ns"},"spec":{"autoscalerOptions":{"idleTimeoutSeconds":60,"imagePullPolicy":"Always","resources":{"limits":{"cpu":"500m","memory":"512Mi"},"requests":{"cpu":"500m","memory":"512Mi"}},"upscalingMode":"Default"},"enableInTreeAutoscaling":false,"headGroupSpec":{"rayStartParams":{"block":"true","dashboard-host":"0.0.0.0","num-gpus":"0"},"serviceType":"ClusterIP","template":{"spec":{"containers":[{"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","imagePullPolicy":"Always","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"ray-head","ports":[{"containerPort":6379,"name":"gcs"},{"containerPort":8265,"name":"dashboard"},{"containerPort":10001,"name":"client"}],"resources":{"limits":{"cpu":2,"memory":"8G","nvidia.com/gpu":0},"requests":{"cpu":2,"memory":"8G","nvidia.com/gpu":0}}}]}}},"rayVersion":"1.12.0","workerGroupSpecs":[{"groupName":"small-group-quicktest","maxReplicas":1,"minReplicas":1,"rayStartParams":{"block":"true","num-gpus":"0"},"replicas":1,"template":{"metadata":{"annotations":{"key":"value"},"labels":{"key":"value"}},"spec":{"containers":[{"env":[{"name":"MY_POD_IP","valueFrom":{"fieldRef":{"fieldPath":"status.podIP"}}}],"image":"ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103","lifecycle":{"preStop":{"exec":{"command":["/bin/sh","-c","ray stop"]}}},"name":"machine-learning","resources":{"limits":{"cpu":1,"memory":"2G","nvidia.com/gpu":0},"requests":{"cpu":1,"memory":"2G","nvidia.com/gpu":0}}}],"initContainers":[{"command":["sh","-c","until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done"],"image":"busybox:1.28","name":"init-myservice"}]}}}]}},"replicas":1},{"generictemplate":{"apiVersion":"route.openshift.io/v1","kind":"Route","metadata":{"labels":{"odh-ray-cluster-service":"quicktest-head-svc"},"name":"ray-dashboard-quicktest","namespace":"default"},"spec":{"port":{"targetPort":"dashboard"},"to":{"kind":"Service","name":"quicktest-head-svc"}}},"replica":1}],"Items":[]}}}\n' }, - { - "apiVersion": "mcad.ibm.com/v1beta1", - "fieldsType": "FieldsV1", - "fieldsV1": { - "f:metadata": { - "f:annotations": { + "creationTimestamp": "2023-02-22T16:26:07Z", + "generation": 4, + "managedFields": [ + { + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:spec": { + "f:resources": { + "f:GenericItems": {}, + "f:metadata": {}, + }, + "f:schedulingSpec": {}, + "f:service": {".": {}, "f:spec": {}}, + }, + "f:status": { ".": {}, - "f:kubectl.kubernetes.io/last-applied-configuration": {}, - } - }, - "f:spec": { - ".": {}, - "f:priority": {}, - "f:resources": {".": {}, "f:Items": {}}, + "f:canrun": {}, + "f:conditions": {}, + "f:controllerfirsttimestamp": {}, + "f:filterignore": {}, + "f:queuejobstate": {}, + "f:sender": {}, + "f:state": {}, + "f:systempriority": {}, + }, }, + "manager": "Go-http-client", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", }, - "manager": "kubectl-client-side-apply", - "operation": "Update", - "time": "2023-02-22T16:26:07Z", - }, - ], - "name": "quicktest2", - "namespace": "ns", - "resourceVersion": "9482384", - "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", - }, - "spec": { - "priority": 9, - "resources": { - "GenericItems": [ { - "allocated": 0, - "custompodresources": [ - { - "limits": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "2", - "memory": "8G", - "nvidia.com/gpu": "0", - }, - }, - { - "limits": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, - "replicas": 1, - "requests": { - "cpu": "1", - "memory": "2G", - "nvidia.com/gpu": "0", - }, + "apiVersion": "mcad.ibm.com/v1beta1", + "fieldsType": "FieldsV1", + "fieldsV1": { + "f:metadata": { + "f:annotations": { + ".": {}, + "f:kubectl.kubernetes.io/last-applied-configuration": {}, + } }, - ], - "generictemplate": { - "apiVersion": "ray.io/v1alpha1", - "kind": "RayCluster", - "metadata": { - "labels": { - "appwrapper.mcad.ibm.com": "quicktest2", - "controller-tools.k8s.io": "1.0", - }, - "name": "quicktest2", - "namespace": "ns", + "f:spec": { + ".": {}, + "f:priority": {}, + "f:resources": {".": {}, "f:Items": {}}, }, - "spec": { - "autoscalerOptions": { - "idleTimeoutSeconds": 60, - "imagePullPolicy": "Always", - "resources": { - "limits": { - "cpu": "500m", - "memory": "512Mi", - }, - "requests": { - "cpu": "500m", - "memory": "512Mi", - }, + }, + "manager": "kubectl-client-side-apply", + "operation": "Update", + "time": "2023-02-22T16:26:07Z", + }, + ], + "name": "quicktest2", + "namespace": "ns", + "resourceVersion": "9482384", + "uid": "6334fc1b-471e-4876-8e7b-0b2277679235", + }, + "spec": { + "priority": 9, + "resources": { + "GenericItems": [ + { + "allocated": 0, + "custompodresources": [ + { + "limits": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", + }, + "replicas": 1, + "requests": { + "cpu": "2", + "memory": "8G", + "nvidia.com/gpu": "0", }, - "upscalingMode": "Default", }, - "enableInTreeAutoscaling": False, - "headGroupSpec": { - "rayStartParams": { - "block": "true", - "dashboard-host": "0.0.0.0", - "num-gpus": "0", + { + "limits": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, - "serviceType": "ClusterIP", - "template": { - "spec": { - "containers": [ - { - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", - "imagePullPolicy": "Always", - "lifecycle": { - "preStop": { - "exec": { - "command": [ - "/bin/sh", - "-c", - "ray stop", - ] - } - } - }, - "name": "ray-head", - "ports": [ - { - "containerPort": 6379, - "name": "gcs", - }, - { - "containerPort": 8265, - "name": "dashboard", - }, - { - "containerPort": 10001, - "name": "client", - }, - ], - "resources": { - "limits": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - "requests": { - "cpu": 2, - "memory": "8G", - "nvidia.com/gpu": 0, - }, - }, - } - ] - } + "replicas": 1, + "requests": { + "cpu": "1", + "memory": "2G", + "nvidia.com/gpu": "0", }, }, - "rayVersion": "1.12.0", - "workerGroupSpecs": [ - { - "groupName": "small-group-quicktest", - "maxReplicas": 1, - "minReplicas": 1, + ], + "generictemplate": { + "apiVersion": "ray.io/v1alpha1", + "kind": "RayCluster", + "metadata": { + "labels": { + "appwrapper.mcad.ibm.com": "quicktest2", + "controller-tools.k8s.io": "1.0", + }, + "name": "quicktest2", + "namespace": "ns", + }, + "spec": { + "autoscalerOptions": { + "idleTimeoutSeconds": 60, + "imagePullPolicy": "Always", + "resources": { + "limits": { + "cpu": "500m", + "memory": "512Mi", + }, + "requests": { + "cpu": "500m", + "memory": "512Mi", + }, + }, + "upscalingMode": "Default", + }, + "enableInTreeAutoscaling": False, + "headGroupSpec": { "rayStartParams": { "block": "true", + "dashboard-host": "0.0.0.0", "num-gpus": "0", }, - "replicas": 1, + "serviceType": "ClusterIP", "template": { - "metadata": { - "annotations": {"key": "value"}, - "labels": {"key": "value"}, - }, "spec": { "containers": [ { - "env": [ - { - "name": "MY_POD_IP", - "valueFrom": { - "fieldRef": { - "fieldPath": "status.podIP" - } - }, - } - ], - "image": "quay.io/project-codeflare/ray:2.5.0-py38-cu116", + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "imagePullPolicy": "Always", "lifecycle": { "preStop": { "exec": { @@ -1336,114 +1335,214 @@ def get_aw_obj(): } } }, - "name": "machine-learning", + "name": "ray-head", + "ports": [ + { + "containerPort": 6379, + "name": "gcs", + }, + { + "containerPort": 8265, + "name": "dashboard", + }, + { + "containerPort": 10001, + "name": "client", + }, + ], "resources": { "limits": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, "requests": { - "cpu": 1, - "memory": "2G", + "cpu": 2, + "memory": "8G", "nvidia.com/gpu": 0, }, }, } - ], - "initContainers": [ - { - "command": [ - "sh", - "-c", - "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", - ], - "image": "busybox:1.28", - "name": "init-myservice", - } - ], - }, + ] + } }, - } - ], + }, + "rayVersion": "1.12.0", + "workerGroupSpecs": [ + { + "groupName": "small-group-quicktest", + "maxReplicas": 1, + "minReplicas": 1, + "rayStartParams": { + "block": "true", + "num-gpus": "0", + }, + "replicas": 1, + "template": { + "metadata": { + "annotations": {"key": "value"}, + "labels": {"key": "value"}, + }, + "spec": { + "containers": [ + { + "env": [ + { + "name": "MY_POD_IP", + "valueFrom": { + "fieldRef": { + "fieldPath": "status.podIP" + } + }, + } + ], + "image": "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103", + "lifecycle": { + "preStop": { + "exec": { + "command": [ + "/bin/sh", + "-c", + "ray stop", + ] + } + } + }, + "name": "machine-learning", + "resources": { + "limits": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + "requests": { + "cpu": 1, + "memory": "2G", + "nvidia.com/gpu": 0, + }, + }, + } + ], + "initContainers": [ + { + "command": [ + "sh", + "-c", + "until nslookup $RAY_IP.$(cat /var/run/secrets/kubernetes.io/serviceaccount/namespace).svc.cluster.local; do echo waiting for myservice; sleep 2; done", + ], + "image": "busybox:1.28", + "name": "init-myservice", + } + ], + }, + }, + } + ], + }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, + "replicas": 1, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, - "replicas": 1, - }, - { - "allocated": 0, - "generictemplate": { - "apiVersion": "route.openshift.io/v1", - "kind": "Route", - "metadata": { - "labels": { - "odh-ray-cluster-service": "quicktest-head-svc" + { + "allocated": 0, + "generictemplate": { + "apiVersion": "route.openshift.io/v1", + "kind": "Route", + "metadata": { + "labels": { + "odh-ray-cluster-service": "quicktest-head-svc" + }, + "name": "ray-dashboard-quicktest", + "namespace": "default", }, - "name": "ray-dashboard-quicktest", - "namespace": "default", - }, - "spec": { - "port": {"targetPort": "dashboard"}, - "to": { - "kind": "Service", - "name": "quicktest-head-svc", + "spec": { + "port": {"targetPort": "dashboard"}, + "to": { + "kind": "Service", + "name": "quicktest-head-svc", + }, }, }, + "metadata": {}, + "priority": 0, + "priorityslope": 0, }, - "metadata": {}, - "priority": 0, - "priorityslope": 0, + ], + "Items": [], + "metadata": {}, + }, + "schedulingSpec": {}, + "service": {"spec": {}}, + }, + "status": { + "canrun": True, + "conditions": [ + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", + "status": "True", + "type": "Init", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", + "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", + "reason": "AwaitingHeadOfLine", + "status": "True", + "type": "Queueing", + }, + { + "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", + "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", + "reason": "AppWrapperRunnable", + "status": "True", + "type": "Dispatched", }, ], - "Items": [], - "metadata": {}, + "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", + "filterignore": True, + "queuejobstate": "Dispatched", + "sender": "before manageQueueJob - afterEtcdDispatching", + "state": "Pending", + "systempriority": 9, }, - "schedulingSpec": {}, - "service": {"spec": {}}, - }, - "status": { - "canrun": True, - "conditions": [ - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559447Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559447Z", - "status": "True", - "type": "Init", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:07.559551Z", - "lastUpdateMicroTime": "2023-02-22T16:26:07.559551Z", - "reason": "AwaitingHeadOfLine", - "status": "True", - "type": "Queueing", - }, - { - "lastTransitionMicroTime": "2023-02-22T16:26:13.220564Z", - "lastUpdateMicroTime": "2023-02-22T16:26:13.220564Z", - "reason": "AppWrapperRunnable", - "status": "True", - "type": "Dispatched", - }, - ], - "controllerfirsttimestamp": "2023-02-22T16:26:07.559447Z", - "filterignore": True, - "queuejobstate": "Dispatched", - "sender": "before manageQueueJob - afterEtcdDispatching", - "state": "Pending", - "systempriority": 9, }, - } + ] + } + return api_obj1 + + +def test_get_cluster(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, ) - return [api_obj1, api_obj2] + cluster = get_cluster("quicktest") + cluster_config = cluster.config + assert cluster_config.name == "quicktest" and cluster_config.namespace == "ns" + assert ( + "m4.xlarge" in cluster_config.machine_types + and "g4dn.xlarge" in cluster_config.machine_types + ) + assert cluster_config.min_cpus == 1 and cluster_config.max_cpus == 1 + assert cluster_config.min_memory == 2 and cluster_config.max_memory == 2 + assert cluster_config.gpu == 0 + assert cluster_config.instascale + assert ( + cluster_config.image + == "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103" + ) + assert cluster_config.min_worker == 1 and cluster_config.max_worker == 1 def test_list_clusters(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1451,7 +1550,10 @@ def test_list_clusters(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_ray_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_ray_obj, + ) list_all_clusters("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1477,9 +1579,11 @@ def test_list_clusters(mocker, capsys): def test_list_queue(mocker, capsys): - mocker.patch("openshift.selector", side_effect=get_selector) - mock_res = mocker.patch.object(Selector, "objects") - mock_res.side_effect = get_obj_none + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_obj_none, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1487,7 +1591,10 @@ def test_list_queue(mocker, capsys): "│ No resources found, have you run cluster.up() yet? │\n" "╰──────────────────────────────────────────────────────────────────────────────╯\n" ) - mock_res.side_effect = get_aw_obj + mocker.patch( + "kubernetes.client.CustomObjectsApi.list_namespaced_custom_object", + side_effect=get_aw_obj, + ) list_all_queued("ns") captured = capsys.readouterr() assert captured.out == ( @@ -1507,6 +1614,7 @@ def test_list_queue(mocker, capsys): def test_cluster_status(mocker): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") fake_aw = AppWrapper( "test", AppWrapperStatus.FAILED, can_run=True, job_state="unused" ) @@ -1523,6 +1631,8 @@ def test_cluster_status(mocker): dashboard="fake-uri", ) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) status, ready = cf.status() assert status == CodeFlareClusterStatus.UNKNOWN assert ready == False @@ -1584,6 +1694,9 @@ def test_cluster_status(mocker): def test_wait_ready(mocker, capsys): + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch("codeflare_sdk.cluster.cluster._app_wrapper_status", return_value=None) + mocker.patch("codeflare_sdk.cluster.cluster._ray_cluster_status", return_value=None) cf = Cluster(ClusterConfiguration(name="test", namespace="ns")) try: cf.wait_ready(timeout=5) @@ -1655,12 +1768,17 @@ def test_DDPJobDefinition_creation(): return ddp -def test_DDPJobDefinition_dry_run(): +def test_DDPJobDefinition_dry_run(mocker): """ Test that the dry run method returns the correct type: AppDryRunInfo, that the attributes of the returned object are of the correct type, and that the values from cluster and job definition are correctly passed. """ + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) ddp = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) ddp_job = ddp._dry_run(cluster) @@ -1693,7 +1811,7 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1725,11 +1843,15 @@ def test_DDPJobDefinition_dry_run_no_cluster(mocker): assert ddp_job._scheduler == "kubernetes_mcad" -def test_DDPJobDefinition_dry_run_no_resource_args(): +def test_DDPJobDefinition_dry_run_no_resource_args(mocker): """ Test that the dry run correctly gets resources from the cluster object when the job definition does not specify resources. """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="", + ) cluster = Cluster(test_config_creation()) ddp = DDPJobDefinition( script="test.py", @@ -1762,7 +1884,7 @@ def test_DDPJobDefinition_dry_run_no_cluster_no_resource_args(mocker): """ mocker.patch( - "openshift.get_project_name", + "codeflare_sdk.job.jobs.get_current_namespace", return_value="opendatahub", ) @@ -1814,11 +1936,15 @@ def test_DDPJobDefinition_submit(mocker): Tests that the submit method returns the correct type: DDPJob And that the attributes of the returned object are of the correct type """ + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1841,6 +1967,10 @@ def test_DDPJobDefinition_submit(mocker): def test_DDPJob_creation(mocker): + mocker.patch( + "codeflare_sdk.cluster.cluster.Cluster.cluster_dashboard_uri", + return_value="fake-dashboard-uri", + ) ddp_def = test_DDPJobDefinition_creation() cluster = Cluster(test_config_creation()) mocker.patch( @@ -1867,8 +1997,8 @@ def test_DDPJob_creation_no_cluster(mocker): ddp_def = test_DDPJobDefinition_creation() ddp_def.image = "fake-image" mocker.patch( - "openshift.get_project_name", - return_value="opendatahub", + "codeflare_sdk.job.jobs.get_current_namespace", + side_effect="opendatahub", ) mocker.patch( "codeflare_sdk.job.jobs.torchx_runner.schedule", @@ -1959,14 +2089,24 @@ def test_AWManager_creation(): ) -def arg_check_aw_create_effect(*args): - assert args[0] == "create" - assert args[1] == ["-f", "test.yaml"] +def arg_check_aw_apply_effect(group, version, namespace, plural, body, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + with open("test.yaml") as f: + aw = yaml.load(f, Loader=yaml.FullLoader) + assert body == aw + assert args == tuple() -def arg_check_aw_delete_effect(*args): - assert args[0] == "delete" - assert args[1] == ["AppWrapper", "test"] +def arg_check_aw_del_effect(group, version, namespace, plural, name, *args): + assert group == "mcad.ibm.com" + assert version == "v1beta1" + assert namespace == "ns" + assert plural == "appwrappers" + assert name == "test" + assert args == tuple() def test_AWManager_submit_remove(mocker, capsys): @@ -1978,10 +2118,17 @@ def test_AWManager_submit_remove(mocker, capsys): == "AppWrapper not submitted by this manager yet, nothing to remove\n" ) assert testaw.submitted == False - mocker.patch("openshift.invoke", side_effect=arg_check_aw_create_effect) + mocker.patch("kubernetes.config.load_kube_config", return_value="ignore") + mocker.patch( + "kubernetes.client.CustomObjectsApi.create_namespaced_custom_object", + side_effect=arg_check_aw_apply_effect, + ) + mocker.patch( + "kubernetes.client.CustomObjectsApi.delete_namespaced_custom_object", + side_effect=arg_check_aw_del_effect, + ) testaw.submit() assert testaw.submitted == True - mocker.patch("openshift.invoke", side_effect=arg_check_aw_delete_effect) testaw.remove() assert testaw.submitted == False @@ -2068,20 +2215,9 @@ def test_export_env(): ) -# Make sure to keep this function and the following function at the end of the file -def test_cmd_line_generation(): - os.system( - f"python3 {parent}/src/codeflare_sdk/utils/generate_yaml.py --name=unit-cmd-cluster --min-cpu=1 --max-cpu=1 --min-memory=2 --max-memory=2 --gpu=1 --workers=2 --template=src/codeflare_sdk/templates/base-template.yaml" - ) - assert filecmp.cmp( - "unit-cmd-cluster.yaml", f"{parent}/tests/test-case-cmd.yaml", shallow=True - ) - os.remove("unit-test-cluster.yaml") - os.remove("unit-test-default-cluster.yaml") - os.remove("unit-cmd-cluster.yaml") - - # Make sure to always keep this function last def test_cleanup(): + os.remove("unit-test-cluster.yaml") + os.remove("unit-test-default-cluster.yaml") os.remove("test.yaml") os.remove("raytest2.yaml")