Skip to content

Commit

Permalink
K8s Support for SDK (#146)
Browse files Browse the repository at this point in the history
* First pass on just the basics

* Added namespace retrieval and dashboard route access via kubernetes

* Added exception handling

* Remove unnecessary comment

* Change AppWrapper (AW) filesystem loading to k8s and begin converting unit tests

* Finished unit test update

* Update requirements

* Get cluster (#189)

* Add: get_cluster function to get cluster with specified name and namespace

* Test: make unit tests for get_cluster function

* Fix: unit test failing because the ray cluster object changed (#208)

* Remove oc client and add helper functions (#187)

* Remove oc client and add helper functions

* Updates for error checking

* make sure tests run without oc login

* Removing CLI appwrapper generation

* Updated import

---------

Co-authored-by: Mustafa Eyceoz <[email protected]>

* Remove unused import

* Update authentication for K8s (#237)

* Updated authentication for Kubernetes

* Updated template name and comment

* Updated login functionality

* Altered config_check() function

* Altered comments and changed config_check() function

* Added logic for handling current namespace when a user authenticates via kube client

* Changed formatting

* Made handler functions generic and altered get_current_namespace() functionality

* Changed error message for cluster configuration

* Removed default values for token + server

* Added check for correct credentials

* Changed how certificates are used with certifi.where

* Added unit tests for new authentication methods

* Fixed formatting and updated .gitignore to include test created files

* Fixed .gitignore

* Updated authentication unit tests

---------

Co-authored-by: Carson Harrell <[email protected]>
Co-authored-by: ted chang <[email protected]>
Co-authored-by: Mark Campbell <[email protected]>
  • Loading branch information
4 people committed Jul 20, 2023
1 parent 30ec84c commit d7ae866
Show file tree
Hide file tree
Showing 14 changed files with 1,525 additions and 1,516 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,5 @@ Pipfile.lock
poetry.lock
.venv*
build/
tls-cluster-namespace
quicktest.yaml
88 changes: 31 additions & 57 deletions demo-notebooks/interactive/local_interactive.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,20 +32,12 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Written to: hfgputest-1.yaml\n"
]
}
],
"outputs": [],
"source": [
"# Create our cluster and submit appwrapper\n",
"namespace = \"default\"\n",
Expand Down Expand Up @@ -89,7 +81,6 @@
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "12eef53c",
"metadata": {},
Expand All @@ -99,38 +90,21 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": null,
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
]
}
],
"outputs": [],
"source": [
"import openshift as oc\n",
"from codeflare_sdk.utils import generate_cert\n",
"\n",
"if local_interactive:\n",
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
" generate_cert.export_env(cluster_name, namespace)\n",
"\n",
"with oc.project(namespace):\n",
" routes=oc.selector(\"route\").objects()\n",
" rayclient_url=\"\"\n",
" for r in routes:\n",
" if \"rayclient\" in r.name():\n",
" rayclient_url=r.model.spec.host\n",
"print(rayclient_url)"
" generate_cert.export_env(cluster_name, namespace)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
"metadata": {
"scrolled": true,
Expand All @@ -141,15 +115,15 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
"2023-06-27 19:14:16,088\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-06-27 19:14:16,100\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-06-27 19:14:16,308\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-06-27 19:14:16,434\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-06-27 19:14:16,436\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-06-27 19:14:18,634\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:18,635\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-06-27 19:14:18,645\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:19,454\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
]
},
{
Expand Down Expand Up @@ -190,18 +164,18 @@
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.12.141:8265\" target=\"_blank\">http://10.254.12.141:8265</a></b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.20.41:8265\" target=\"_blank\">http://10.254.20.41:8265</a></b></td>\n",
"</tr>\n",
"\n",
" </table>\n",
" </div>\n",
"</div>\n"
],
"text/plain": [
"ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10e5d2bb0>)"
"ClientContext(dashboard_url='10.254.20.41:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x108ca2730>)"
]
},
"execution_count": 12,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -210,12 +184,12 @@
"import ray\n",
"\n",
"ray.shutdown()\n",
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
"ray.init(address=cluster.local_client_url(), logging_level=\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 7,
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
"metadata": {},
"outputs": [],
Expand All @@ -239,7 +213,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 8,
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
"metadata": {
"scrolled": true,
Expand All @@ -250,8 +224,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
"2023-06-27 19:14:28,222\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-06-27 19:14:28,222\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
]
}
],
Expand All @@ -261,16 +235,16 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 9,
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
"2023-06-27 19:14:29,202\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-06-27 19:14:31,224\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
]
},
{
Expand All @@ -279,7 +253,7 @@
"1789.4644387076714"
]
},
"execution_count": 15,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -290,18 +264,18 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "9e79b547-a457-4232-b77d-19147067b972",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"2023-06-27 19:14:33,161\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"}\n",
"\n",
"2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
"2023-06-27 19:14:34,460\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
]
}
],
Expand All @@ -312,7 +286,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 11,
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
"metadata": {},
"outputs": [],
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,10 @@ python = "^3.7"
openshift-client = "1.0.18"
rich = "^12.5"
ray = {version = "2.5.0", extras = ["default"]}
kubernetes = "25.3.0"
kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"
executing = "1.2.0"

[tool.poetry.group.docs]
optional = true
Expand Down
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ ray[default]==2.5.0
kubernetes>=25.3.0,<27
codeflare-torchx==0.6.0.dev0
pydantic<2 # 2.0+ broke ray[default] see detail: https://github.com/ray-project/ray/pull/37000
cryptography==40.0.2
executing==1.2.0
Loading

0 comments on commit d7ae866

Please sign in to comment.