Skip to content

Commit

Permalink
Allow interaction with ray client service via Route from outside of O…
Browse files Browse the repository at this point in the history
…CP cluster (#100)

* Allow interaction with rayclient via route from outside of OCP cluster

* update rayclient route generation

* update test-case yamls for client route

* add unit tests for generate_cert.py

* Update local interactive notebook

* Add more cert test cases

* Replace openshift client with k8s python client

* Update initcontainer to work with KubeRay v0.5.0

* Allow rayCluster to spin up in other ns

* Add auth cell to the notebook

* remove non-functional method

---------

Co-authored-by: MichaelClifford <[email protected]>
  • Loading branch information
tedhtchang and MichaelClifford committed Jun 22, 2023
1 parent 8506985 commit df48547
Show file tree
Hide file tree
Showing 10 changed files with 847 additions and 0 deletions.
358 changes: 358 additions & 0 deletions demo-notebooks/interactive/local_interactive.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,358 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "9a44568b-61ef-41c7-8ad1-9a3b128f03a7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Import pieces from codeflare-sdk\n",
"from codeflare_sdk.cluster.cluster import Cluster, ClusterConfiguration\n",
"from codeflare_sdk.cluster.auth import TokenAuthentication"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2cc66278",
"metadata": {},
"outputs": [],
"source": [
"# Create authentication object and log in to desired user account (if not already authenticated)\n",
"auth = TokenAuthentication(\n",
" token = \"XXXX\",\n",
" server = \"XXXX\",\n",
" skip_tls = False\n",
")\n",
"auth.login()"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "4364ac2e-dd10-4d30-ba66-12708daefb3f",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Written to: hfgputest-1.yaml\n"
]
}
],
"source": [
"# Create our cluster and submit appwrapper\n",
"namespace = \"default\"\n",
"cluster_name = \"hfgputest-1\"\n",
"local_interactive = True\n",
"\n",
"cluster = Cluster(ClusterConfiguration(local_interactive=local_interactive, namespace=namespace, name=cluster_name, min_worker=1, max_worker=1, min_cpus=1, max_cpus=1, min_memory=4, max_memory=4, gpu=0, instascale=False, machine_types=[\"m5.xlarge\", \"p3.8xlarge\"]))"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "69968140-15e6-482f-9529-82b0cd19524b",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"cluster.up()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e20f9982-f671-460b-8c22-3d62e101fed9",
"metadata": {
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Waiting for requested resources to be set up...\n",
"Requested cluster up and running!\n"
]
}
],
"source": [
"cluster.wait_ready()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "12eef53c",
"metadata": {},
"source": [
"### Connect via the rayclient route"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "cf1b749e-2335-42c2-b673-26768ec9895d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"rayclient-hfgputest-1-default.apps.tedbig412.cp.fyre.ibm.com\n"
]
}
],
"source": [
"import openshift as oc\n",
"from codeflare_sdk.utils import generate_cert\n",
"\n",
"if local_interactive:\n",
" generate_cert.generate_tls_cert(cluster_name, namespace)\n",
" generate_cert.export_env(cluster_name, namespace)\n",
"\n",
"with oc.project(namespace):\n",
" routes=oc.selector(\"route\").objects()\n",
" rayclient_url=\"\"\n",
" for r in routes:\n",
" if \"rayclient\" in r.name():\n",
" rayclient_url=r.model.spec.host\n",
"print(rayclient_url)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "9483bb98-33b3-4beb-9b15-163d7e76c1d7",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:12:37,816\tINFO client_builder.py:251 -- Passing the following kwargs to ray.init() on the server: logging_level\n",
"2023-05-31 14:12:37,820\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.IDLE\n",
"2023-05-31 14:12:38,034\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.CONNECTING\n",
"2023-05-31 14:12:38,246\tDEBUG worker.py:378 -- client gRPC channel state change: ChannelConnectivity.READY\n",
"2023-05-31 14:12:38,290\tDEBUG worker.py:807 -- Pinging server.\n",
"2023-05-31 14:12:40,521\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:40,523\tDEBUG worker.py:564 -- Scheduling task get_dashboard_url 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x01\\x00\\x00\\x00'\n",
"2023-05-31 14:12:40,535\tDEBUG worker.py:640 -- Retaining c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:12:41,379\tDEBUG worker.py:636 -- Releasing c8ef45ccd0112571ffffffffffffffffffffffff0100000001000000\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
" <div style=\"margin-left: 50px;display: flex;flex-direction: row;align-items: center\">\n",
" <h3 style=\"color: var(--jp-ui-font-color0)\">Ray</h3>\n",
" <svg version=\"1.1\" id=\"ray\" width=\"3em\" viewBox=\"0 0 144.5 144.6\" style=\"margin-left: 3em;margin-right: 3em\">\n",
" <g id=\"layer-1\">\n",
" <path fill=\"#00a2e9\" class=\"st0\" d=\"M97.3,77.2c-3.8-1.1-6.2,0.9-8.3,5.1c-3.5,6.8-9.9,9.9-17.4,9.6S58,88.1,54.8,81.2c-1.4-3-3-4-6.3-4.1\n",
" c-5.6-0.1-9.9,0.1-13.1,6.4c-3.8,7.6-13.6,10.2-21.8,7.6C5.2,88.4-0.4,80.5,0,71.7c0.1-8.4,5.7-15.8,13.8-18.2\n",
" c8.4-2.6,17.5,0.7,22.3,8c1.3,1.9,1.3,5.2,3.6,5.6c3.9,0.6,8,0.2,12,0.2c1.8,0,1.9-1.6,2.4-2.8c3.5-7.8,9.7-11.8,18-11.9\n",
" c8.2-0.1,14.4,3.9,17.8,11.4c1.3,2.8,2.9,3.6,5.7,3.3c1-0.1,2,0.1,3,0c2.8-0.5,6.4,1.7,8.1-2.7s-2.3-5.5-4.1-7.5\n",
" c-5.1-5.7-10.9-10.8-16.1-16.3C84,38,81.9,37.1,78,38.3C66.7,42,56.2,35.7,53,24.1C50.3,14,57.3,2.8,67.7,0.5\n",
" C78.4-2,89,4.7,91.5,15.3c0.1,0.3,0.1,0.5,0.2,0.8c0.7,3.4,0.7,6.9-0.8,9.8c-1.7,3.2-0.8,5,1.5,7.2c6.7,6.5,13.3,13,19.8,19.7\n",
" c1.8,1.8,3,2.1,5.5,1.2c9.1-3.4,17.9-0.6,23.4,7c4.8,6.9,4.6,16.1-0.4,22.9c-5.4,7.2-14.2,9.9-23.1,6.5c-2.3-0.9-3.5-0.6-5.1,1.1\n",
" c-6.7,6.9-13.6,13.7-20.5,20.4c-1.8,1.8-2.5,3.2-1.4,5.9c3.5,8.7,0.3,18.6-7.7,23.6c-7.9,5-18.2,3.8-24.8-2.9\n",
" c-6.4-6.4-7.4-16.2-2.5-24.3c4.9-7.8,14.5-11,23.1-7.8c3,1.1,4.7,0.5,6.9-1.7C91.7,98.4,98,92.3,104.2,86c1.6-1.6,4.1-2.7,2.6-6.2\n",
" c-1.4-3.3-3.8-2.5-6.2-2.6C99.8,77.2,98.9,77.2,97.3,77.2z M72.1,29.7c5.5,0.1,9.9-4.3,10-9.8c0-0.1,0-0.2,0-0.3\n",
" C81.8,14,77,9.8,71.5,10.2c-5,0.3-9,4.2-9.3,9.2c-0.2,5.5,4,10.1,9.5,10.3C71.8,29.7,72,29.7,72.1,29.7z M72.3,62.3\n",
" c-5.4-0.1-9.9,4.2-10.1,9.7c0,0.2,0,0.3,0,0.5c0.2,5.4,4.5,9.7,9.9,10c5.1,0.1,9.9-4.7,10.1-9.8c0.2-5.5-4-10-9.5-10.3\n",
" C72.6,62.3,72.4,62.3,72.3,62.3z M115,72.5c0.1,5.4,4.5,9.7,9.8,9.9c5.6-0.2,10-4.8,10-10.4c-0.2-5.4-4.6-9.7-10-9.7\n",
" c-5.3-0.1-9.8,4.2-9.9,9.5C115,72.1,115,72.3,115,72.5z M19.5,62.3c-5.4,0.1-9.8,4.4-10,9.8c-0.1,5.1,5.2,10.4,10.2,10.3\n",
" c5.6-0.2,10-4.9,9.8-10.5c-0.1-5.4-4.5-9.7-9.9-9.6C19.6,62.3,19.5,62.3,19.5,62.3z M71.8,134.6c5.9,0.2,10.3-3.9,10.4-9.6\n",
" c0.5-5.5-3.6-10.4-9.1-10.8c-5.5-0.5-10.4,3.6-10.8,9.1c0,0.5,0,0.9,0,1.4c-0.2,5.3,4,9.8,9.3,10\n",
" C71.6,134.6,71.7,134.6,71.8,134.6z\"/>\n",
" </g>\n",
" </svg>\n",
" <table>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Python version:</b></td>\n",
" <td style=\"text-align: left\"><b>3.8.13</b></td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Ray version:</b></td>\n",
" <td style=\"text-align: left\"><b> 2.1.0</b></td>\n",
" </tr>\n",
" <tr>\n",
" <td style=\"text-align: left\"><b>Dashboard:</b></td>\n",
" <td style=\"text-align: left\"><b><a href=\"http://10.254.12.141:8265\" target=\"_blank\">http://10.254.12.141:8265</a></b></td>\n",
"</tr>\n",
"\n",
" </table>\n",
" </div>\n",
"</div>\n"
],
"text/plain": [
"ClientContext(dashboard_url='10.254.12.141:8265', python_version='3.8.13', ray_version='2.1.0', ray_commit='23f34d948dae8de9b168667ab27e6cf940b3ae85', protocol_version='2022-10-05', _num_clients=1, _context_to_restore=<ray.util.client._ClientContext object at 0x10e5d2bb0>)"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import ray\n",
"\n",
"ray.shutdown()\n",
"ray.init(address=f\"ray://{rayclient_url}\", logging_level=\"DEBUG\")"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "3436eb4a-217c-4109-a3c3-309fda7e2442",
"metadata": {},
"outputs": [],
"source": [
"import math\n",
"import ray\n",
"\n",
"@ray.remote\n",
"def heavy_calculation_part(num_iterations):\n",
" result = 0.0\n",
" for i in range(num_iterations):\n",
" for j in range(num_iterations):\n",
" for k in range(num_iterations):\n",
" result += math.sin(i) * math.cos(j) * math.tan(k)\n",
" return result\n",
"@ray.remote\n",
"def heavy_calculation(num_iterations):\n",
" results = ray.get([heavy_calculation_part.remote(num_iterations//30) for _ in range(30)])\n",
" return sum(results)\n"
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "5cca1874-2be3-4631-ae48-9adfa45e3af3",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:29,868\tDEBUG worker.py:640 -- Retaining 00ffffffffffffffffffffffffffffffffffffff0100000002000000\n",
"2023-05-31 14:13:29,870\tDEBUG worker.py:564 -- Scheduling task heavy_calculation 0 b'\\x00\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\xff\\x01\\x00\\x00\\x00\\x02\\x00\\x00\\x00'\n"
]
}
],
"source": [
"ref = heavy_calculation.remote(3000)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "01172c29-e8bf-41ef-8db5-eccb07906111",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:32,643\tDEBUG worker.py:640 -- Retaining 16310a0f0a45af5cffffffffffffffffffffffff0100000001000000\n",
"2023-05-31 14:13:34,677\tDEBUG worker.py:439 -- Internal retry for get [ClientObjectRef(16310a0f0a45af5cffffffffffffffffffffffff0100000001000000)]\n"
]
},
{
"data": {
"text/plain": [
"1789.4644387076714"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"ray.get(ref)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "9e79b547-a457-4232-b77d-19147067b972",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2023-05-31 14:13:37,659\tDEBUG dataclient.py:287 -- Got unawaited response connection_cleanup {\n",
"}\n",
"\n",
"2023-05-31 14:13:38,681\tDEBUG dataclient.py:278 -- Shutting down data channel.\n"
]
}
],
"source": [
"ray.cancel(ref)\n",
"ray.shutdown()"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "2c198f1f-68bf-43ff-a148-02b5cb000ff2",
"metadata": {},
"outputs": [],
"source": [
"cluster.down()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6879e471-a69f-4c74-9cec-a195cdead47c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
},
"vscode": {
"interpreter": {
"hash": "f9f85f796d01129d0dd105a088854619f454435301f6ffec2fea96ecbd9be4ac"
}
}
},
"nbformat": 4,
"nbformat_minor": 5
}
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ rich = "^12.5"
ray = {version = "2.1.0", extras = ["default"]}
kubernetes = ">= 25.3.0, < 27"
codeflare-torchx = "0.6.0.dev0"
cryptography = "40.0.2"

[tool.poetry.group.docs]
optional = true
Expand Down
2 changes: 2 additions & 0 deletions src/codeflare_sdk/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ def create_app_wrapper(self):
instascale = self.config.instascale
instance_types = self.config.machine_types
env = self.config.envs
local_interactive = self.config.local_interactive
return generate_appwrapper(
name=name,
namespace=namespace,
Expand All @@ -98,6 +99,7 @@ def create_app_wrapper(self):
instascale=instascale,
instance_types=instance_types,
env=env,
local_interactive=local_interactive,
)

# creates a new cluster with the provided or default spec
Expand Down
1 change: 1 addition & 0 deletions src/codeflare_sdk/cluster/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,4 @@ class ClusterConfiguration:
instascale: bool = False
envs: dict = field(default_factory=dict)
image: str = "ghcr.io/foundation-model-stack/base:ray2.1.0-py38-gpu-pytorch1.12.0cu116-20221213-193103"
local_interactive: bool = False
Loading

0 comments on commit df48547

Please sign in to comment.