From 5b980daac1b278989d50cc8bca50a67496697542 Mon Sep 17 00:00:00 2001
From: artemvmin <artemvmin@google.com>
Date: Wed, 6 Mar 2024 10:46:29 -0800
Subject: [PATCH] Update notebook to periodically poll job status (#300)

---
 applications/rag/README.md                    |  13 ++-
 .../rag-kaggle-ray-sql-latest.ipynb           | 102 +++++-------------
 2 files changed, 34 insertions(+), 81 deletions(-)
diff --git a/applications/rag/README.md b/applications/rag/README.md
index d585535c1..e68d1f327 100644
--- a/applications/rag/README.md
+++ b/applications/rag/README.md
@@ -135,9 +135,14 @@ EOF
 
 ### Vector Embeddings for Dataset
 
+Choose a password for your CloudSQL user and store it in a shell variable:
+```
+SQL_PASSWORD=
+```
+
 This step generates the vector embeddings for your input dataset. Currently, the default dataset is [Google Maps Restaurant Reviews](https://www.kaggle.com/datasets/denizbilginn/google-maps-restaurant-reviews). We will use a Jupyter notebook to run a Ray job that generates the embeddings & populates them into the instance `pgvector-instance` created above.
 
-1. Create a CloudSQL user to access the database: `gcloud sql users create rag-user-notebook --password=<choose a password> --instance=pgvector-instance --host=%`
+1. Create a CloudSQL user to access the database: `gcloud sql users create rag-user-notebook --password="${SQL_PASSWORD:?}" --instance=pgvector-instance --host=%`
 
 2. Go to the Jupyterhub service endpoint in a browser:       
    * IAP disable: `kubectl get services proxy-public -n $NAMESPACE --output jsonpath='{.status.loadBalancer.ingress[0].ip}'`
@@ -161,9 +166,9 @@ This step generates the vector embeddings for your input dataset. Currently, the
     * `os.environ['KAGGLE_USERNAME']`
     * `os.environ['KAGGLE_KEY']`
 
-9. Run all the cells in the notebook. This generates vector embeddings for the input dataset (`denizbilginn/google-maps-restaurant-reviews`) and stores them in the `pgvector-instance` via a Ray job.
-    * When the last cell says the job has succeeded (eg: `Job 'raysubmit_APungAw6TyB55qxk' succeeded`), the vector embeddings have been generated and we can launch the frontend chat interface.
-    * Ray may take several minutes to create the runtime environment. During this time, the job will appear to be missing (e.g. `Status message: Job has not started yet`).
+9. Run all the cells in the notebook. This will generate vector embeddings for the input dataset (`denizbilginn/google-maps-restaurant-reviews`) and store them in the `pgvector-instance` via a Ray job.
+    * Once submitted, Ray will take several minutes to create the runtime environment and, if needed, scale up Ray worker nodes. During this time, the job status will remain PENDING.
+    * When the job status is SUCCEEDED, the vector embeddings have been generated and we are ready to launch the frontend chat interface.
 
 ### Launch the Frontend Chat Interface
 
diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
index 72d3d5915..b0a769af1 100644
--- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
+++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "a814e91b-3afe-4c28-a3d6-fe087c7af552",
    "metadata": {},
    "outputs": [],
@@ -13,21 +13,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 2,
    "id": "1e26faef-9e2e-4793-b8af-0e18470b482d",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "google-maps-restaurant-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)\n",
-      "Archive:  /home/jovyan/data/google-maps-restaurant-reviews.zip\n",
-      "  inflating: /persist-data/google-maps-restaurant-reviews/reviews.csv  \n",
-      "  inflating: /persist-data/google-maps-restaurant-reviews/sepetcioglu_restaurant.csv  \n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "import os\n",
     "# navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one.\n",
@@ -42,7 +31,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 3,
    "id": "050f2c66-b92e-4ca6-a3b7-b7448d066f8e",
    "metadata": {},
    "outputs": [],
@@ -53,18 +42,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 4,
    "id": "c82cdcad-c74c-4196-9aa0-2e6bb49f4b58",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Overwriting test/test.py\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "%%writefile test/test.py\n",
     "# Comment out the above line if you want to see notebook print out, but the line is required for the actual ray job (the test.py is downloaded by the ray workers)\n",
@@ -255,38 +236,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 5,
    "id": "aeeb7b7a-23d8-4c6a-8165-7ce5516d2a41",
    "metadata": {},
    "outputs": [],
    "source": [
-    "import ray\n",
+    "import ray, time\n",
     "from ray.job_submission import JobSubmissionClient\n",
     "client = JobSubmissionClient(\"ray://example-cluster-kuberay-head-svc:10001\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "id": "4f9f7495-b239-44e0-a96c-356ac5d48b3a",
+   "execution_count": 6,
+   "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2024-02-12 21:28:25,605\tINFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_6cb7c74b99fef592.zip.\n",
-      "2024-02-12 21:28:25,606\tINFO packaging.py:518 -- Creating a file package for local directory '/home/jovyan/test'.\n"
-     ]
-    },
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "jobid: raysubmit_8cQxrAChfX9BYKUW\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "job_id = client.submit_job(\n",
     "    entrypoint=\"python test.py\",\n",
@@ -303,40 +268,23 @@
     "                \"cloud-sql-python-connector[pg8000]==1.7.0\",\n",
     "                \"SQLAlchemy==2.0.7\",\n",
     "                \"huggingface_hub\",\n",
-    "                ]\n",
+    "        ]\n",
     "    }\n",
     ")\n",
-    "print(\"jobid:\", job_id)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "id": "2724d3a4-7613-4d98-951b-991419208d45",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Job submission server address: http://10.48.2.6:8265\n",
-      "Status for job 'raysubmit_8cQxrAChfX9BYKUW': RUNNING\n",
-      "Status message: Job is currently running.\n",
-      "\u001b[0m"
-     ]
-    }
-   ],
-   "source": [
-    "!ray job status {job_id}  --address \"ray://example-cluster-kuberay-head-svc:10001\" "
+    "\n",
+    "print(\"Job submitted with ID:\", job_id)\n",
+    "prev_status = \"\"\n",
+    "while True:\n",
+    "    status = client.get_job_status(job_id)\n",
+    "    if status != prev_status:\n",
+    "        print(\"Job status:\", status)\n",
+    "        prev_status = status\n",
+    "    if status.is_terminal():\n",
+    "        if status == 'FAILED':\n",
+    "            print(\"Job info:\", client.get_job_info(job_id))\n",
+    "        break\n",
+    "    time.sleep(5)\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9c5c68f3-14d4-4ecf-afbd-c8f5c742618e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {