From 73aeecb564afee5c5cac2cf345999d6bba9795ea Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Thu, 28 Mar 2024 03:21:29 +0000 Subject: [PATCH 01/20] move mysql stuff to jupyter --- applications/rag/main.tf | 4 ++++ applications/rag/workloads.tfvars | 18 +++++++++--------- .../config-selfauth-autopilot.yaml | 9 +++++++++ .../jupyter_config/config-selfauth.yaml | 8 ++++++++ modules/jupyter/main.tf | 6 ++++++ modules/jupyter/variables.tf | 18 ++++++++++++++++++ 6 files changed, 54 insertions(+), 9 deletions(-) diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 1c015b93f..9ddd1252b 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -189,6 +189,10 @@ module "jupyterhub" { autopilot_cluster = local.enable_autopilot workload_identity_service_account = local.jupyter_service_account + db_secret_name = module.cloudsql.db_secret_name + cloudsql_instance_name = local.cloudsql_instance + db_region = local.cloudsql_instance_region + # IAP Auth parameters create_brand = var.create_brand support_email = var.support_email diff --git a/applications/rag/workloads.tfvars b/applications/rag/workloads.tfvars index d4c12c620..a060fcd8b 100644 --- a/applications/rag/workloads.tfvars +++ b/applications/rag/workloads.tfvars @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -project_id = "" +project_id = "ricliu-gke-dev" ## this is required for terraform to connect to GKE master and deploy workloads create_cluster = true # Create a GKE cluster in the specified network. -autopilot_cluster = true -cluster_name = "" -cluster_location = "us-central1" +autopilot_cluster = false +cluster_name = "raggedy-rag" +cluster_location = "us-east4" create_network = true -network_name = "ml-network" +network_name = "raga-network" subnetwork_cidr = "10.100.0.0/16" ## GKE environment variables @@ -29,10 +29,10 @@ create_gcs_bucket = true # The bucket name must be globally unique (across all of Google Cloud). # To verify, check that `gcloud storage buckets describe gs://` returns a 404. -gcs_bucket = "rag-data-" +gcs_bucket = "rag-data-ricliu-b" cloudsql_instance = "pgvector-instance" -cloudsql_instance_region = "us-central1" # defaults to cluster_location, if not specified +cloudsql_instance_region = "us-east4" # defaults to cluster_location, if not specified ## Service accounts @@ -70,7 +70,7 @@ jupyter_k8s_backend_service_port = 80 jupyter_domain = "" ## Provide domain for ingress resource and ssl certificate. If it's empty, it will use nip.io wildcard dns jupyter_client_id = "" jupyter_client_secret = "" -jupyter_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" +jupyter_members_allowlist = "user:ricliu@google.com,group:,serviceAccount:,domain:google.com" ## Frontend IAP Settings frontend_add_auth = false # Set to true when using auth with IAP @@ -97,4 +97,4 @@ ray_dashboard_k8s_backend_service_port = 8265 ray_dashboard_domain = "" ## Provide domain for ingress resource and ssl certificate. 
If it's empty, it will use nip.io wildcard dns ray_dashboard_client_id = "" ray_dashboard_client_secret = "" -ray_dashboard_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" \ No newline at end of file +ray_dashboard_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" diff --git a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml index 0e1c985e1..293893162 100644 --- a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml @@ -88,6 +88,7 @@ singleuser: extraEnv: # Used for GCSFuse to set the ephemeral storage as the home directory. If not set, it will show a permission error on the pod log when using GCSFuse. JUPYTER_ALLOW_INSECURE_WRITES: "true" + CLOUDSQL_INSTANCE_CONNECTION_NAME: ${cloudsql_instance_connection_name} extraLabels: ${indent(4, chomp(jsonencode(additional_labels)))} image: @@ -111,9 +112,17 @@ singleuser: volumeAttributes: bucketName: ${gcs_bucket} mountOptions: "implicit-dirs,uid=1000,gid=100" + - name: secret-volume + secret: + secretName: ${secret_name} + optional: true + extraVolumeMounts: - name: test-vol mountPath: /persist-data + - name: secret-volume + mountPath: /etc/secret-volume + readOnly: true profileList: - display_name: "CPU (C3)" description: "Creates CPU (C3) VMs as the compute for notebook execution." diff --git a/modules/jupyter/jupyter_config/config-selfauth.yaml b/modules/jupyter/jupyter_config/config-selfauth.yaml index f763e871f..f6b03aa8c 100644 --- a/modules/jupyter/jupyter_config/config-selfauth.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth.yaml @@ -86,6 +86,7 @@ singleuser: extraEnv: # Used for GCSFuse to set the ephemeral storage as the home directory. If not set, it will show a permission error on the pod log when using GCSFuse. JUPYTER_ALLOW_INSECURE_WRITES: "true" + CLOUDSQL_INSTANCE_CONNECTION_NAME: ${cloudsql_instance_connection_name} extraLabels: ${indent(4, chomp(jsonencode(additional_labels)))} image: @@ -109,9 +110,16 @@ singleuser: volumeAttributes: bucketName: ${gcs_bucket} mountOptions: "implicit-dirs,uid=1000,gid=100" + - name: secret-volume + secret: + secretName: ${secret_name} + optional: true extraVolumeMounts: - name: test-vol mountPath: /persist-data + - name: secret-volume + mountPath: /etc/secret-volume + readOnly: true # More info on kubespawner overrides: https://jupyterhub-kubespawner.readthedocs.io/en/latest/spawner.html#kubespawner.KubeSpawner # profile example: # - display_name: "Learning Data Science" diff --git a/modules/jupyter/main.tf b/modules/jupyter/main.tf index c2346e285..9487349dd 100644 --- a/modules/jupyter/main.tf +++ b/modules/jupyter/main.tf @@ -17,6 +17,7 @@ data "google_project" "project" { } locals { + cloudsql_instance_connection_name = format("%s:%s:%s", var.project_id, var.db_region, var.cloudsql_instance_name) additional_labels = tomap({ for item in var.additional_labels : split("=", item)[0] => split("=", item)[1] @@ -121,6 +122,9 @@ resource "helm_release" "jupyterhub" { gcs_bucket = var.gcs_bucket k8s_service_account = var.workload_identity_service_account ephemeral_storage = var.ephemeral_storage + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name + }) ] : [templatefile("${path.module}/jupyter_config/config-selfauth.yaml", { password = var.add_auth ? 
"dummy" : random_password.generated_password[0].result @@ -135,6 +139,8 @@ resource "helm_release" "jupyterhub" { gcs_bucket = var.gcs_bucket k8s_service_account = var.workload_identity_service_account ephemeral_storage = var.ephemeral_storage + secret_name = var.db_secret_name + cloudsql_instance_connection_name = local.cloudsql_instance_connection_name }) ] depends_on = [module.jupyterhub-workload-identity] diff --git a/modules/jupyter/variables.tf b/modules/jupyter/variables.tf index 3afe2e235..d3a04869d 100644 --- a/modules/jupyter/variables.tf +++ b/modules/jupyter/variables.tf @@ -135,3 +135,21 @@ variable "ephemeral_storage" { variable "autopilot_cluster" { type = bool } + +variable "db_region" { + type = string + description = "Cloud SQL instance region" + default = "us-east4" +} + +variable "db_secret_name" { + type = string + description = "CloudSQL user credentials" + default = "empty-secret" +} + +variable "cloudsql_instance_name" { + type = string + description = "Cloud SQL instance name" + default = "pgvector-instance" +} From e5c0e2cb3bacb9284bf35b7af5fd477c4b5a2e39 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Thu, 28 Mar 2024 03:26:13 +0000 Subject: [PATCH 02/20] new notebook --- .../rag-kaggle-ray-sql-refactored.ipynb | 1275 +++++++++++++++++ 1 file changed, 1275 insertions(+) create mode 100644 applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb new file mode 100644 index 000000000..01bf43e3f --- /dev/null +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb @@ -0,0 +1,1275 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "00b1aff4", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace these with your settings\n", + "# Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. 
See https://www.kaggle.com/docs/api#authentication how to create one.\n", + "KAGGLE_USERNAME = \"<your-kaggle-username>\"\n", + "KAGGLE_KEY = \"<your-kaggle-api-key>\"\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "a814e91b-3afe-4c28-a3d6-fe087c7af552", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Requirement already satisfied: ray[default]==2.9.3 in /opt/conda/lib/python3.10/site-packages (2.9.3)\n", + "Requirement already satisfied: kaggle==1.6.6 in /opt/conda/lib/python3.10/site-packages (1.6.6)\n", + "Requirement already satisfied: click>=7.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (8.1.3)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (3.13.3)\n", + "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (4.17.3)\n", + "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.0.5)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (23.1)\n", + "Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (4.21.12)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (6.0)\n", + "Requirement already satisfied: aiosignal in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.3.1)\n", + "Requirement already satisfied: frozenlist in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.4.1)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (2.31.0)\n", + "Requirement already satisfied: aiohttp>=3.7 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (3.7.4.post0)\n", + "Requirement already satisfied: aiohttp-cors in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.7.0)\n", + "Requirement already satisfied: colorful in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.5.6)\n", + "Requirement already satisfied: py-spy>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.3.14)\n", + "Requirement already satisfied: gpustat>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.1.1)\n", + "Requirement already satisfied: opencensus in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.11.4)\n", + "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (2.6.4)\n", + "Requirement already satisfied: prometheus-client>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.17.0)\n", + "Requirement already satisfied: smart-open in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (7.0.4)\n", + "Requirement already satisfied: virtualenv!=20.21.1,>=20.0.24 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (20.25.1)\n", + "Requirement already satisfied: grpcio>=1.42.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.51.1)\n", + "Requirement already satisfied: six>=1.10 in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (1.16.0)\n", + "Requirement already satisfied: certifi in
/opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2023.5.7)\n", + "Requirement already satisfied: python-dateutil in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2.8.2)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (4.65.0)\n", + "Requirement already satisfied: python-slugify in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (8.0.4)\n", + "Requirement already satisfied: urllib3 in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2.0.2)\n", + "Requirement already satisfied: bleach in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (6.0.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (23.1.0)\n", + "Requirement already satisfied: chardet<5.0,>=2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (4.0.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (6.0.4)\n", + "Requirement already satisfied: async-timeout<4.0,>=3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (3.0.1)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (1.9.2)\n", + "Requirement already satisfied: typing-extensions>=3.6.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (4.6.2)\n", + "Requirement already satisfied: nvidia-ml-py>=11.450.129 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (12.535.133)\n", + "Requirement already satisfied: psutil>=5.6.0 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (5.9.5)\n", + "Requirement already satisfied: blessed>=1.17.1 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (1.20.0)\n", + "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[default]==2.9.3) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /opt/conda/lib/python3.10/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[default]==2.9.3) (2.16.3)\n", + "Requirement already satisfied: distlib<1,>=0.3.7 in /opt/conda/lib/python3.10/site-packages (from virtualenv!=20.21.1,>=20.0.24->ray[default]==2.9.3) (0.3.8)\n", + "Requirement already satisfied: platformdirs<5,>=3.9.1 in /opt/conda/lib/python3.10/site-packages (from virtualenv!=20.21.1,>=20.0.24->ray[default]==2.9.3) (4.2.0)\n", + "Requirement already satisfied: webencodings in /opt/conda/lib/python3.10/site-packages (from bleach->kaggle==1.6.6) (0.5.1)\n", + "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray[default]==2.9.3) (0.19.3)\n", + "Requirement already satisfied: opencensus-context>=0.1.3 in /opt/conda/lib/python3.10/site-packages (from opencensus->ray[default]==2.9.3) (0.1.3)\n", + "Requirement already satisfied: google-api-core<3.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from opencensus->ray[default]==2.9.3) (2.18.0)\n", + "Requirement already satisfied: text-unidecode>=1.3 in /opt/conda/lib/python3.10/site-packages (from python-slugify->kaggle==1.6.6) (1.3)\n", + "Requirement already satisfied: 
charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->ray[default]==2.9.3) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->ray[default]==2.9.3) (3.4)\n", + "Requirement already satisfied: wrapt in /opt/conda/lib/python3.10/site-packages (from smart-open->ray[default]==2.9.3) (1.15.0)\n", + "Requirement already satisfied: wcwidth>=0.1.4 in /opt/conda/lib/python3.10/site-packages (from blessed>=1.17.1->gpustat>=1.0.0->ray[default]==2.9.3) (0.2.6)\n", + "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (1.63.0)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (1.23.0)\n", + "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (2.17.3)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (5.3.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (0.2.7)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (4.9)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (0.4.8)\n", + "Collecting langchain\n", + " Downloading langchain-0.1.13-py3-none-any.whl (810 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m810.5/810.5 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: ray==2.9.3 in /opt/conda/lib/python3.10/site-packages (2.9.3)\n", + "Collecting datasets\n", + " Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting sentence-transformers\n", + " Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.3/163.3 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: click>=7.0 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (8.1.3)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (3.13.3)\n", + "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (4.17.3)\n", + "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (1.0.5)\n", + "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (23.1)\n", + "Requirement already 
satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (4.21.12)\n", + "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (6.0)\n", + "Requirement already satisfied: aiosignal in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (1.3.1)\n", + "Requirement already satisfied: frozenlist in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (1.4.1)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (2.31.0)\n", + "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /opt/conda/lib/python3.10/site-packages (from langchain) (2.0.15)\n", + "Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)\n", + " Downloading aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m81.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)\n", + " Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n", + "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", + " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n", + "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", + " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", + "Collecting langchain-community<0.1,>=0.0.29 (from langchain)\n", + " Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m94.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-core<0.2.0,>=0.1.33 (from langchain)\n", + " Downloading langchain_core-0.1.35-py3-none-any.whl (273 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m273.0/273.0 kB\u001b[0m \u001b[31m47.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n", + " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n", + "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n", + " Downloading langsmith-0.1.36-py3-none-any.whl (86 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.9/86.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /opt/conda/lib/python3.10/site-packages (from langchain) (1.24.3)\n", + "Requirement already satisfied: pydantic<3,>=1 in /opt/conda/lib/python3.10/site-packages (from langchain) (2.6.4)\n", + "Collecting tenacity<9.0.0,>=8.1.0 (from langchain)\n", + " Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)\n", + "Collecting pyarrow>=12.0.0 (from datasets)\n", + " Downloading pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.3/38.3 MB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting pyarrow-hotfix (from datasets)\n", + " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", + "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (0.3.6)\n", + "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.0.2)\n", + 
"Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.65.0)\n", + "Collecting xxhash (from datasets)\n", + " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting multiprocess (from datasets)\n", + " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (2023.5.0)\n", + "Collecting huggingface-hub>=0.19.4 (from datasets)\n", + " Downloading huggingface_hub-0.22.1-py3-none-any.whl (388 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m388.6/388.6 kB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)\n", + " Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m115.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting torch>=1.11.0 (from sentence-transformers)\n", + " Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m755.5/755.5 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.2.2)\n", + "Requirement already satisfied: scipy in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.10.1)\n", + "Requirement already satisfied: Pillow in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (9.5.0)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n", + "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.19.4->datasets) (4.6.2)\n", + "Requirement already satisfied: jsonpointer>=1.9 in /opt/conda/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.0)\n", + "Collecting packaging (from ray==2.9.3)\n", + " Downloading 
packaging-23.2-py3-none-any.whl (53 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)\n", + " Downloading orjson-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.8/144.8 kB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", + "Requirement already satisfied: pydantic-core==2.16.3 in /opt/conda/lib/python3.10/site-packages (from pydantic<3,>=1->langchain) (2.16.3)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (2.0.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (2023.5.7)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain) (2.0.2)\n", + "Collecting typing-extensions>=3.7.4.3 (from huggingface-hub>=0.19.4->datasets)\n", + " Using cached typing_extensions-4.10.0-py3-none-any.whl (33 kB)\n", + "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)\n", + "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)\n", + "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.2)\n", + "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m77.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m22.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting triton==2.2.0 (from torch>=1.11.0->sentence-transformers)\n", + " Downloading triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m167.9/167.9 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence-transformers)\n", + " Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", + "\u001b[2K 
\u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m95.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting regex!=2019.12.17 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", + " Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hCollecting tokenizers<0.19,>=0.14 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", + " Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m0:00:01\u001b[0m\n", + "\u001b[?25hCollecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", + " Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray==2.9.3) (0.19.3)\n", + "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", + " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3)\n", + "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3)\n", + "Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (1.2.0)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (3.1.0)\n", + "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", + "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", + " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)\n", + "Installing collected packages: xxhash, typing-extensions, triton, tenacity, safetensors, regex, pyarrow-hotfix, pyarrow, packaging, orjson, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, mypy-extensions, jsonpatch, dill, async-timeout, typing-inspect, 
nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, marshmallow, huggingface-hub, aiohttp, tokenizers, nvidia-cusolver-cu12, dataclasses-json, transformers, torch, langsmith, datasets, sentence-transformers, langchain-core, langchain-text-splitters, langchain-community, langchain\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing_extensions 4.6.2\n", + " Uninstalling typing_extensions-4.6.2:\n", + " Successfully uninstalled typing_extensions-4.6.2\n", + " Attempting uninstall: pyarrow\n", + " Found existing installation: pyarrow 10.0.1\n", + " Uninstalling pyarrow-10.0.1:\n", + " Successfully uninstalled pyarrow-10.0.1\n", + " Attempting uninstall: packaging\n", + " Found existing installation: packaging 23.1\n", + " Uninstalling packaging-23.1:\n", + " Successfully uninstalled packaging-23.1\n", + " Attempting uninstall: jsonpatch\n", + " Found existing installation: jsonpatch 1.32\n", + " Uninstalling jsonpatch-1.32:\n", + " Successfully uninstalled jsonpatch-1.32\n", + " Attempting uninstall: dill\n", + " Found existing installation: dill 0.3.6\n", + " Uninstalling dill-0.3.6:\n", + " Successfully uninstalled dill-0.3.6\n", + " Attempting uninstall: async-timeout\n", + " Found existing installation: async-timeout 3.0.1\n", + " Uninstalling async-timeout-3.0.1:\n", + " Successfully uninstalled async-timeout-3.0.1\n", + " Attempting uninstall: aiohttp\n", + " Found existing installation: aiohttp 3.7.4.post0\n", + " Uninstalling aiohttp-3.7.4.post0:\n", + " Successfully uninstalled aiohttp-3.7.4.post0\n", + "Successfully installed aiohttp-3.9.3 async-timeout-4.0.3 dataclasses-json-0.6.4 datasets-2.18.0 dill-0.3.8 huggingface-hub-0.22.1 jsonpatch-1.33 langchain-0.1.13 langchain-community-0.0.29 langchain-core-0.1.35 langchain-text-splitters-0.0.1 langsmith-0.1.36 marshmallow-3.21.1 multiprocess-0.70.16 mypy-extensions-1.0.0 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 orjson-3.10.0 packaging-23.2 pyarrow-15.0.2 pyarrow-hotfix-0.6 regex-2023.12.25 safetensors-0.4.2 sentence-transformers-2.6.1 tenacity-8.2.3 tokenizers-0.15.2 torch-2.2.2 transformers-4.39.1 triton-2.2.0 typing-extensions-4.10.0 typing-inspect-0.9.0 xxhash-3.4.1\n", + "Collecting cloud-sql-python-connector[pg8000]\n", + " Downloading cloud_sql_python_connector-1.8.0-py2.py3-none-any.whl (36 kB)\n", + "Collecting SQLAlchemy==2.0.7\n", + " Downloading SQLAlchemy-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy==2.0.7) (4.10.0)\n", + "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy==2.0.7) (2.0.2)\n", + "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (3.9.3)\n", + "Collecting cryptography>=42.0.0 (from cloud-sql-python-connector[pg8000])\n", + " Downloading cryptography-42.0.5-cp39-abi3-manylinux_2_28_x86_64.whl 
(4.6 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m101.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: Requests in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (2.31.0)\n", + "Requirement already satisfied: google-auth in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (2.17.3)\n", + "Collecting pg8000>=1.30.5 (from cloud-sql-python-connector[pg8000])\n", + " Downloading pg8000-1.30.5-py3-none-any.whl (46 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.6/46.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: cffi>=1.12 in /opt/conda/lib/python3.10/site-packages (from cryptography>=42.0.0->cloud-sql-python-connector[pg8000]) (1.15.1)\n", + "Collecting scramp>=1.4.4 (from pg8000>=1.30.5->cloud-sql-python-connector[pg8000])\n", + " Downloading scramp-1.4.4-py3-none-any.whl (13 kB)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pg8000>=1.30.5->cloud-sql-python-connector[pg8000]) (2.8.2)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.3.1)\n", + "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (23.1.0)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.4.1)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (6.0.4)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.9.2)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (4.0.3)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (5.3.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (0.2.7)\n", + "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (1.16.0)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (4.9)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (3.1.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (2.0.2)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (2023.5.7)\n", + "Requirement already 
satisfied: pycparser in /opt/conda/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=42.0.0->cloud-sql-python-connector[pg8000]) (2.21)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[pg8000]) (0.4.8)\n", + "Collecting asn1crypto>=1.5.1 (from scramp>=1.4.4->pg8000>=1.30.5->cloud-sql-python-connector[pg8000])\n", + " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hInstalling collected packages: asn1crypto, SQLAlchemy, scramp, pg8000, cryptography, cloud-sql-python-connector\n", + " Attempting uninstall: SQLAlchemy\n", + " Found existing installation: SQLAlchemy 2.0.15\n", + " Uninstalling SQLAlchemy-2.0.15:\n", + " Successfully uninstalled SQLAlchemy-2.0.15\n", + " Attempting uninstall: cryptography\n", + " Found existing installation: cryptography 40.0.2\n", + " Uninstalling cryptography-40.0.2:\n", + " Successfully uninstalled cryptography-40.0.2\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "pyopenssl 23.1.1 requires cryptography<41,>=38.0.0, but you have cryptography 42.0.5 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed SQLAlchemy-2.0.7 asn1crypto-1.5.1 cloud-sql-python-connector-1.8.0 cryptography-42.0.5 pg8000-1.30.5 scramp-1.4.4\n" + ] + } + ], + "source": [ + "!pip install ray[default]==2.9.3 kaggle==1.6.6\n", + "!pip install langchain ray==2.9.3 datasets sentence-transformers\n", + "!pip install cloud-sql-python-connector[pg8000] SQLAlchemy==2.0.7" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "1e26faef-9e2e-4793-b8af-0e18470b482d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading netflix-shows.zip to /home/jovyan/data\n", + " 0%| | 0.00/1.34M [00:00\n", + "
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
Python version:3.10.13
Ray version:2.9.3
Dashboard:http://10.244.0.8:8265
\n", + "\n", + "
\n", + "\n" + ], + "text/plain": [ + "ClientContext(dashboard_url='10.244.0.8:8265', python_version='3.10.13', ray_version='2.9.3', ray_commit='62655e11ed76509b78654b60be67bc59f8f3460a', protocol_version='2023-06-27', _num_clients=1, _context_to_restore=)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import ray\n", + "\n", + "ray.init(\n", + " address=\"ray://ray-cluster-kuberay-head-svc:10001\",\n", + " runtime_env={\n", + " \"pip\": [ \n", + " \"langchain==0.1.9\",\n", + " \"transformers==4.38.1\",\n", + " \"sentence-transformers==2.5.1\",\n", + " \"pyarrow\",\n", + " \"datasets==2.18.0\",\n", + " \"torch==2.0.1\",\n", + " \"cloud-sql-python-connector[pg8000]==1.7.0\",\n", + " \"SQLAlchemy==2.0.7\",\n", + " \"huggingface_hub==0.21.3\",\n", + " ]\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "a392975f-3743-4b2c-8673-087b5633637e", + "metadata": {}, + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "/data/netflix-shows/netflix_titles.csv", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[18], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Process the dataset first, wrap the csv file contents into a Ray dataset\u001b[39;00m\n\u001b[1;32m 2\u001b[0m SHARED_DATASET_BASE_PATH\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/data/netflix-shows/\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m ray_ds \u001b[38;5;241m=\u001b[39m \u001b[43mray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mSHARED_DATASET_BASE_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mREVIEWS_FILE_NAME\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(ray_ds\u001b[38;5;241m.\u001b[39mschema)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Distributed flat map to extract the raw text fields.\u001b[39;00m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/read_api.py:1270\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(paths, filesystem, parallelism, ray_remote_args, arrow_open_stream_args, meta_provider, partition_filter, partitioning, include_paths, ignore_missing_paths, shuffle, file_extensions, **arrow_csv_args)\u001b[0m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m meta_provider \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1268\u001b[0m meta_provider \u001b[38;5;241m=\u001b[39m get_generic_metadata_provider(CSVDatasource\u001b[38;5;241m.\u001b[39m_FILE_EXTENSIONS)\n\u001b[0;32m-> 1270\u001b[0m datasource \u001b[38;5;241m=\u001b[39m \u001b[43mCSVDatasource\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1271\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1272\u001b[0m \u001b[43m \u001b[49m\u001b[43marrow_csv_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrow_csv_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1273\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1274\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mopen_stream_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrow_open_stream_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1275\u001b[0m \u001b[43m \u001b[49m\u001b[43mmeta_provider\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmeta_provider\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1276\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_filter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1277\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1278\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1279\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshuffle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1280\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1281\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile_extensions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_extensions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1282\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m read_datasource(\n\u001b[1;32m 1284\u001b[0m datasource,\n\u001b[1;32m 1285\u001b[0m parallelism\u001b[38;5;241m=\u001b[39mparallelism,\n\u001b[1;32m 1286\u001b[0m ray_remote_args\u001b[38;5;241m=\u001b[39mray_remote_args,\n\u001b[1;32m 1287\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/csv_datasource.py:25\u001b[0m, in \u001b[0;36mCSVDatasource.__init__\u001b[0;34m(self, paths, arrow_csv_args, **file_based_datasource_kwargs)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 19\u001b[0m paths: Union[\u001b[38;5;28mstr\u001b[39m, List[\u001b[38;5;28mstr\u001b[39m]],\n\u001b[1;32m 20\u001b[0m arrow_csv_args: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 21\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfile_based_datasource_kwargs,\n\u001b[1;32m 22\u001b[0m ):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m csv\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfile_based_datasource_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arrow_csv_args \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 28\u001b[0m arrow_csv_args \u001b[38;5;241m=\u001b[39m {}\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py:168\u001b[0m, in \u001b[0;36mFileBasedDatasource.__init__\u001b[0;34m(self, paths, filesystem, schema, open_stream_args, meta_provider, partition_filter, 
partitioning, ignore_missing_paths, shuffle, include_paths, file_extensions)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_include_paths \u001b[38;5;241m=\u001b[39m include_paths\n\u001b[1;32m 165\u001b[0m paths, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_filesystem \u001b[38;5;241m=\u001b[39m _resolve_paths_and_filesystem(paths, filesystem)\n\u001b[1;32m 166\u001b[0m paths, file_sizes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmap\u001b[39m(\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28mlist\u001b[39m,\n\u001b[0;32m--> 168\u001b[0m \u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmeta_provider\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpand_paths\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_filesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m,\n\u001b[1;32m 176\u001b[0m )\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_missing_paths \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(paths) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 180\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of the provided paths exist. 
\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mignore_missing_paths\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m field is set to True.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 182\u001b[0m )\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:178\u001b[0m, in \u001b[0;36mDefaultFileMetadataProvider.expand_paths\u001b[0;34m(self, paths, filesystem, partitioning, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexpand_paths\u001b[39m(\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 173\u001b[0m paths: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 176\u001b[0m ignore_missing_paths: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 177\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Tuple[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mint\u001b[39m]]:\n\u001b[0;32m--> 178\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _expand_paths(paths, filesystem, partitioning, ignore_missing_paths)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:418\u001b[0m, in \u001b[0;36m_expand_paths\u001b[0;34m(paths, filesystem, partitioning, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[38;5;66;03m# We break down our processing paths into a few key cases:\u001b[39;00m\n\u001b[1;32m 406\u001b[0m \u001b[38;5;66;03m# 1. If len(paths) < threshold, fetch the file info for the individual files/paths\u001b[39;00m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;66;03m# serially.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# 3. If more than threshold requests required, parallelize them via Ray tasks.\u001b[39;00m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;66;03m# 1. Small # of paths case.\u001b[39;00m\n\u001b[1;32m 413\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 414\u001b[0m \u001b[38;5;28mlen\u001b[39m(paths) \u001b[38;5;241m<\u001b[39m FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD\n\u001b[1;32m 415\u001b[0m \u001b[38;5;66;03m# Local file systems are very fast to hit.\u001b[39;00m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(filesystem, LocalFileSystem)\n\u001b[1;32m 417\u001b[0m ):\n\u001b[0;32m--> 418\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _get_file_infos_serial(paths, filesystem, ignore_missing_paths)\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 420\u001b[0m \u001b[38;5;66;03m# 2. 
Common path prefix case.\u001b[39;00m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;66;03m# Get longest common path of all paths.\u001b[39;00m\n\u001b[1;32m 422\u001b[0m common_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mcommonpath(paths)\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:445\u001b[0m, in \u001b[0;36m_get_file_infos_serial\u001b[0;34m(paths, filesystem, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_file_infos_serial\u001b[39m(\n\u001b[1;32m 440\u001b[0m paths: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 441\u001b[0m filesystem: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow.fs.FileSystem\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 442\u001b[0m ignore_missing_paths: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 443\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Tuple[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mint\u001b[39m]]:\n\u001b[1;32m 444\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m path \u001b[38;5;129;01min\u001b[39;00m paths:\n\u001b[0;32m--> 445\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[43m_get_file_infos\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:568\u001b[0m, in \u001b[0;36m_get_file_infos\u001b[0;34m(path, filesystem, ignore_missing_path)\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 567\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 568\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(path)\n\u001b[1;32m 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m file_infos\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: /data/netflix-shows/netflix_titles.csv" + ] + } + ], + "source": [ + "# Process the dataset first, wrap the csv file contents into a Ray dataset\n", + "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n", + "ray_ds = ray.data.read_csv(SHARED_DATASET_BASE_PATH + REVIEWS_FILE_NAME)\n", + "print(ray_ds.schema)\n", + "\n", + "# Distributed flat map to extract the raw text fields.\n", + "ds_batch = ray_ds.flat_map(lambda row: [{\n", + " 'item': \"This is a \" + str(row[\"type\"]) + \" in \" + str(row[\"country\"]) + \" called \" + str(row[\"title\"]) + \n", + " \" added at \" + str(row[\"date_added\"]) + \" whose director is \" + str(row[\"director\"]) + \n", + " \" and with cast: \" + str(row[\"cast\"]) + \" released at \" + str(row[\"release_year\"]) + \n", + " \". Its rating is: \" + str(row['rating']) + \". Its duration is \" + str(row[\"duration\"]) + \n", + " \". 
Its description is \" + str(row['description']) + \".\"\n", + "}])\n", + "print(ds_batch.schema)\n", + "\n", + "# Distributed map batches to create chunks out of each row, and fetch the vector embeddings by running inference on the sentence transformer\n", + "ds_embed = ds_batch.map_batches(\n", + " Embed,\n", + " compute=ray.data.ActorPoolStrategy(size=ACTOR_POOL_SIZE),\n", + " batch_size=BATCH_SIZE, # Large batch size to maximize GPU utilization.\n", + " num_gpus=1, # 1 GPU for each actor.\n", + " # num_cpus=1,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caeec9e4-bb01-4355-86ef-065bb443d780", + "metadata": {}, + "outputs": [], + "source": [ + "# Use this block for debug purpose to inspect the embeddings and raw text\n", + "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", + "# for output in ds_embed.iter_rows():\n", + "# # restrict the text string to be less than 65535\n", + "# data_text = output[\"results\"][0][:65535]\n", + "# # vector data pass in needs to be a string \n", + "# data_emb = \",\".join(map(str, output[\"results\"][1]))\n", + "# data_emb = \"[\" + data_emb + \"]\"\n", + "# print (\"raw text:\", data_text, \", emdeddings:\", data_emb)\n", + "\n", + "# print(\"Embeddings ray dataset\", ds_embed.schema)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cff4bbc-574d-4cc2-8c87-d0ff6d351626", + "metadata": {}, + "outputs": [], + "source": [ + "data_text = \"\"\n", + "data_emb = \"\"\n", + "\n", + "with pool.connect() as db_conn:\n", + " db_conn.execute(\n", + " sqlalchemy.text(\n", + " \"CREATE EXTENSION IF NOT EXISTS vector;\"\n", + " )\n", + " )\n", + " db_conn.commit()\n", + "\n", + " create_table_query = \"CREATE TABLE IF NOT EXISTS \" + TABLE_NAME + \" ( id VARCHAR(255) NOT NULL, text TEXT NOT NULL, text_embedding vector(384) NOT NULL, PRIMARY KEY (id));\"\n", + " db_conn.execute(\n", + " sqlalchemy.text(create_table_query)\n", + " )\n", + " # commit transaction (SQLAlchemy v2.X.X is commit as you go)\n", + " db_conn.commit()\n", + " print(\"Created table=\", TABLE_NAME)\n", + " \n", + " query_text = \"INSERT INTO \" + TABLE_NAME + \" (id, text, text_embedding) VALUES (:id, :text, :text_embedding)\"\n", + " insert_stmt = sqlalchemy.text(query_text)\n", + " for output in ds_embed.iter_rows():\n", + " # print (\"type of embeddings\", type(output[\"results\"][1]), \"len embeddings\", len(output[\"results\"][1]))\n", + " # restrict the text string to be less than 65535\n", + " data_text = output[\"results\"][0][:65535]\n", + " # vector data pass in needs to be a string \n", + " data_emb = \",\".join(map(str, output[\"results\"][1]))\n", + " data_emb = \"[\" + data_emb + \"]\"\n", + " # print(\"text_embedding is \", data_emb)\n", + " id = uuid.uuid4() \n", + " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": data_text, \"text_embedding\": data_emb})\n", + "\n", + " # batch commit transactions\n", + " db_conn.commit()\n", + "\n", + " # query and fetch table\n", + " query_text = \"SELECT * FROM \" + TABLE_NAME\n", + " results = db_conn.execute(sqlalchemy.text(query_text)).fetchall()\n", + " # for row in results:\n", + " # print(row)\n", + "\n", + " # verify results\n", + " transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)\n", + " query_text = \"During my holiday in Marmaris we ate here to fit the food. 
It's really good\" \n", + " query_emb = transformer.encode(query_text).tolist()\n", + " query_request = \"SELECT id, text, text_embedding, 1 - ('[\" + \",\".join(map(str, query_emb)) + \"]' <=> text_embedding) AS cosine_similarity FROM \" + TABLE_NAME + \" ORDER BY cosine_similarity DESC LIMIT 5;\" \n", + " query_results = db_conn.execute(sqlalchemy.text(query_request)).fetchall()\n", + " db_conn.commit()\n", + " print(\"print query_results, the 1st one is the hit\")\n", + " for row in query_results:\n", + " print(row)\n", + "\n", + "# cleanup connector object\n", + "connector.close()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "050f2c66-b92e-4ca6-a3b7-b7448d066f8e", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a directory to package the contents that need to be downloaded in ray worker\n", + "! mkdir -p test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c82cdcad-c74c-4196-9aa0-2e6bb49f4b58", + "metadata": {}, + "outputs": [], + "source": [ + "%%writefile test/test.py\n", + "# Comment out the above line if you want to see notebook print out, but the line is required for the actual ray job (the test.py is downloaded by the ray workers)\n", + "\n", + "import os\n", + "import uuid\n", + "import ray\n", + "from langchain.document_loaders import ArxivLoader\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from sentence_transformers import SentenceTransformer\n", + "from typing import List\n", + "import torch\n", + "from datasets import load_dataset_builder, load_dataset, Dataset\n", + "from huggingface_hub import snapshot_download\n", + "from google.cloud.sql.connector import Connector, IPTypes\n", + "import sqlalchemy\n", + "\n", + "# initialize parameters\n", + "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", + "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", + "DB_NAME = \"pgvector-database\"\n", + "\n", + "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", + "DB_USER = db_username_file.read()\n", + "db_username_file.close()\n", + "\n", + "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", + "DB_PASS = db_password_file.read()\n", + "db_password_file.close()\n", + "\n", + "# initialize Connector object\n", + "connector = Connector()\n", + "\n", + "# function to return the database connection object\n", + "def getconn():\n", + " conn = connector.connect(\n", + " INSTANCE_CONNECTION_NAME,\n", + " \"pg8000\",\n", + " user=DB_USER,\n", + " password=DB_PASS,\n", + " db=DB_NAME,\n", + " ip_type=IPTypes.PRIVATE\n", + " )\n", + " return conn\n", + "\n", + "# create connection pool with 'creator' argument to our connection object function\n", + "pool = sqlalchemy.create_engine(\n", + " \"postgresql+pg8000://\",\n", + " creator=getconn,\n", + ")\n", + "\n", + "SHARED_DATA_BASEPATH='/data/rag/st'\n", + "SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' # Transformer to use for converting text chunks to vector embeddings\n", + "SENTENCE_TRANSFORMER_MODEL_PATH_NAME='models--intfloat--multilingual-e5-small' # the downloaded model path takes this form for a given model name\n", + "SENTENCE_TRANSFORMER_MODEL_SNAPSHOT=\"ffdcc22a9a5c973ef0470385cef91e1ecb461d9f\" # specific snapshot of the model to use\n", + "SENTENCE_TRANSFORMER_MODEL_PATH = SHARED_DATA_BASEPATH + '/' + SENTENCE_TRANSFORMER_MODEL_PATH_NAME + '/snapshots/' + SENTENCE_TRANSFORMER_MODEL_SNAPSHOT # the path where the model is downloaded 
one time\n",
+    "\n",
+    "# the dataset was pre-downloaded to the GCS bucket by an earlier cell in this notebook; the Ray workers will find it already mounted\n",
+    "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n",
+    "REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n",
+    "\n",
+    "BATCH_SIZE = 100\n",
+    "CHUNK_SIZE = 1000 # size of the text chunks that will be converted to vector embeddings\n",
+    "CHUNK_OVERLAP = 10\n",
+    "TABLE_NAME = 'netflix_reviews_db' # CloudSQL table name\n",
+    "DIMENSION = 384 # Embeddings size\n",
+    "ACTOR_POOL_SIZE = 1 # number of actors for the distributed map_batches function\n",
+    "\n",
+    "class Embed:\n",
+    "    def __init__(self):\n",
+    "        print(\"torch cuda version\", torch.version.cuda)\n",
+    "        device=\"cpu\"\n",
+    "        if torch.cuda.is_available():\n",
+    "            print(\"device cuda found\")\n",
+    "            device=\"cuda\"\n",
+    "\n",
+    "        print(\"reading sentence transformer model from cache path:\", SENTENCE_TRANSFORMER_MODEL_PATH)\n",
+    "        self.transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_PATH, device=device)\n",
+    "        self.splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len)\n",
+    "\n",
+    "    def __call__(self, text_batch: List[str]):\n",
+    "        text = text_batch[\"item\"]\n",
+    "        # print(\"type(text)=\", type(text), \"type(text_batch)=\", type(text_batch))\n",
+    "        chunks = []\n",
+    "        for data in text:\n",
+    "            splits = self.splitter.split_text(data)\n",
+    "            # print(\"len(data)\", len(data), \"len(splits)=\", len(splits))\n",
+    "            chunks.extend(splits)\n",
+    "\n",
+    "        embeddings = self.transformer.encode(\n",
+    "            chunks,\n",
+    "            batch_size=BATCH_SIZE\n",
+    "        ).tolist()\n",
+    "        print(\"len(chunks)=\", len(chunks), \", len(emb)=\", len(embeddings))\n",
+    "        return {'results': list(zip(chunks, embeddings))}\n",
+    "\n",
+    "\n",
+    "# prepare the persistent shared directory to store artifacts needed by the ray workers\n",
+    "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n",
+    "\n",
+    "# one-time download of the sentence transformer model to shared persistent storage available to the ray workers\n",
+    "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)\n",
+    "\n",
+    "# Process the dataset first, wrap the CSV file contents into a Ray dataset\n",
+    "ray_ds = ray.data.read_csv(SHARED_DATASET_BASE_PATH + REVIEWS_FILE_NAME)\n",
+    "print(ray_ds.schema)\n",
+    "\n",
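+    "# Note: flat_map may emit zero or more output rows per input row; here it emits exactly\n",
+    "# one, serializing each show's structured fields into a single natural-language sentence\n",
+    "# so the embedding model sees self-contained text. A sketch of the transformation, for a\n",
+    "# hypothetical input row {'type': 'Movie', 'title': 'X', ...}:\n",
+    "#   {'item': 'This is a Movie in ... called X added at ...'}\n",
+    "# Distributed flat map to extract the raw text fields.\n",
+    "ds_batch = ray_ds.flat_map(lambda row: [{\n",
+    "    'item': \"This is a \" + str(row[\"type\"]) + \" in \" + str(row[\"country\"]) + \" called \" + str(row[\"title\"]) + \n",
+    "            \" added at \" + str(row[\"date_added\"]) + \" whose director is \" + str(row[\"director\"]) + \n",
+    "            \" and with cast: \" + str(row[\"cast\"]) + \" released at \" + str(row[\"release_year\"]) + \n",
+    "            \". Its rating is: \" + str(row['rating']) + \". Its duration is \" + str(row[\"duration\"]) + \n",
+    "            \". 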
Its description is \" + str(row['description']) + \".\"\n", + "}])\n", + "print(ds_batch.schema)\n", + "\n", + "# Distributed map batches to create chunks out of each row, and fetch the vector embeddings by running inference on the sentence transformer\n", + "ds_embed = ds_batch.map_batches(\n", + " Embed,\n", + " compute=ray.data.ActorPoolStrategy(size=ACTOR_POOL_SIZE),\n", + " batch_size=BATCH_SIZE, # Large batch size to maximize GPU utilization.\n", + " num_gpus=1, # 1 GPU for each actor.\n", + " # num_cpus=1,\n", + ")\n", + "\n", + "# Use this block for debug purpose to inspect the embeddings and raw text\n", + "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", + "# for output in ds_embed.iter_rows():\n", + "# # restrict the text string to be less than 65535\n", + "# data_text = output[\"results\"][0][:65535]\n", + "# # vector data pass in needs to be a string \n", + "# data_emb = \",\".join(map(str, output[\"results\"][1]))\n", + "# data_emb = \"[\" + data_emb + \"]\"\n", + "# print (\"raw text:\", data_text, \", emdeddings:\", data_emb)\n", + "\n", + "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", + "\n", + "data_text = \"\"\n", + "data_emb = \"\"\n", + "\n", + "with pool.connect() as db_conn:\n", + " db_conn.execute(\n", + " sqlalchemy.text(\n", + " \"CREATE EXTENSION IF NOT EXISTS vector;\"\n", + " )\n", + " )\n", + " db_conn.commit()\n", + "\n", + " create_table_query = \"CREATE TABLE IF NOT EXISTS \" + TABLE_NAME + \" ( id VARCHAR(255) NOT NULL, text TEXT NOT NULL, text_embedding vector(384) NOT NULL, PRIMARY KEY (id));\"\n", + " db_conn.execute(\n", + " sqlalchemy.text(create_table_query)\n", + " )\n", + " # commit transaction (SQLAlchemy v2.X.X is commit as you go)\n", + " db_conn.commit()\n", + " print(\"Created table=\", TABLE_NAME)\n", + " \n", + " query_text = \"INSERT INTO \" + TABLE_NAME + \" (id, text, text_embedding) VALUES (:id, :text, :text_embedding)\"\n", + " insert_stmt = sqlalchemy.text(query_text)\n", + " for output in ds_embed.iter_rows():\n", + " # print (\"type of embeddings\", type(output[\"results\"][1]), \"len embeddings\", len(output[\"results\"][1]))\n", + " # restrict the text string to be less than 65535\n", + " data_text = output[\"results\"][0][:65535]\n", + " # vector data pass in needs to be a string \n", + " data_emb = \",\".join(map(str, output[\"results\"][1]))\n", + " data_emb = \"[\" + data_emb + \"]\"\n", + " # print(\"text_embedding is \", data_emb)\n", + " id = uuid.uuid4()\n", + " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": data_text, \"text_embedding\": data_emb})\n", + "\n", + " # batch commit transactions\n", + " db_conn.commit()\n", + "\n", + " # query and fetch table\n", + " query_text = \"SELECT * FROM \" + TABLE_NAME\n", + " results = db_conn.execute(sqlalchemy.text(query_text)).fetchall()\n", + " # for row in results:\n", + " # print(row)\n", + "\n", + " # verify results\n", + " transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)\n", + " query_text = \"During my holiday in Marmaris we ate here to fit the food. 
It's really good\" \n", + " query_emb = transformer.encode(query_text).tolist()\n", + " query_request = \"SELECT id, text, text_embedding, 1 - ('[\" + \",\".join(map(str, query_emb)) + \"]' <=> text_embedding) AS cosine_similarity FROM \" + TABLE_NAME + \" ORDER BY cosine_similarity DESC LIMIT 5;\" \n", + " query_results = db_conn.execute(sqlalchemy.text(query_request)).fetchall()\n", + " db_conn.commit()\n", + " print(\"print query_results, the 1st one is the hit\")\n", + " for row in query_results:\n", + " print(row)\n", + "\n", + "# cleanup connector object\n", + "connector.close()\n", + "print (\"end job\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aeeb7b7a-23d8-4c6a-8165-7ce5516d2a41", + "metadata": {}, + "outputs": [], + "source": [ + "import ray, time\n", + "from ray.job_submission import JobSubmissionClient\n", + "client = JobSubmissionClient(\"ray://ray-cluster-kuberay-head-svc:10001\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df", + "metadata": {}, + "outputs": [], + "source": [ + "# Port forward to the Ray dashboard and go to `localhost:8265` in a browser to see job status: kubectl port-forward -n service/ray-cluster-kuberay-head-svc 8265:8265\n", + "import time\n", + "\n", + "start_time = time.time()\n", + "job_id = client.submit_job(\n", + " entrypoint=\"python test.py\",\n", + " # Path to the local directory that contains the entrypoint file.\n", + " runtime_env={\n", + " \"working_dir\": \"/home/jovyan/test\", # upload the local working directory to ray workers\n", + " \"pip\": [\n", + " \"langchain==0.1.9\",\n", + " \"transformers==4.38.1\",\n", + " \"sentence-transformers==2.5.1\",\n", + " \"pyarrow\",\n", + " \"datasets==2.18.0\",\n", + " \"torch==2.0.1\",\n", + " \"cloud-sql-python-connector[pg8000]==1.7.0\",\n", + " \"SQLAlchemy==2.0.7\",\n", + " \"huggingface_hub==0.21.3\",\n", + " ],\n", + " }\n", + ")\n", + "\n", + "print(\"Job submitted with ID:\", job_id)\n", + "prev_status = \"\"\n", + "while True:\n", + " status = client.get_job_status(job_id)\n", + " if status != prev_status:\n", + " print(\"Job status:\", status)\n", + " print(\"Job info:\", client.get_job_info(job_id).message)\n", + " prev_status = status\n", + " if status.is_terminal():\n", + " break\n", + " time.sleep(1)\n", + "end_time = time.time()\n", + "job_duration = end_time - start_time\n", + "print(f\"Job completed in {job_duration} seconds.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98ec6c2d-3295-4f67-9fa0-af6d5708955a", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From ef7d9e54f693b50fe52a4855b460dc214caacac7 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Thu, 28 Mar 2024 03:31:46 +0000 Subject: [PATCH 03/20] fix notebook --- .../rag-kaggle-ray-sql-refactored.ipynb | 954 +----------------- 1 file changed, 17 insertions(+), 937 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb index 01bf43e3f..c571ae1bb 100644 --- 
a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "00b1aff4", "metadata": {}, "outputs": [], @@ -15,319 +15,10 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "a814e91b-3afe-4c28-a3d6-fe087c7af552", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: ray[default]==2.9.3 in /opt/conda/lib/python3.10/site-packages (2.9.3)\n", - "Requirement already satisfied: kaggle==1.6.6 in /opt/conda/lib/python3.10/site-packages (1.6.6)\n", - "Requirement already satisfied: click>=7.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (8.1.3)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (3.13.3)\n", - "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (4.17.3)\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.0.5)\n", - "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (23.1)\n", - "Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (4.21.12)\n", - "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (6.0)\n", - "Requirement already satisfied: aiosignal in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.3.1)\n", - "Requirement already satisfied: frozenlist in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.4.1)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (2.31.0)\n", - "Requirement already satisfied: aiohttp>=3.7 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (3.7.4.post0)\n", - "Requirement already satisfied: aiohttp-cors in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.7.0)\n", - "Requirement already satisfied: colorful in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.5.6)\n", - "Requirement already satisfied: py-spy>=0.2.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.3.14)\n", - "Requirement already satisfied: gpustat>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.1.1)\n", - "Requirement already satisfied: opencensus in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.11.4)\n", - "Requirement already satisfied: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (2.6.4)\n", - "Requirement already satisfied: prometheus-client>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (0.17.0)\n", - "Requirement already satisfied: smart-open in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (7.0.4)\n", - "Requirement already satisfied: virtualenv!=20.21.1,>=20.0.24 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (20.25.1)\n", - "Requirement already satisfied: grpcio>=1.42.0 in /opt/conda/lib/python3.10/site-packages (from ray[default]==2.9.3) (1.51.1)\n", - "Requirement 
already satisfied: six>=1.10 in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (1.16.0)\n", - "Requirement already satisfied: certifi in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2023.5.7)\n", - "Requirement already satisfied: python-dateutil in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2.8.2)\n", - "Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (4.65.0)\n", - "Requirement already satisfied: python-slugify in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (8.0.4)\n", - "Requirement already satisfied: urllib3 in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (2.0.2)\n", - "Requirement already satisfied: bleach in /opt/conda/lib/python3.10/site-packages (from kaggle==1.6.6) (6.0.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (23.1.0)\n", - "Requirement already satisfied: chardet<5.0,>=2.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (4.0.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (6.0.4)\n", - "Requirement already satisfied: async-timeout<4.0,>=3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (3.0.1)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (1.9.2)\n", - "Requirement already satisfied: typing-extensions>=3.6.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp>=3.7->ray[default]==2.9.3) (4.6.2)\n", - "Requirement already satisfied: nvidia-ml-py>=11.450.129 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (12.535.133)\n", - "Requirement already satisfied: psutil>=5.6.0 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (5.9.5)\n", - "Requirement already satisfied: blessed>=1.17.1 in /opt/conda/lib/python3.10/site-packages (from gpustat>=1.0.0->ray[default]==2.9.3) (1.20.0)\n", - "Requirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[default]==2.9.3) (0.6.0)\n", - "Requirement already satisfied: pydantic-core==2.16.3 in /opt/conda/lib/python3.10/site-packages (from pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3->ray[default]==2.9.3) (2.16.3)\n", - "Requirement already satisfied: distlib<1,>=0.3.7 in /opt/conda/lib/python3.10/site-packages (from virtualenv!=20.21.1,>=20.0.24->ray[default]==2.9.3) (0.3.8)\n", - "Requirement already satisfied: platformdirs<5,>=3.9.1 in /opt/conda/lib/python3.10/site-packages (from virtualenv!=20.21.1,>=20.0.24->ray[default]==2.9.3) (4.2.0)\n", - "Requirement already satisfied: webencodings in /opt/conda/lib/python3.10/site-packages (from bleach->kaggle==1.6.6) (0.5.1)\n", - "Requirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray[default]==2.9.3) (0.19.3)\n", - "Requirement already satisfied: opencensus-context>=0.1.3 in /opt/conda/lib/python3.10/site-packages (from opencensus->ray[default]==2.9.3) (0.1.3)\n", - "Requirement already satisfied: google-api-core<3.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from opencensus->ray[default]==2.9.3) (2.18.0)\n", - "Requirement already satisfied: 
text-unidecode>=1.3 in /opt/conda/lib/python3.10/site-packages (from python-slugify->kaggle==1.6.6) (1.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->ray[default]==2.9.3) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->ray[default]==2.9.3) (3.4)\n", - "Requirement already satisfied: wrapt in /opt/conda/lib/python3.10/site-packages (from smart-open->ray[default]==2.9.3) (1.15.0)\n", - "Requirement already satisfied: wcwidth>=0.1.4 in /opt/conda/lib/python3.10/site-packages (from blessed>=1.17.1->gpustat>=1.0.0->ray[default]==2.9.3) (0.2.6)\n", - "Requirement already satisfied: googleapis-common-protos<2.0.dev0,>=1.56.2 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (1.63.0)\n", - "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.3 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (1.23.0)\n", - "Requirement already satisfied: google-auth<3.0.dev0,>=2.14.1 in /opt/conda/lib/python3.10/site-packages (from google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (2.17.3)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (5.3.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (0.2.7)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (4.9)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0.dev0,>=2.14.1->google-api-core<3.0.0,>=1.0.0->opencensus->ray[default]==2.9.3) (0.4.8)\n", - "Collecting langchain\n", - " Downloading langchain-0.1.13-py3-none-any.whl (810 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m810.5/810.5 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: ray==2.9.3 in /opt/conda/lib/python3.10/site-packages (2.9.3)\n", - "Collecting datasets\n", - " Downloading datasets-2.18.0-py3-none-any.whl (510 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m510.5/510.5 kB\u001b[0m \u001b[31m54.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting sentence-transformers\n", - " Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m163.3/163.3 kB\u001b[0m \u001b[31m33.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: click>=7.0 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (8.1.3)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (3.13.3)\n", - "Requirement already satisfied: jsonschema in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (4.17.3)\n", - "Requirement already satisfied: msgpack<2.0.0,>=1.0.0 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) 
(1.0.5)\n", - "Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (23.1)\n", - "Requirement already satisfied: protobuf!=3.19.5,>=3.15.3 in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (4.21.12)\n", - "Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (6.0)\n", - "Requirement already satisfied: aiosignal in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (1.3.1)\n", - "Requirement already satisfied: frozenlist in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (1.4.1)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from ray==2.9.3) (2.31.0)\n", - "Requirement already satisfied: SQLAlchemy<3,>=1.4 in /opt/conda/lib/python3.10/site-packages (from langchain) (2.0.15)\n", - "Collecting aiohttp<4.0.0,>=3.8.3 (from langchain)\n", - " Downloading aiohttp-3.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m81.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting async-timeout<5.0.0,>=4.0.0 (from langchain)\n", - " Downloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n", - "Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)\n", - " Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)\n", - "Collecting jsonpatch<2.0,>=1.33 (from langchain)\n", - " Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)\n", - "Collecting langchain-community<0.1,>=0.0.29 (from langchain)\n", - " Downloading langchain_community-0.0.29-py3-none-any.whl (1.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.8/1.8 MB\u001b[0m \u001b[31m94.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting langchain-core<0.2.0,>=0.1.33 (from langchain)\n", - " Downloading langchain_core-0.1.35-py3-none-any.whl (273 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m273.0/273.0 kB\u001b[0m \u001b[31m47.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting langchain-text-splitters<0.1,>=0.0.1 (from langchain)\n", - " Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)\n", - "Collecting langsmith<0.2.0,>=0.1.17 (from langchain)\n", - " Downloading langsmith-0.1.36-py3-none-any.whl (86 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.9/86.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: numpy<2,>=1 in /opt/conda/lib/python3.10/site-packages (from langchain) (1.24.3)\n", - "Requirement already satisfied: pydantic<3,>=1 in /opt/conda/lib/python3.10/site-packages (from langchain) (2.6.4)\n", - "Collecting tenacity<9.0.0,>=8.1.0 (from langchain)\n", - " Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)\n", - "Collecting pyarrow>=12.0.0 (from datasets)\n", - " Downloading pyarrow-15.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.3/38.3 MB\u001b[0m \u001b[31m40.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting pyarrow-hotfix (from datasets)\n", - " Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n", - "Requirement already satisfied: dill<0.3.9,>=0.3.0 in 
/opt/conda/lib/python3.10/site-packages (from datasets) (0.3.6)\n", - "Requirement already satisfied: pandas in /opt/conda/lib/python3.10/site-packages (from datasets) (2.0.2)\n", - "Requirement already satisfied: tqdm>=4.62.1 in /opt/conda/lib/python3.10/site-packages (from datasets) (4.65.0)\n", - "Collecting xxhash (from datasets)\n", - " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m37.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting multiprocess (from datasets)\n", - " Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m30.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: fsspec[http]<=2024.2.0,>=2023.1.0 in /opt/conda/lib/python3.10/site-packages (from datasets) (2023.5.0)\n", - "Collecting huggingface-hub>=0.19.4 (from datasets)\n", - " Downloading huggingface_hub-0.22.1-py3-none-any.whl (388 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m388.6/388.6 kB\u001b[0m \u001b[31m56.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting transformers<5.0.0,>=4.32.0 (from sentence-transformers)\n", - " Downloading transformers-4.39.1-py3-none-any.whl (8.8 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m115.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting torch>=1.11.0 (from sentence-transformers)\n", - " Downloading torch-2.2.2-cp310-cp310-manylinux1_x86_64.whl (755.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m755.5/755.5 MB\u001b[0m \u001b[31m1.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: scikit-learn in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.2.2)\n", - "Requirement already satisfied: scipy in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (1.10.1)\n", - "Requirement already satisfied: Pillow in /opt/conda/lib/python3.10/site-packages (from sentence-transformers) (9.5.0)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (23.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp<4.0.0,>=3.8.3->langchain) (1.9.2)\n", - "Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading marshmallow-3.21.1-py3-none-any.whl (49 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m11.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.19.4->datasets) (4.6.2)\n", - "Requirement already 
satisfied: jsonpointer>=1.9 in /opt/conda/lib/python3.10/site-packages (from jsonpatch<2.0,>=1.33->langchain) (2.0)\n", - "Collecting packaging (from ray==2.9.3)\n", - " Downloading packaging-23.2-py3-none-any.whl (53 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting orjson<4.0.0,>=3.9.14 (from langsmith<0.2.0,>=0.1.17->langchain)\n", - " Downloading orjson-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (144 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.8/144.8 kB\u001b[0m \u001b[31m28.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: annotated-types>=0.4.0 in /opt/conda/lib/python3.10/site-packages (from pydantic<3,>=1->langchain) (0.6.0)\n", - "Requirement already satisfied: pydantic-core==2.16.3 in /opt/conda/lib/python3.10/site-packages (from pydantic<3,>=1->langchain) (2.16.3)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (2.0.2)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->ray==2.9.3) (2023.5.7)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy<3,>=1.4->langchain) (2.0.2)\n", - "Collecting typing-extensions>=3.7.4.3 (from huggingface-hub>=0.19.4->datasets)\n", - " Using cached typing_extensions-4.10.0-py3-none-any.whl (33 kB)\n", - "Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (1.12)\n", - "Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.1)\n", - "Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.11.0->sentence-transformers) (3.1.2)\n", - "Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m23.7/23.7 MB\u001b[0m \u001b[31m75.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m823.6/823.6 kB\u001b[0m \u001b[31m77.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m14.1/14.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from 
torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m731.7/731.7 MB\u001b[0m \u001b[31m1.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m410.6/410.6 MB\u001b[0m \u001b[31m1.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-curand-cu12==10.3.2.106 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m56.5/56.5 MB\u001b[0m \u001b[31m24.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusolver-cu12==11.4.5.107 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cusolver_cu12-11.4.5.107-py3-none-manylinux1_x86_64.whl (124.2 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m124.2/124.2 MB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-cusparse-cu12==12.1.0.106 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_cusparse_cu12-12.1.0.106-py3-none-manylinux1_x86_64.whl (196.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m196.0/196.0 MB\u001b[0m \u001b[31m6.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nccl-cu12==2.19.3 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_nccl_cu12-2.19.3-py3-none-manylinux1_x86_64.whl (166.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m166.0/166.0 MB\u001b[0m \u001b[31m6.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nvtx-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading nvidia_nvtx_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (99 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m99.1/99.1 kB\u001b[0m \u001b[31m22.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting triton==2.2.0 (from torch>=1.11.0->sentence-transformers)\n", - " Downloading triton-2.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (167.9 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m167.9/167.9 MB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting nvidia-nvjitlink-cu12 (from nvidia-cusolver-cu12==11.4.5.107->torch>=1.11.0->sentence-transformers)\n", - " 
Downloading nvidia_nvjitlink_cu12-12.4.99-py3-none-manylinux2014_x86_64.whl (21.1 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m21.1/21.1 MB\u001b[0m \u001b[31m95.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting regex!=2019.12.17 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", - " Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m1.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hCollecting tokenizers<0.19,>=0.14 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", - " Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m6.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m0:00:01\u001b[0m\n", - "\u001b[?25hCollecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.32.0->sentence-transformers)\n", - " Downloading safetensors-0.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: pyrsistent!=0.17.0,!=0.17.1,!=0.17.2,>=0.14.0 in /opt/conda/lib/python3.10/site-packages (from jsonschema->ray==2.9.3) (0.19.3)\n", - "Collecting dill<0.3.9,>=0.3.0 (from datasets)\n", - " Downloading dill-0.3.8-py3-none-any.whl (116 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m27.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2.8.2)\n", - "Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3)\n", - "Requirement already satisfied: tzdata>=2022.1 in /opt/conda/lib/python3.10/site-packages (from pandas->datasets) (2023.3)\n", - "Requirement already satisfied: joblib>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (1.2.0)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from scikit-learn->sentence-transformers) (3.1.0)\n", - "Requirement already satisfied: six>=1.5 in /opt/conda/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", - "Collecting mypy-extensions>=0.3.0 (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain)\n", - " Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence-transformers) (2.1.2)\n", - "Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.11.0->sentence-transformers) (1.3.0)\n", - "Installing collected packages: xxhash, typing-extensions, triton, tenacity, safetensors, regex, pyarrow-hotfix, pyarrow, packaging, orjson, nvidia-nvtx-cu12, nvidia-nvjitlink-cu12, nvidia-nccl-cu12, nvidia-curand-cu12, nvidia-cufft-cu12, nvidia-cuda-runtime-cu12, nvidia-cuda-nvrtc-cu12, 
nvidia-cuda-cupti-cu12, nvidia-cublas-cu12, mypy-extensions, jsonpatch, dill, async-timeout, typing-inspect, nvidia-cusparse-cu12, nvidia-cudnn-cu12, multiprocess, marshmallow, huggingface-hub, aiohttp, tokenizers, nvidia-cusolver-cu12, dataclasses-json, transformers, torch, langsmith, datasets, sentence-transformers, langchain-core, langchain-text-splitters, langchain-community, langchain\n", - " Attempting uninstall: typing-extensions\n", - " Found existing installation: typing_extensions 4.6.2\n", - " Uninstalling typing_extensions-4.6.2:\n", - " Successfully uninstalled typing_extensions-4.6.2\n", - " Attempting uninstall: pyarrow\n", - " Found existing installation: pyarrow 10.0.1\n", - " Uninstalling pyarrow-10.0.1:\n", - " Successfully uninstalled pyarrow-10.0.1\n", - " Attempting uninstall: packaging\n", - " Found existing installation: packaging 23.1\n", - " Uninstalling packaging-23.1:\n", - " Successfully uninstalled packaging-23.1\n", - " Attempting uninstall: jsonpatch\n", - " Found existing installation: jsonpatch 1.32\n", - " Uninstalling jsonpatch-1.32:\n", - " Successfully uninstalled jsonpatch-1.32\n", - " Attempting uninstall: dill\n", - " Found existing installation: dill 0.3.6\n", - " Uninstalling dill-0.3.6:\n", - " Successfully uninstalled dill-0.3.6\n", - " Attempting uninstall: async-timeout\n", - " Found existing installation: async-timeout 3.0.1\n", - " Uninstalling async-timeout-3.0.1:\n", - " Successfully uninstalled async-timeout-3.0.1\n", - " Attempting uninstall: aiohttp\n", - " Found existing installation: aiohttp 3.7.4.post0\n", - " Uninstalling aiohttp-3.7.4.post0:\n", - " Successfully uninstalled aiohttp-3.7.4.post0\n", - "Successfully installed aiohttp-3.9.3 async-timeout-4.0.3 dataclasses-json-0.6.4 datasets-2.18.0 dill-0.3.8 huggingface-hub-0.22.1 jsonpatch-1.33 langchain-0.1.13 langchain-community-0.0.29 langchain-core-0.1.35 langchain-text-splitters-0.0.1 langsmith-0.1.36 marshmallow-3.21.1 multiprocess-0.70.16 mypy-extensions-1.0.0 nvidia-cublas-cu12-12.1.3.1 nvidia-cuda-cupti-cu12-12.1.105 nvidia-cuda-nvrtc-cu12-12.1.105 nvidia-cuda-runtime-cu12-12.1.105 nvidia-cudnn-cu12-8.9.2.26 nvidia-cufft-cu12-11.0.2.54 nvidia-curand-cu12-10.3.2.106 nvidia-cusolver-cu12-11.4.5.107 nvidia-cusparse-cu12-12.1.0.106 nvidia-nccl-cu12-2.19.3 nvidia-nvjitlink-cu12-12.4.99 nvidia-nvtx-cu12-12.1.105 orjson-3.10.0 packaging-23.2 pyarrow-15.0.2 pyarrow-hotfix-0.6 regex-2023.12.25 safetensors-0.4.2 sentence-transformers-2.6.1 tenacity-8.2.3 tokenizers-0.15.2 torch-2.2.2 transformers-4.39.1 triton-2.2.0 typing-extensions-4.10.0 typing-inspect-0.9.0 xxhash-3.4.1\n", - "Collecting cloud-sql-python-connector[pg8000]\n", - " Downloading cloud_sql_python_connector-1.8.0-py2.py3-none-any.whl (36 kB)\n", - "Collecting SQLAlchemy==2.0.7\n", - " Downloading SQLAlchemy-2.0.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m64.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: typing-extensions>=4.2.0 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy==2.0.7) (4.10.0)\n", - "Requirement already satisfied: greenlet!=0.4.17 in /opt/conda/lib/python3.10/site-packages (from SQLAlchemy==2.0.7) (2.0.2)\n", - "Requirement already satisfied: aiohttp in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (3.9.3)\n", - "Collecting cryptography>=42.0.0 (from 
cloud-sql-python-connector[pg8000])\n", - " Downloading cryptography-42.0.5-cp39-abi3-manylinux_2_28_x86_64.whl (4.6 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m101.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: Requests in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (2.31.0)\n", - "Requirement already satisfied: google-auth in /opt/conda/lib/python3.10/site-packages (from cloud-sql-python-connector[pg8000]) (2.17.3)\n", - "Collecting pg8000>=1.30.5 (from cloud-sql-python-connector[pg8000])\n", - " Downloading pg8000-1.30.5-py3-none-any.whl (46 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.6/46.6 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: cffi>=1.12 in /opt/conda/lib/python3.10/site-packages (from cryptography>=42.0.0->cloud-sql-python-connector[pg8000]) (1.15.1)\n", - "Collecting scramp>=1.4.4 (from pg8000>=1.30.5->cloud-sql-python-connector[pg8000])\n", - " Downloading scramp-1.4.4-py3-none-any.whl (13 kB)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/conda/lib/python3.10/site-packages (from pg8000>=1.30.5->cloud-sql-python-connector[pg8000]) (2.8.2)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.3.1)\n", - "Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (23.1.0)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.4.1)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (6.0.4)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (1.9.2)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->cloud-sql-python-connector[pg8000]) (4.0.3)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (5.3.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (0.2.7)\n", - "Requirement already satisfied: six>=1.9.0 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (1.16.0)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /opt/conda/lib/python3.10/site-packages (from google-auth->cloud-sql-python-connector[pg8000]) (4.9)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (3.4)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (2.0.2)\n", - "Requirement already satisfied: certifi>=2017.4.17 in 
/opt/conda/lib/python3.10/site-packages (from Requests->cloud-sql-python-connector[pg8000]) (2023.5.7)\n", - "Requirement already satisfied: pycparser in /opt/conda/lib/python3.10/site-packages (from cffi>=1.12->cryptography>=42.0.0->cloud-sql-python-connector[pg8000]) (2.21)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /opt/conda/lib/python3.10/site-packages (from pyasn1-modules>=0.2.1->google-auth->cloud-sql-python-connector[pg8000]) (0.4.8)\n", - "Collecting asn1crypto>=1.5.1 (from scramp>=1.4.4->pg8000>=1.30.5->cloud-sql-python-connector[pg8000])\n", - " Downloading asn1crypto-1.5.1-py2.py3-none-any.whl (105 kB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m105.0/105.0 kB\u001b[0m \u001b[31m25.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hInstalling collected packages: asn1crypto, SQLAlchemy, scramp, pg8000, cryptography, cloud-sql-python-connector\n", - " Attempting uninstall: SQLAlchemy\n", - " Found existing installation: SQLAlchemy 2.0.15\n", - " Uninstalling SQLAlchemy-2.0.15:\n", - " Successfully uninstalled SQLAlchemy-2.0.15\n", - " Attempting uninstall: cryptography\n", - " Found existing installation: cryptography 40.0.2\n", - " Uninstalling cryptography-40.0.2:\n", - " Successfully uninstalled cryptography-40.0.2\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "pyopenssl 23.1.1 requires cryptography<41,>=38.0.0, but you have cryptography 42.0.5 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed SQLAlchemy-2.0.7 asn1crypto-1.5.1 cloud-sql-python-connector-1.8.0 cryptography-42.0.5 pg8000-1.30.5 scramp-1.4.4\n" - ] - } - ], + "outputs": [], "source": [ "!pip install ray[default]==2.9.3 kaggle==1.6.6\n", "!pip install langchain ray==2.9.3 datasets sentence-transformers\n", @@ -336,22 +27,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "1e26faef-9e2e-4793-b8af-0e18470b482d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Downloading netflix-shows.zip to /home/jovyan/data\n", - " 0%| | 0.00/1.34M [00:00\n", - "
(HTML client-context table from the removed output: Python version 3.10.13, Ray version 2.9.3, Dashboard http://10.244.0.8:8265)
\n", - "\n" - ], - "text/plain": [ - "ClientContext(dashboard_url='10.244.0.8:8265', python_version='3.10.13', ray_version='2.9.3', ray_commit='62655e11ed76509b78654b60be67bc59f8f3460a', protocol_version='2023-06-27', _num_clients=1, _context_to_restore=)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "import ray\n", "\n", @@ -840,29 +217,10 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "id": "a392975f-3743-4b2c-8673-087b5633637e", "metadata": {}, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "/data/netflix-shows/netflix_titles.csv", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[18], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# Process the dataset first, wrap the csv file contents into a Ray dataset\u001b[39;00m\n\u001b[1;32m 2\u001b[0m SHARED_DATASET_BASE_PATH\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/data/netflix-shows/\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 3\u001b[0m ray_ds \u001b[38;5;241m=\u001b[39m \u001b[43mray\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread_csv\u001b[49m\u001b[43m(\u001b[49m\u001b[43mSHARED_DATASET_BASE_PATH\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mREVIEWS_FILE_NAME\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(ray_ds\u001b[38;5;241m.\u001b[39mschema)\n\u001b[1;32m 6\u001b[0m \u001b[38;5;66;03m# Distributed flat map to extract the raw text fields.\u001b[39;00m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/read_api.py:1270\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(paths, filesystem, parallelism, ray_remote_args, arrow_open_stream_args, meta_provider, partition_filter, partitioning, include_paths, ignore_missing_paths, shuffle, file_extensions, **arrow_csv_args)\u001b[0m\n\u001b[1;32m 1267\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m meta_provider \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1268\u001b[0m meta_provider \u001b[38;5;241m=\u001b[39m get_generic_metadata_provider(CSVDatasource\u001b[38;5;241m.\u001b[39m_FILE_EXTENSIONS)\n\u001b[0;32m-> 1270\u001b[0m datasource \u001b[38;5;241m=\u001b[39m \u001b[43mCSVDatasource\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1271\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1272\u001b[0m \u001b[43m \u001b[49m\u001b[43marrow_csv_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrow_csv_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1273\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1274\u001b[0m \u001b[43m \u001b[49m\u001b[43mopen_stream_args\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43marrow_open_stream_args\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1275\u001b[0m \u001b[43m \u001b[49m\u001b[43mmeta_provider\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmeta_provider\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1276\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mpartition_filter\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_filter\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1277\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1278\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1279\u001b[0m \u001b[43m \u001b[49m\u001b[43mshuffle\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mshuffle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1280\u001b[0m \u001b[43m \u001b[49m\u001b[43minclude_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minclude_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1281\u001b[0m \u001b[43m \u001b[49m\u001b[43mfile_extensions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfile_extensions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1282\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1283\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m read_datasource(\n\u001b[1;32m 1284\u001b[0m datasource,\n\u001b[1;32m 1285\u001b[0m parallelism\u001b[38;5;241m=\u001b[39mparallelism,\n\u001b[1;32m 1286\u001b[0m ray_remote_args\u001b[38;5;241m=\u001b[39mray_remote_args,\n\u001b[1;32m 1287\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/csv_datasource.py:25\u001b[0m, in \u001b[0;36mCSVDatasource.__init__\u001b[0;34m(self, paths, arrow_csv_args, **file_based_datasource_kwargs)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 19\u001b[0m paths: Union[\u001b[38;5;28mstr\u001b[39m, List[\u001b[38;5;28mstr\u001b[39m]],\n\u001b[1;32m 20\u001b[0m arrow_csv_args: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 21\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfile_based_datasource_kwargs,\n\u001b[1;32m 22\u001b[0m ):\n\u001b[1;32m 23\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyarrow\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m csv\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mfile_based_datasource_kwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m arrow_csv_args \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 28\u001b[0m arrow_csv_args \u001b[38;5;241m=\u001b[39m {}\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_based_datasource.py:168\u001b[0m, in \u001b[0;36mFileBasedDatasource.__init__\u001b[0;34m(self, paths, filesystem, schema, open_stream_args, meta_provider, partition_filter, partitioning, ignore_missing_paths, shuffle, include_paths, file_extensions)\u001b[0m\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_include_paths \u001b[38;5;241m=\u001b[39m include_paths\n\u001b[1;32m 165\u001b[0m paths, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_filesystem \u001b[38;5;241m=\u001b[39m 
_resolve_paths_and_filesystem(paths, filesystem)\n\u001b[1;32m 166\u001b[0m paths, file_sizes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mmap\u001b[39m(\n\u001b[1;32m 167\u001b[0m \u001b[38;5;28mlist\u001b[39m,\n\u001b[0;32m--> 168\u001b[0m \u001b[38;5;28;43mzip\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 169\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmeta_provider\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexpand_paths\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 170\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 171\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_filesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 172\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartitioning\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 174\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 175\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m,\n\u001b[1;32m 176\u001b[0m )\n\u001b[1;32m 178\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ignore_missing_paths \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(paths) \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 179\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 180\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of the provided paths exist. \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 181\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mThe \u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mignore_missing_paths\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m field is set to True.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 182\u001b[0m )\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:178\u001b[0m, in \u001b[0;36mDefaultFileMetadataProvider.expand_paths\u001b[0;34m(self, paths, filesystem, partitioning, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexpand_paths\u001b[39m(\n\u001b[1;32m 172\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 173\u001b[0m paths: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 176\u001b[0m ignore_missing_paths: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 177\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Tuple[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mint\u001b[39m]]:\n\u001b[0;32m--> 178\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _expand_paths(paths, filesystem, partitioning, ignore_missing_paths)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:418\u001b[0m, in \u001b[0;36m_expand_paths\u001b[0;34m(paths, filesystem, partitioning, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 405\u001b[0m \u001b[38;5;66;03m# We break down our processing paths into a few key cases:\u001b[39;00m\n\u001b[1;32m 406\u001b[0m \u001b[38;5;66;03m# 1. 
If len(paths) < threshold, fetch the file info for the individual files/paths\u001b[39;00m\n\u001b[1;32m 407\u001b[0m \u001b[38;5;66;03m# serially.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;66;03m# 3. If more than threshold requests required, parallelize them via Ray tasks.\u001b[39;00m\n\u001b[1;32m 412\u001b[0m \u001b[38;5;66;03m# 1. Small # of paths case.\u001b[39;00m\n\u001b[1;32m 413\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 414\u001b[0m \u001b[38;5;28mlen\u001b[39m(paths) \u001b[38;5;241m<\u001b[39m FILE_SIZE_FETCH_PARALLELIZATION_THRESHOLD\n\u001b[1;32m 415\u001b[0m \u001b[38;5;66;03m# Local file systems are very fast to hit.\u001b[39;00m\n\u001b[1;32m 416\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(filesystem, LocalFileSystem)\n\u001b[1;32m 417\u001b[0m ):\n\u001b[0;32m--> 418\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m _get_file_infos_serial(paths, filesystem, ignore_missing_paths)\n\u001b[1;32m 419\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 420\u001b[0m \u001b[38;5;66;03m# 2. Common path prefix case.\u001b[39;00m\n\u001b[1;32m 421\u001b[0m \u001b[38;5;66;03m# Get longest common path of all paths.\u001b[39;00m\n\u001b[1;32m 422\u001b[0m common_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mcommonpath(paths)\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:445\u001b[0m, in \u001b[0;36m_get_file_infos_serial\u001b[0;34m(paths, filesystem, ignore_missing_paths)\u001b[0m\n\u001b[1;32m 439\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_file_infos_serial\u001b[39m(\n\u001b[1;32m 440\u001b[0m paths: List[\u001b[38;5;28mstr\u001b[39m],\n\u001b[1;32m 441\u001b[0m filesystem: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpyarrow.fs.FileSystem\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 442\u001b[0m ignore_missing_paths: \u001b[38;5;28mbool\u001b[39m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m,\n\u001b[1;32m 443\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Iterator[Tuple[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mint\u001b[39m]]:\n\u001b[1;32m 444\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m path \u001b[38;5;129;01min\u001b[39;00m paths:\n\u001b[0;32m--> 445\u001b[0m \u001b[38;5;28;01myield from\u001b[39;00m \u001b[43m_get_file_infos\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mignore_missing_paths\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/lib/python3.10/site-packages/ray/data/datasource/file_meta_provider.py:568\u001b[0m, in \u001b[0;36m_get_file_infos\u001b[0;34m(path, filesystem, ignore_missing_path)\u001b[0m\n\u001b[1;32m 566\u001b[0m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[1;32m 567\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 568\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(path)\n\u001b[1;32m 570\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m file_infos\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: /data/netflix-shows/netflix_titles.csv" - ] - } - ], + "outputs": [], "source": [ "# Process the dataset first, wrap the csv file contents into a Ray dataset\n", "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n", @@ -971,284 +329,6 @@ "# cleanup connector object\n", 
"connector.close()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "050f2c66-b92e-4ca6-a3b7-b7448d066f8e", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a directory to package the contents that need to be downloaded in ray worker\n", - "! mkdir -p test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c82cdcad-c74c-4196-9aa0-2e6bb49f4b58", - "metadata": {}, - "outputs": [], - "source": [ - "%%writefile test/test.py\n", - "# Comment out the above line if you want to see notebook print out, but the line is required for the actual ray job (the test.py is downloaded by the ray workers)\n", - "\n", - "import os\n", - "import uuid\n", - "import ray\n", - "from langchain.document_loaders import ArxivLoader\n", - "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", - "from sentence_transformers import SentenceTransformer\n", - "from typing import List\n", - "import torch\n", - "from datasets import load_dataset_builder, load_dataset, Dataset\n", - "from huggingface_hub import snapshot_download\n", - "from google.cloud.sql.connector import Connector, IPTypes\n", - "import sqlalchemy\n", - "\n", - "# initialize parameters\n", - "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", - "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", - "DB_NAME = \"pgvector-database\"\n", - "\n", - "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", - "DB_USER = db_username_file.read()\n", - "db_username_file.close()\n", - "\n", - "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", - "DB_PASS = db_password_file.read()\n", - "db_password_file.close()\n", - "\n", - "# initialize Connector object\n", - "connector = Connector()\n", - "\n", - "# function to return the database connection object\n", - "def getconn():\n", - " conn = connector.connect(\n", - " INSTANCE_CONNECTION_NAME,\n", - " \"pg8000\",\n", - " user=DB_USER,\n", - " password=DB_PASS,\n", - " db=DB_NAME,\n", - " ip_type=IPTypes.PRIVATE\n", - " )\n", - " return conn\n", - "\n", - "# create connection pool with 'creator' argument to our connection object function\n", - "pool = sqlalchemy.create_engine(\n", - " \"postgresql+pg8000://\",\n", - " creator=getconn,\n", - ")\n", - "\n", - "SHARED_DATA_BASEPATH='/data/rag/st'\n", - "SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' # Transformer to use for converting text chunks to vector embeddings\n", - "SENTENCE_TRANSFORMER_MODEL_PATH_NAME='models--intfloat--multilingual-e5-small' # the downloaded model path takes this form for a given model name\n", - "SENTENCE_TRANSFORMER_MODEL_SNAPSHOT=\"ffdcc22a9a5c973ef0470385cef91e1ecb461d9f\" # specific snapshot of the model to use\n", - "SENTENCE_TRANSFORMER_MODEL_PATH = SHARED_DATA_BASEPATH + '/' + SENTENCE_TRANSFORMER_MODEL_PATH_NAME + '/snapshots/' + SENTENCE_TRANSFORMER_MODEL_SNAPSHOT # the path where the model is downloaded one time\n", - "\n", - "# the dataset has been pre-dowloaded to the GCS bucket as part of the notebook in the cell above. 
Ray workers will find the dataset readily mounted.\n", - "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n", - "REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n", - "\n", - "BATCH_SIZE = 100\n", - "CHUNK_SIZE = 1000 # text chunk sizes which will be converted to vector embeddings\n", - "CHUNK_OVERLAP = 10\n", - "TABLE_NAME = 'netflix_reviews_db' # CloudSQL table name\n", - "DIMENSION = 384 # Embeddings size\n", - "ACTOR_POOL_SIZE = 1 # number of actors for the distributed map_batches function\n", - "\n", - "class Embed:\n", - " def __init__(self):\n", - " print(\"torch cuda version\", torch.version.cuda)\n", - " device=\"cpu\"\n", - " if torch.cuda.is_available():\n", - " print(\"device cuda found\")\n", - " device=\"cuda\"\n", - "\n", - " print (\"reading sentence transformer model from cache path:\", SENTENCE_TRANSFORMER_MODEL_PATH)\n", - " self.transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL_PATH, device=device)\n", - " self.splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len)\n", - "\n", - " def __call__(self, text_batch: List[str]):\n", - " text = text_batch[\"item\"]\n", - " # print(\"type(text)=\", type(text), \"type(text_batch)=\", type(text_batch))\n", - " chunks = []\n", - " for data in text:\n", - " splits = self.splitter.split_text(data)\n", - " # print(\"len(data)\", len(data), \"len(splits)=\", len(splits))\n", - " chunks.extend(splits)\n", - "\n", - " embeddings = self.transformer.encode(\n", - " chunks,\n", - " batch_size=BATCH_SIZE\n", - " ).tolist()\n", - " print(\"len(chunks)=\", len(chunks), \", len(emb)=\", len(embeddings))\n", - " return {'results':list(zip(chunks, embeddings))}\n", - "\n", - "\n", - "# prepare the persistent shared directory to store artifacts needed for the ray workers\n", - "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n", - "\n", - "# One time download of the sentence transformer model to a shared persistent storage available to the ray workers\n", - "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)\n", - "\n", - "# Process the dataset first, wrap the csv file contents into a Ray dataset\n", - "ray_ds = ray.data.read_csv(SHARED_DATASET_BASE_PATH + REVIEWS_FILE_NAME)\n", - "print(ray_ds.schema)\n", - "\n", - "# Distributed flat map to extract the raw text fields.\n", - "ds_batch = ray_ds.flat_map(lambda row: [{\n", - " 'item': \"This is a \" + str(row[\"type\"]) + \" in \" + str(row[\"country\"]) + \" called \" + str(row[\"title\"]) + \n", - " \" added at \" + str(row[\"date_added\"]) + \" whose director is \" + str(row[\"director\"]) + \n", - " \" and with cast: \" + str(row[\"cast\"]) + \" released at \" + str(row[\"release_year\"]) + \n", - " \". Its rating is: \" + str(row['rating']) + \". Its duration is \" + str(row[\"duration\"]) + \n", - " \". 
Its description is \" + str(row['description']) + \".\"\n", - "}])\n", - "print(ds_batch.schema)\n", - "\n", - "# Distributed map batches to create chunks out of each row, and fetch the vector embeddings by running inference on the sentence transformer\n", - "ds_embed = ds_batch.map_batches(\n", - " Embed,\n", - " compute=ray.data.ActorPoolStrategy(size=ACTOR_POOL_SIZE),\n", - " batch_size=BATCH_SIZE, # Large batch size to maximize GPU utilization.\n", - " num_gpus=1, # 1 GPU for each actor.\n", - " # num_cpus=1,\n", - ")\n", - "\n", - "# Use this block for debug purpose to inspect the embeddings and raw text\n", - "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", - "# for output in ds_embed.iter_rows():\n", - "# # restrict the text string to be less than 65535\n", - "# data_text = output[\"results\"][0][:65535]\n", - "# # vector data pass in needs to be a string \n", - "# data_emb = \",\".join(map(str, output[\"results\"][1]))\n", - "# data_emb = \"[\" + data_emb + \"]\"\n", - "# print (\"raw text:\", data_text, \", emdeddings:\", data_emb)\n", - "\n", - "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", - "\n", - "data_text = \"\"\n", - "data_emb = \"\"\n", - "\n", - "with pool.connect() as db_conn:\n", - " db_conn.execute(\n", - " sqlalchemy.text(\n", - " \"CREATE EXTENSION IF NOT EXISTS vector;\"\n", - " )\n", - " )\n", - " db_conn.commit()\n", - "\n", - " create_table_query = \"CREATE TABLE IF NOT EXISTS \" + TABLE_NAME + \" ( id VARCHAR(255) NOT NULL, text TEXT NOT NULL, text_embedding vector(384) NOT NULL, PRIMARY KEY (id));\"\n", - " db_conn.execute(\n", - " sqlalchemy.text(create_table_query)\n", - " )\n", - " # commit transaction (SQLAlchemy v2.X.X is commit as you go)\n", - " db_conn.commit()\n", - " print(\"Created table=\", TABLE_NAME)\n", - " \n", - " query_text = \"INSERT INTO \" + TABLE_NAME + \" (id, text, text_embedding) VALUES (:id, :text, :text_embedding)\"\n", - " insert_stmt = sqlalchemy.text(query_text)\n", - " for output in ds_embed.iter_rows():\n", - " # print (\"type of embeddings\", type(output[\"results\"][1]), \"len embeddings\", len(output[\"results\"][1]))\n", - " # restrict the text string to be less than 65535\n", - " data_text = output[\"results\"][0][:65535]\n", - " # vector data pass in needs to be a string \n", - " data_emb = \",\".join(map(str, output[\"results\"][1]))\n", - " data_emb = \"[\" + data_emb + \"]\"\n", - " # print(\"text_embedding is \", data_emb)\n", - " id = uuid.uuid4()\n", - " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": data_text, \"text_embedding\": data_emb})\n", - "\n", - " # batch commit transactions\n", - " db_conn.commit()\n", - "\n", - " # query and fetch table\n", - " query_text = \"SELECT * FROM \" + TABLE_NAME\n", - " results = db_conn.execute(sqlalchemy.text(query_text)).fetchall()\n", - " # for row in results:\n", - " # print(row)\n", - "\n", - " # verify results\n", - " transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)\n", - " query_text = \"During my holiday in Marmaris we ate here to fit the food. 
It's really good\" \n", - " query_emb = transformer.encode(query_text).tolist()\n", - " query_request = \"SELECT id, text, text_embedding, 1 - ('[\" + \",\".join(map(str, query_emb)) + \"]' <=> text_embedding) AS cosine_similarity FROM \" + TABLE_NAME + \" ORDER BY cosine_similarity DESC LIMIT 5;\" \n", - " query_results = db_conn.execute(sqlalchemy.text(query_request)).fetchall()\n", - " db_conn.commit()\n", - " print(\"print query_results, the 1st one is the hit\")\n", - " for row in query_results:\n", - " print(row)\n", - "\n", - "# cleanup connector object\n", - "connector.close()\n", - "print (\"end job\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aeeb7b7a-23d8-4c6a-8165-7ce5516d2a41", - "metadata": {}, - "outputs": [], - "source": [ - "import ray, time\n", - "from ray.job_submission import JobSubmissionClient\n", - "client = JobSubmissionClient(\"ray://ray-cluster-kuberay-head-svc:10001\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ba6c3ff-a25a-4f4d-b58e-68f7fe7d33df", - "metadata": {}, - "outputs": [], - "source": [ - "# Port forward to the Ray dashboard and go to `localhost:8265` in a browser to see job status: kubectl port-forward -n service/ray-cluster-kuberay-head-svc 8265:8265\n", - "import time\n", - "\n", - "start_time = time.time()\n", - "job_id = client.submit_job(\n", - " entrypoint=\"python test.py\",\n", - " # Path to the local directory that contains the entrypoint file.\n", - " runtime_env={\n", - " \"working_dir\": \"/home/jovyan/test\", # upload the local working directory to ray workers\n", - " \"pip\": [\n", - " \"langchain==0.1.9\",\n", - " \"transformers==4.38.1\",\n", - " \"sentence-transformers==2.5.1\",\n", - " \"pyarrow\",\n", - " \"datasets==2.18.0\",\n", - " \"torch==2.0.1\",\n", - " \"cloud-sql-python-connector[pg8000]==1.7.0\",\n", - " \"SQLAlchemy==2.0.7\",\n", - " \"huggingface_hub==0.21.3\",\n", - " ],\n", - " }\n", - ")\n", - "\n", - "print(\"Job submitted with ID:\", job_id)\n", - "prev_status = \"\"\n", - "while True:\n", - " status = client.get_job_status(job_id)\n", - " if status != prev_status:\n", - " print(\"Job status:\", status)\n", - " print(\"Job info:\", client.get_job_info(job_id).message)\n", - " prev_status = status\n", - " if status.is_terminal():\n", - " break\n", - " time.sleep(1)\n", - "end_time = time.time()\n", - "job_duration = end_time - start_time\n", - "print(f\"Job completed in {job_duration} seconds.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98ec6c2d-3295-4f67-9fa0-af6d5708955a", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From de514b22e9e4a5ce68b7656e96a4154e0042f27b Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Thu, 28 Mar 2024 19:03:30 +0000 Subject: [PATCH 04/20] fix notebook, add markdown --- .../rag-kaggle-ray-sql-refactored.ipynb | 215 +++++++++++++----- 1 file changed, 154 insertions(+), 61 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb index c571ae1bb..f79a41aa2 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb @@ -1,5 +1,20 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "195d442b-08a9-4469-824c-80188c622c46", + "metadata": {}, + "source": [ + "# RAG-on-GKE Application\n", + "\n", + "This is a Python notebook for the RAG on GKE application. 
For full information, please checkout the GitHub documentation [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md).\n", + "\n", + "\n", + "## Setup Kaggle Credentials\n", + "\n", + "First we will setup your Kaggle credentials. Replace the following with your own settings from the Kaggle web page. Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one." + ] + }, { "cell_type": "code", "execution_count": null, @@ -7,10 +22,22 @@ "metadata": {}, "outputs": [], "source": [ - "# Replace these with your settings\n", - "# Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one.\n", - "KAGGLE_USERNAME = \"ricliu\"\n", - "KAGGLE_KEY = \"8145116fb63d4f2be10cab2c1b3ca238\"\n" + "KAGGLE_USERNAME = \"\"\n", + "KAGGLE_KEY = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "44dca61a-4544-4bbc-b78c-9e0805b96192", + "metadata": {}, + "source": [ + "## Installing Prerequisites\n", + "\n", + "To run this notebook, we will install the following tools and binaries:\n", + "* Ray\n", + "* Kaggle\n", + "* Langchain\n", + "* CloudSQL" ] }, { @@ -21,10 +48,18 @@ "outputs": [], "source": [ "!pip install ray[default]==2.9.3 kaggle==1.6.6\n", - "!pip install langchain ray==2.9.3 datasets sentence-transformers\n", + "!pip install langchain==0.1.10 ray==2.9.3 datasets sentence-transformers\n", "!pip install cloud-sql-python-connector[pg8000] SQLAlchemy==2.0.7" ] }, + { + "cell_type": "markdown", + "id": "a81ab34e-a0ad-4340-8d04-45e9ce4c7416", + "metadata": {}, + "source": [ + "Now we will use the Kaggle CLI to download our data to the mounted GCS bucket:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -38,14 +73,24 @@ "\n", "# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. 
The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n", "!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n", - "!mkdir /persist-data/netflix-shows -p\n", - "!unzip -o ~/data/netflix-shows.zip -d /persist-data/netflix-shows" + "!mkdir /data/netflix-shows -p\n", + "!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows" + ] + }, + { + "cell_type": "markdown", + "id": "c7ff518d-f4d2-481b-b408-2c2507565611", + "metadata": {}, + "source": [ + "## Creating the Database Connection\n", + "\n", + "Let's now set up a connection to your CloudSQL database:" ] }, { "cell_type": "code", "execution_count": null, - "id": "6a37c4eb-c080-4e0d-ad72-f9e5263f651b", + "id": "aff789e7-a32d-4dd7-afb8-d3a22c8f3cec", "metadata": {}, "outputs": [], "source": [ @@ -60,17 +105,8 @@ "from datasets import load_dataset_builder, load_dataset, Dataset\n", "from huggingface_hub import snapshot_download\n", "from google.cloud.sql.connector import Connector, IPTypes\n", - "import sqlalchemy" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "aff789e7-a32d-4dd7-afb8-d3a22c8f3cec", - "metadata": {}, - "outputs": [], - "source": [ - "# initialize parameters\n", + "import sqlalchemy# initialize parameters\n", + "\n", "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", "DB_NAME = \"pgvector-database\"\n", @@ -84,16 +120,8 @@ "db_password_file.close()\n", "\n", "# initialize Connector object\n", - "connector = Connector()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18bf2a37-04ef-46cf-bb0a-d639cc45246c", - "metadata": {}, - "outputs": [], - "source": [ + "connector = Connector()\n", + "\n", "# function to return the database connection object\n", "def getconn():\n", " conn = connector.connect(\n", @@ -113,6 +141,14 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "2156a6bd-1100-46c2-8ad6-22a923b3d6ac", + "metadata": {}, + "source": [ + "Next we'll setup some parameters for the dataset processing steps:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -120,14 +156,14 @@ "metadata": {}, "outputs": [], "source": [ - "SHARED_DATA_BASEPATH='/persist-data/rag/st'\n", + "SHARED_DATA_BASEPATH='/data/rag/st'\n", "SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' # Transformer to use for converting text chunks to vector embeddings\n", "SENTENCE_TRANSFORMER_MODEL_PATH_NAME='models--intfloat--multilingual-e5-small' # the downloaded model path takes this form for a given model name\n", "SENTENCE_TRANSFORMER_MODEL_SNAPSHOT=\"ffdcc22a9a5c973ef0470385cef91e1ecb461d9f\" # specific snapshot of the model to use\n", "SENTENCE_TRANSFORMER_MODEL_PATH = SHARED_DATA_BASEPATH + '/' + SENTENCE_TRANSFORMER_MODEL_PATH_NAME + '/snapshots/' + SENTENCE_TRANSFORMER_MODEL_SNAPSHOT # the path where the model is downloaded one time\n", "\n", "# the dataset has been pre-dowloaded to the GCS bucket as part of the notebook in the cell above. 
Ray workers will find the dataset readily mounted.\n", - "SHARED_DATASET_BASE_PATH=\"/persist-data/netflix-shows/\"\n", + "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n", "REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n", "\n", "BATCH_SIZE = 100\n", @@ -138,6 +174,38 @@ "ACTOR_POOL_SIZE = 1 # number of actors for the distributed map_batches function" ] }, + { + "cell_type": "markdown", + "id": "3dc5bc85-dc3b-4622-99a2-f9fc269e753b", + "metadata": {}, + "source": [ + "Now we will download the sentence transformer model to our GCS bucket:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7a676be-56c6-4c76-8041-9ad05361dd3b", + "metadata": {}, + "outputs": [], + "source": [ + "# prepare the persistent shared directory to store artifacts needed for the ray workers\n", + "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n", + "\n", + "# One time download of the sentence transformer model to a shared persistent storage available to the ray workers\n", + "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)" + ] + }, + { + "cell_type": "markdown", + "id": "f7304035-21a4-4017-bce9-aba7e9f81c90", + "metadata": {}, + "source": [ + "## Generating Vector Embeddings\n", + "\n", + "We are ready to begin. Let's first create some code for generating the vector embeddings:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -175,17 +243,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "id": "b7a676be-56c6-4c76-8041-9ad05361dd3b", + "cell_type": "markdown", + "id": "7263b9db-9504-4177-acd6-5e1aba2ee332", "metadata": {}, - "outputs": [], "source": [ - "# prepare the persistent shared directory to store artifacts needed for the ray workers\n", - "os.makedirs(SHARED_DATA_BASEPATH, exist_ok=True)\n", - "\n", - "# One time download of the sentence transformer model to a shared persistent storage available to the ray workers\n", - "snapshot_download(repo_id=SENTENCE_TRANSFORMER_MODEL, revision=SENTENCE_TRANSFORMER_MODEL_SNAPSHOT, cache_dir=SHARED_DATA_BASEPATH)\n" + "Next we will initialize a Ray cluster to execute the remote task:" ] }, { @@ -201,7 +263,7 @@ " address=\"ray://ray-cluster-kuberay-head-svc:10001\",\n", " runtime_env={\n", " \"pip\": [ \n", - " \"langchain==0.1.9\",\n", + " \"langchain==0.1.10\",\n", " \"transformers==4.38.1\",\n", " \"sentence-transformers==2.5.1\",\n", " \"pyarrow\",\n", @@ -215,6 +277,14 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "9589048c-a0aa-4740-acde-5289cd4076f7", + "metadata": {}, + "source": [ + "Generate vector embeddings using our Embed class above:" + ] + }, { "cell_type": "code", "execution_count": null, @@ -223,7 +293,6 @@ "outputs": [], "source": [ "# Process the dataset first, wrap the csv file contents into a Ray dataset\n", - "SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n", "ray_ds = ray.data.read_csv(SHARED_DATASET_BASE_PATH + REVIEWS_FILE_NAME)\n", "print(ray_ds.schema)\n", "\n", @@ -247,24 +316,45 @@ ")" ] }, + { + "cell_type": "markdown", + "id": "4697ac28-9815-409c-95ec-6ecdb862bb74", + "metadata": {}, + "source": [ + "Retrieve the result data from Ray remote workers:" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "caeec9e4-bb01-4355-86ef-065bb443d780", + "id": "e0edbba2-8977-4afd-aaa2-2e6e3a298169", "metadata": {}, "outputs": [], "source": [ - "# Use this block for debug purpose to inspect the embeddings and raw text\n", - "# print(\"Embeddings ray dataset\", ds_embed.schema)\n", - "# for output in 
ds_embed.iter_rows():\n", - "# # restrict the text string to be less than 65535\n", - "# data_text = output[\"results\"][0][:65535]\n", - "# # vector data pass in needs to be a string \n", - "# data_emb = \",\".join(map(str, output[\"results\"][1]))\n", - "# data_emb = \"[\" + data_emb + \"]\"\n", - "# print (\"raw text:\", data_text, \", emdeddings:\", data_emb)\n", + "@ray.remote\n", + "def ray_data_task(ds_embed):\n", + " results = []\n", + " for row in ds_embed.iter_rows():\n", + " data_text = row[\"results\"][0][:65535]\n", + " # vector data pass in needs to be a string \n", + " data_emb = \",\".join(map(str, row[\"results\"][1]))\n", + " data_emb = \"[\" + data_emb + \"]\"\n", + " \n", + " results.append((data_text, data_emb))\n", + " \n", + " return results\n", + " \n", + "results = ray.get(ray_data_task.remote(ds_embed))" + ] + }, + { + "cell_type": "markdown", + "id": "16f50334-0924-4dab-a356-fb86098bda82", + "metadata": {}, + "source": [ + "## Writing Results Back to MySQL\n", "\n", - "# print(\"Embeddings ray dataset\", ds_embed.schema)" + "Now that we have our vector embeddings, we can write our results back to the MySQL database:" ] }, { @@ -295,16 +385,10 @@ " \n", " query_text = \"INSERT INTO \" + TABLE_NAME + \" (id, text, text_embedding) VALUES (:id, :text, :text_embedding)\"\n", " insert_stmt = sqlalchemy.text(query_text)\n", - " for output in ds_embed.iter_rows():\n", - " # print (\"type of embeddings\", type(output[\"results\"][1]), \"len embeddings\", len(output[\"results\"][1]))\n", - " # restrict the text string to be less than 65535\n", - " data_text = output[\"results\"][0][:65535]\n", - " # vector data pass in needs to be a string \n", - " data_emb = \",\".join(map(str, output[\"results\"][1]))\n", - " data_emb = \"[\" + data_emb + \"]\"\n", + " for r in results:\n", " # print(\"text_embedding is \", data_emb)\n", " id = uuid.uuid4() \n", - " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": data_text, \"text_embedding\": data_emb})\n", + " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": r[0], \"text_embedding\": r[1]})\n", "\n", " # batch commit transactions\n", " db_conn.commit()\n", @@ -322,6 +406,7 @@ " query_request = \"SELECT id, text, text_embedding, 1 - ('[\" + \",\".join(map(str, query_emb)) + \"]' <=> text_embedding) AS cosine_similarity FROM \" + TABLE_NAME + \" ORDER BY cosine_similarity DESC LIMIT 5;\" \n", " query_results = db_conn.execute(sqlalchemy.text(query_request)).fetchall()\n", " db_conn.commit()\n", + " \n", " print(\"print query_results, the 1st one is the hit\")\n", " for row in query_results:\n", " print(row)\n", @@ -329,6 +414,14 @@ "# cleanup connector object\n", "connector.close()" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b256dcc0-fd79-49dc-907f-f67f93bb0a02", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 14da177e831464eef0f36c9d1a3902604947658b Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 00:40:35 +0000 Subject: [PATCH 05/20] use bulk insert --- .../rag-kaggle-ray-sql-refactored.ipynb | 95 ++++++++++--------- 1 file changed, 48 insertions(+), 47 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb index f79a41aa2..82a587c88 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb @@ -49,7 +49,7 @@ "source": [ 
"!pip install ray[default]==2.9.3 kaggle==1.6.6\n", "!pip install langchain==0.1.10 ray==2.9.3 datasets sentence-transformers\n", - "!pip install cloud-sql-python-connector[pg8000] SQLAlchemy==2.0.7" + "!pip install cloud-sql-python-connector[pg8000] SQLAlchemy==2.0.7 pgvector" ] }, { @@ -269,8 +269,6 @@ " \"pyarrow\",\n", " \"datasets==2.18.0\",\n", " \"torch==2.0.1\",\n", - " \"cloud-sql-python-connector[pg8000]==1.7.0\",\n", - " \"SQLAlchemy==2.0.7\",\n", " \"huggingface_hub==0.21.3\",\n", " ]\n", " }\n", @@ -336,10 +334,8 @@ " results = []\n", " for row in ds_embed.iter_rows():\n", " data_text = row[\"results\"][0][:65535]\n", - " # vector data pass in needs to be a string \n", - " data_emb = \",\".join(map(str, row[\"results\"][1]))\n", - " data_emb = \"[\" + data_emb + \"]\"\n", - " \n", + " data_emb = row[\"results\"][1]\n", + "\n", " results.append((data_text, data_emb))\n", " \n", " return results\n", @@ -349,7 +345,7 @@ }, { "cell_type": "markdown", - "id": "16f50334-0924-4dab-a356-fb86098bda82", + "id": "5652832e-025d-4745-9fef-96615eea07e4", "metadata": {}, "source": [ "## Writing Results Back to MySQL\n", @@ -360,45 +356,58 @@ { "cell_type": "code", "execution_count": null, - "id": "4cff4bbc-574d-4cc2-8c87-d0ff6d351626", + "id": "07eb5ec7-c4f7-4312-b0ce-ea07160bef92", "metadata": {}, "outputs": [], "source": [ - "data_text = \"\"\n", - "data_emb = \"\"\n", + "from sqlalchemy.ext.declarative import declarative_base\n", + "from sqlalchemy import Column, String, Text, text\n", + "from sqlalchemy.orm import scoped_session, sessionmaker, mapped_column\n", + "from pgvector.sqlalchemy import Vector\n", "\n", - "with pool.connect() as db_conn:\n", - " db_conn.execute(\n", - " sqlalchemy.text(\n", - " \"CREATE EXTENSION IF NOT EXISTS vector;\"\n", - " )\n", - " )\n", - " db_conn.commit()\n", "\n", - " create_table_query = \"CREATE TABLE IF NOT EXISTS \" + TABLE_NAME + \" ( id VARCHAR(255) NOT NULL, text TEXT NOT NULL, text_embedding vector(384) NOT NULL, PRIMARY KEY (id));\"\n", - " db_conn.execute(\n", - " sqlalchemy.text(create_table_query)\n", - " )\n", - " # commit transaction (SQLAlchemy v2.X.X is commit as you go)\n", - " db_conn.commit()\n", - " print(\"Created table=\", TABLE_NAME)\n", - " \n", - " query_text = \"INSERT INTO \" + TABLE_NAME + \" (id, text, text_embedding) VALUES (:id, :text, :text_embedding)\"\n", - " insert_stmt = sqlalchemy.text(query_text)\n", - " for r in results:\n", - " # print(\"text_embedding is \", data_emb)\n", - " id = uuid.uuid4() \n", - " db_conn.execute(insert_stmt, parameters={\"id\": id, \"text\": r[0], \"text_embedding\": r[1]})\n", + "Base = declarative_base()\n", + "DBSession = scoped_session(sessionmaker())\n", "\n", - " # batch commit transactions\n", - " db_conn.commit()\n", + "class TextEmbedding(Base):\n", + " __tablename__ = TABLE_NAME\n", + " id = Column(String(255), primary_key=True)\n", + " text = Column(Text)\n", + " text_embedding = mapped_column(Vector(384))\n", "\n", - " # query and fetch table\n", - " query_text = \"SELECT * FROM \" + TABLE_NAME\n", - " results = db_conn.execute(sqlalchemy.text(query_text)).fetchall()\n", - " # for row in results:\n", - " # print(row)\n", + "with pool.connect() as conn:\n", + " conn.execute(text(\"CREATE EXTENSION IF NOT EXISTS vector\"))\n", + " conn.commit() \n", + " \n", + "DBSession.configure(bind=pool, autoflush=False, expire_on_commit=False)\n", + "Base.metadata.drop_all(pool)\n", + "Base.metadata.create_all(pool)\n", + "\n", + "rows = []\n", + "for r in results:\n", + " id = uuid.uuid4() 
\n", + " rows.append(TextEmbedding(id=id, text=r[0], text_embedding=r[1]))\n", "\n", + "DBSession.bulk_save_objects(rows)\n", + "DBSession.commit()" + ] + }, + { + "cell_type": "markdown", + "id": "b4b19b1c-a83b-4a83-94a9-5edf5ae7016a", + "metadata": {}, + "source": [ + "Finally let's verify that our embeddings got stored in the database correctly:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cff4bbc-574d-4cc2-8c87-d0ff6d351626", + "metadata": {}, + "outputs": [], + "source": [ + "with pool.connect() as db_conn:\n", " # verify results\n", " transformer = SentenceTransformer(SENTENCE_TRANSFORMER_MODEL)\n", " query_text = \"During my holiday in Marmaris we ate here to fit the food. It's really good\" \n", @@ -414,14 +423,6 @@ "# cleanup connector object\n", "connector.close()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b256dcc0-fd79-49dc-907f-f67f93bb0a02", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { From de1bbd09e5adaa8928e0937c6411478f4c529725 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 00:47:32 +0000 Subject: [PATCH 06/20] revert --- applications/rag/workloads.tfvars | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/applications/rag/workloads.tfvars b/applications/rag/workloads.tfvars index a060fcd8b..d4c12c620 100644 --- a/applications/rag/workloads.tfvars +++ b/applications/rag/workloads.tfvars @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -project_id = "ricliu-gke-dev" +project_id = "" ## this is required for terraform to connect to GKE master and deploy workloads create_cluster = true # Create a GKE cluster in the specified network. -autopilot_cluster = false -cluster_name = "raggedy-rag" -cluster_location = "us-east4" +autopilot_cluster = true +cluster_name = "" +cluster_location = "us-central1" create_network = true -network_name = "raga-network" +network_name = "ml-network" subnetwork_cidr = "10.100.0.0/16" ## GKE environment variables @@ -29,10 +29,10 @@ create_gcs_bucket = true # The bucket name must be globally unique (across all of Google Cloud). # To verify, check that `gcloud storage buckets describe gs://` returns a 404. -gcs_bucket = "rag-data-ricliu-b" +gcs_bucket = "rag-data-" cloudsql_instance = "pgvector-instance" -cloudsql_instance_region = "us-east4" # defaults to cluster_location, if not specified +cloudsql_instance_region = "us-central1" # defaults to cluster_location, if not specified ## Service accounts @@ -70,7 +70,7 @@ jupyter_k8s_backend_service_port = 80 jupyter_domain = "" ## Provide domain for ingress resource and ssl certificate. If it's empty, it will use nip.io wildcard dns jupyter_client_id = "" jupyter_client_secret = "" -jupyter_members_allowlist = "user:ricliu@google.com,group:,serviceAccount:,domain:google.com" +jupyter_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" ## Frontend IAP Settings frontend_add_auth = false # Set to true when using auth with IAP @@ -97,4 +97,4 @@ ray_dashboard_k8s_backend_service_port = 8265 ray_dashboard_domain = "" ## Provide domain for ingress resource and ssl certificate. 
If it's empty, it will use nip.io wildcard dns ray_dashboard_client_id = "" ray_dashboard_client_secret = "" -ray_dashboard_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" +ray_dashboard_members_allowlist = "user:,group:,serviceAccount:,domain:google.com" \ No newline at end of file From 10ce80c892029f5d6eaed9e287daee52adf1f1e2 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 00:59:03 +0000 Subject: [PATCH 07/20] change persist data --- modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml | 2 +- modules/jupyter/jupyter_config/config-selfauth.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml index 293893162..c800a7dfe 100644 --- a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml @@ -119,7 +119,7 @@ singleuser: extraVolumeMounts: - name: test-vol - mountPath: /persist-data + mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: true diff --git a/modules/jupyter/jupyter_config/config-selfauth.yaml b/modules/jupyter/jupyter_config/config-selfauth.yaml index f6b03aa8c..ddd07b5d7 100644 --- a/modules/jupyter/jupyter_config/config-selfauth.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth.yaml @@ -116,7 +116,7 @@ singleuser: optional: true extraVolumeMounts: - name: test-vol - mountPath: /persist-data + mountPath: /data - name: secret-volume mountPath: /etc/secret-volume readOnly: true From 2e1445d4458320682cee22c10b40c886291bd4e7 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 01:05:36 +0000 Subject: [PATCH 08/20] terraform fmt --- modules/jupyter/main.tf | 56 ++++++++++++++++++++--------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/modules/jupyter/main.tf b/modules/jupyter/main.tf index b06d132a9..8170dd79d 100644 --- a/modules/jupyter/main.tf +++ b/modules/jupyter/main.tf @@ -110,41 +110,41 @@ resource "helm_release" "jupyterhub" { timeout = 600 values = var.autopilot_cluster ? [templatefile("${path.module}/jupyter_config/config-selfauth-autopilot.yaml", { - password = var.add_auth ? "dummy" : random_password.generated_password[0].result - project_id = var.project_id - project_number = data.google_project.project.number - namespace = var.namespace - additional_labels = local.additional_labels - backend_config = var.k8s_backend_config_name - service_name = var.k8s_backend_service_name - authenticator_class = var.add_auth ? "'gcpiapjwtauthenticator.GCPIAPAuthenticator'" : "dummy" - service_type = var.add_auth ? "NodePort" : "ClusterIP" - gcs_bucket = var.gcs_bucket - k8s_service_account = var.workload_identity_service_account - ephemeral_storage = var.ephemeral_storage + password = var.add_auth ? "dummy" : random_password.generated_password[0].result + project_id = var.project_id + project_number = data.google_project.project.number + namespace = var.namespace + additional_labels = local.additional_labels + backend_config = var.k8s_backend_config_name + service_name = var.k8s_backend_service_name + authenticator_class = var.add_auth ? "'gcpiapjwtauthenticator.GCPIAPAuthenticator'" : "dummy" + service_type = var.add_auth ? 
"NodePort" : "ClusterIP" + gcs_bucket = var.gcs_bucket + k8s_service_account = var.workload_identity_service_account + ephemeral_storage = var.ephemeral_storage secret_name = var.db_secret_name cloudsql_instance_connection_name = local.cloudsql_instance_connection_name - notebook_image = "jupyter/tensorflow-notebook" - notebook_image_tag = "python-3.10" + notebook_image = "jupyter/tensorflow-notebook" + notebook_image_tag = "python-3.10" }) ] : [templatefile("${path.module}/jupyter_config/config-selfauth.yaml", { - password = var.add_auth ? "dummy" : random_password.generated_password[0].result - project_id = var.project_id - project_number = data.google_project.project.number - namespace = var.namespace - additional_labels = local.additional_labels - backend_config = var.k8s_backend_config_name - service_name = var.k8s_backend_service_name - authenticator_class = var.add_auth ? "'gcpiapjwtauthenticator.GCPIAPAuthenticator'" : "dummy" - service_type = var.add_auth ? "NodePort" : "ClusterIP" - gcs_bucket = var.gcs_bucket - k8s_service_account = var.workload_identity_service_account - ephemeral_storage = var.ephemeral_storage + password = var.add_auth ? "dummy" : random_password.generated_password[0].result + project_id = var.project_id + project_number = data.google_project.project.number + namespace = var.namespace + additional_labels = local.additional_labels + backend_config = var.k8s_backend_config_name + service_name = var.k8s_backend_service_name + authenticator_class = var.add_auth ? "'gcpiapjwtauthenticator.GCPIAPAuthenticator'" : "dummy" + service_type = var.add_auth ? "NodePort" : "ClusterIP" + gcs_bucket = var.gcs_bucket + k8s_service_account = var.workload_identity_service_account + ephemeral_storage = var.ephemeral_storage secret_name = var.db_secret_name cloudsql_instance_connection_name = local.cloudsql_instance_connection_name - notebook_image = "jupyter/tensorflow-notebook" - notebook_image_tag = "python-3.10" + notebook_image = "jupyter/tensorflow-notebook" + notebook_image_tag = "python-3.10" }) ] depends_on = [module.jupyterhub-workload-identity] From c75e246a2f535adbc837f27ab87f882804f8eb97 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 20:40:04 +0000 Subject: [PATCH 09/20] remove sql params from notebook --- modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml | 4 ++-- modules/jupyter/jupyter_config/config-selfauth.yaml | 2 +- modules/jupyter/main.tf | 2 +- modules/jupyter/variables.tf | 3 --- 4 files changed, 4 insertions(+), 7 deletions(-) diff --git a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml index 8e76ae5bd..c9910c17d 100644 --- a/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth-autopilot.yaml @@ -76,8 +76,8 @@ singleuser: limit: 16G guarantee: 16G cpu: - limit: 4 - guarantee: 4 + limit: 8 + guarantee: 8 extraResource: limits: ephemeral-storage: ${ephemeral_storage} diff --git a/modules/jupyter/jupyter_config/config-selfauth.yaml b/modules/jupyter/jupyter_config/config-selfauth.yaml index 24142e74d..4fdb055dc 100644 --- a/modules/jupyter/jupyter_config/config-selfauth.yaml +++ b/modules/jupyter/jupyter_config/config-selfauth.yaml @@ -75,7 +75,7 @@ singleuser: guarantee: 8G cpu: limit: 8 - guarantee: 4 + guarantee: 8 extraResource: limits: ephemeral-storage: ${ephemeral_storage} diff --git a/modules/jupyter/main.tf b/modules/jupyter/main.tf index 8170dd79d..145cbff84 100644 --- 
a/modules/jupyter/main.tf +++ b/modules/jupyter/main.tf @@ -17,7 +17,7 @@ data "google_project" "project" { } locals { - cloudsql_instance_connection_name = format("%s:%s:%s", var.project_id, var.db_region, var.cloudsql_instance_name) + cloudsql_instance_connection_name = var.cloudsql_instance_name != "" ? format("%s:%s:%s", var.project_id, var.db_region, var.cloudsql_instance_name) : "" additional_labels = tomap({ for item in var.additional_labels : split("=", item)[0] => split("=", item)[1] diff --git a/modules/jupyter/variables.tf b/modules/jupyter/variables.tf index a0a200033..942cb1745 100644 --- a/modules/jupyter/variables.tf +++ b/modules/jupyter/variables.tf @@ -139,18 +139,15 @@ variable "autopilot_cluster" { variable "db_region" { type = string description = "Cloud SQL instance region" - default = "us-east4" } variable "db_secret_name" { type = string description = "CloudSQL user credentials" - default = "empty-secret" } variable "cloudsql_instance_name" { type = string description = "Cloud SQL instance name" - default = "pgvector-instance" } From 708fe719ce99b7c9cc594c427e46c6abee8f9f59 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 20:56:08 +0000 Subject: [PATCH 10/20] default empty values --- modules/jupyter/variables.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/jupyter/variables.tf b/modules/jupyter/variables.tf index 942cb1745..99e6adb27 100644 --- a/modules/jupyter/variables.tf +++ b/modules/jupyter/variables.tf @@ -139,15 +139,18 @@ variable "autopilot_cluster" { variable "db_region" { type = string description = "Cloud SQL instance region" + default = "" } variable "db_secret_name" { type = string description = "CloudSQL user credentials" + default = "" } variable "cloudsql_instance_name" { type = string description = "Cloud SQL instance name" + default = "" } From 06403bde7515e3fbbe0e4932c9c14732f43780f2 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 21:00:39 +0000 Subject: [PATCH 11/20] rename --- ...-sql-refactored.ipynb => rag-kaggle-ray-sql-interactive.ipynb} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename applications/rag/example_notebooks/{rag-kaggle-ray-sql-refactored.ipynb => rag-kaggle-ray-sql-interactive.ipynb} (100%) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb similarity index 100% rename from applications/rag/example_notebooks/rag-kaggle-ray-sql-refactored.ipynb rename to applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb From d1294345a99b35b2bc3ccffecabe103bd81ef301 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 21:45:43 +0000 Subject: [PATCH 12/20] parameterize notebook image --- modules/jupyter/main.tf | 8 ++++---- modules/jupyter/variables.tf | 12 ++++++++++++ 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/modules/jupyter/main.tf b/modules/jupyter/main.tf index 145cbff84..9adfb41aa 100644 --- a/modules/jupyter/main.tf +++ b/modules/jupyter/main.tf @@ -125,8 +125,8 @@ resource "helm_release" "jupyterhub" { secret_name = var.db_secret_name cloudsql_instance_connection_name = local.cloudsql_instance_connection_name - notebook_image = "jupyter/tensorflow-notebook" - notebook_image_tag = "python-3.10" + notebook_image = var.notebook_image + notebook_image_tag = var.notebook_image_tag }) ] : [templatefile("${path.module}/jupyter_config/config-selfauth.yaml", { password = var.add_auth ? 
"dummy" : random_password.generated_password[0].result @@ -143,8 +143,8 @@ resource "helm_release" "jupyterhub" { ephemeral_storage = var.ephemeral_storage secret_name = var.db_secret_name cloudsql_instance_connection_name = local.cloudsql_instance_connection_name - notebook_image = "jupyter/tensorflow-notebook" - notebook_image_tag = "python-3.10" + notebook_image = var.notebook_image + notebook_image_tag = var.notebook_image_tag }) ] depends_on = [module.jupyterhub-workload-identity] diff --git a/modules/jupyter/variables.tf b/modules/jupyter/variables.tf index 99e6adb27..010a1b475 100644 --- a/modules/jupyter/variables.tf +++ b/modules/jupyter/variables.tf @@ -17,6 +17,18 @@ variable "namespace" { description = "Kubernetes namespace where resources are deployed" } +variable "notebook_image" { + type = string + description = "Jupyter notebook image name" + default = "jupyter/tensorflow-notebook" +} + +variable "notebook_image_tag" { + type = string + description = "Jupyter notebook image tag" + default = "python-3.10" +} + variable "members_allowlist" { type = list(string) default = [] From e733e94e9019702b1167cf064727a490f967777e Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Fri, 29 Mar 2024 23:53:08 +0000 Subject: [PATCH 13/20] remove pip installs from notebook --- .../rag-kaggle-ray-sql-interactive.ipynb | 34 +++---------------- 1 file changed, 5 insertions(+), 29 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb index 82a587c88..750bd9749 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb @@ -2,12 +2,12 @@ "cells": [ { "cell_type": "markdown", - "id": "195d442b-08a9-4469-824c-80188c622c46", + "id": "304bba57-cdb4-42d5-a4b7-2494b6cfa4ff", "metadata": {}, "source": [ "# RAG-on-GKE Application\n", "\n", - "This is a Python notebook for the RAG on GKE application. For full information, please checkout the GitHub documentation [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md).\n", + "This is a Python notebook for generating the vector embeddings used by the RAG on GKE application. 
For full information, please check out the GitHub documentation [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md).\n", "\n", "\n", "## Setup Kaggle Credentials\n", "\n", "First we will setup your Kaggle credentials. Replace the following with your own settings from the Kaggle web page. Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one." ] }, - { - "cell_type": "markdown", - "id": "44dca61a-4544-4bbc-b78c-9e0805b96192", - "metadata": {}, - "source": [ - "## Installing Prerequisites\n", - "\n", - "To run this notebook, we will install the following tools and binaries:\n", - "* Ray\n", - "* Kaggle\n", - "* Langchain\n", - "* CloudSQL" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a814e91b-3afe-4c28-a3d6-fe087c7af552", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install ray[default]==2.9.3 kaggle==1.6.6\n", - "!pip install langchain==0.1.10 ray==2.9.3 datasets sentence-transformers\n", - "!pip install cloud-sql-python-connector[pg8000] SQLAlchemy==2.0.7 pgvector" - ] - }, { "cell_type": "markdown", "id": "a81ab34e-a0ad-4340-8d04-45e9ce4c7416", "metadata": {}, "source": [ "Now we will use the Kaggle CLI to download our data to the mounted GCS bucket:" @@ -105,7 +79,9 @@ "from datasets import load_dataset_builder, load_dataset, Dataset\n", "from huggingface_hub import snapshot_download\n", "from google.cloud.sql.connector import Connector, IPTypes\n", - "import sqlalchemy# initialize parameters\n", + "import sqlalchemy\n", + "\n", + "# initialize parameters\n", "\n", "INSTANCE_CONNECTION_NAME = os.environ[\"CLOUDSQL_INSTANCE_CONNECTION_NAME\"]\n", "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", From 7365699260aa5cc9947deca512f34385db4f8992 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Sat, 30 Mar 2024 00:48:10 +0000 Subject: [PATCH 14/20] use custom notebook image --- applications/rag/main.tf | 3 +++ 1 file changed, 3 insertions(+) diff --git a/applications/rag/main.tf b/applications/rag/main.tf index f32b6f77a..905c46be5 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -192,6 +192,9 @@ module "jupyterhub" { autopilot_cluster = local.enable_autopilot workload_identity_service_account = local.jupyter_service_account + notebook_image = "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image" + notebook_image_tag = "latest" + db_secret_name = module.cloudsql.db_secret_name cloudsql_instance_name = local.cloudsql_instance db_region = local.cloudsql_instance_region From 8961bb3594e15409f16ad0209cfc6d3304b0672f Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Sat, 30 Mar 2024 00:50:21 +0000 Subject: [PATCH 15/20] terraform fmt --- applications/rag/main.tf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/applications/rag/main.tf b/applications/rag/main.tf index 905c46be5..f56335156 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -192,8 +192,8 @@ module "jupyterhub" { autopilot_cluster = local.enable_autopilot workload_identity_service_account = local.jupyter_service_account - notebook_image = "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image" - notebook_image_tag = "latest" + notebook_image = "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image" + notebook_image_tag = "latest" db_secret_name = module.cloudsql.db_secret_name cloudsql_instance_name = local.cloudsql_instance From 6fda174fa061bdd5c34f507d66943e5badf99a4f Mon Sep 17 00:00:00 2001 From: Richard Liu <39319471+richardsliu@users.noreply.github.com> Date: Mon, 1 Apr 2024 09:57:15 -0700 Subject: [PATCH 16/20] replace jupyter notebook tag --- applications/rag/main.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/applications/rag/main.tf b/applications/rag/main.tf index f56335156..ccde3e6c7 100644 --- a/applications/rag/main.tf +++ b/applications/rag/main.tf @@ -193,7 +193,7 @@ module "jupyterhub" { workload_identity_service_account = local.jupyter_service_account notebook_image = "us-central1-docker.pkg.dev/ai-on-gke/rag-on-gke/jupyter-notebook-image" - notebook_image_tag = "latest" + notebook_image_tag = "v1.1-rag" db_secret_name = module.cloudsql.db_secret_name cloudsql_instance_name = local.cloudsql_instance From 4cd1a77a2f79d2aa68533fe109730b76a65a2d88 Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 1 Apr 2024 19:07:48 +0000 Subject: [PATCH 17/20] add notebook version to jupyterhub app --- applications/jupyter/main.tf | 2 ++ 1 file changed, 2 insertions(+) diff --git a/applications/jupyter/main.tf b/applications/jupyter/main.tf index 3a9dd89a1..fc7b3d791 100644 --- a/applications/jupyter/main.tf +++ b/applications/jupyter/main.tf @@ -149,6 +149,8 @@ module "jupyterhub" { workload_identity_service_account = local.workload_identity_service_account gcs_bucket = var.gcs_bucket autopilot_cluster = local.enable_autopilot + notebook_image = "jupyter/tensorflow-notebook" + notebook_image_tag = "python-3.10" # IAP Auth parameters add_auth = var.add_auth From 7ed030b20e7829cd439c19216b350b727c6947fe Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 1 Apr 2024 19:14:07 +0000 Subject: [PATCH 18/20] merge cells --- .../rag-kaggle-ray-sql-interactive.ipynb | 29 ++++--------------- 1 file changed, 5 insertions(+), 24 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb index 750bd9749..2b80e437e 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-interactive.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "304bba57-cdb4-42d5-a4b7-2494b6cfa4ff", + "id": "5574f366-58e9-408b-aea4-1bf5b3351e4c", "metadata": {}, "source": [ "# RAG-on-GKE Application\n", @@ -12,38 +12,19 @@ "\n", "## Setup Kaggle Credentials\n", "\n", - "First we will setup your Kaggle credentials. Replace the following with your own settings from the Kaggle web page. Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one." + "First we will set up your Kaggle credentials and use the Kaggle CLI to download the Netflix shows dataset to the GCS bucket. Replace the following with your own settings from the Kaggle web page. Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to set up the env variable. See https://www.kaggle.com/docs/api#authentication for how to create one." 
] }, { "cell_type": "code", "execution_count": null, - "id": "00b1aff4", - "metadata": {}, - "outputs": [], - "source": [ - "KAGGLE_USERNAME = \"\"\n", - "KAGGLE_KEY = \"\"" - ] - }, - { - "cell_type": "markdown", - "id": "a81ab34e-a0ad-4340-8d04-45e9ce4c7416", - "metadata": {}, - "source": [ - "Now we will use the Kaggle CLI to download our data to the mounted GCS bucket:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1e26faef-9e2e-4793-b8af-0e18470b482d", + "id": "ffee2bec-804f-4e22-9ba0-8b1db5a5d7ec", "metadata": {}, "outputs": [], "source": [ "import os\n", - "os.environ['KAGGLE_USERNAME'] = KAGGLE_USERNAME\n", - "os.environ['KAGGLE_KEY'] = KAGGLE_KEY\n", + "os.environ['KAGGLE_USERNAME'] = \"\"\n", + "os.environ['KAGGLE_KEY'] = \"\"\n", "\n", "# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n", "!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n", From 929bbd914db85bf958e3ea53dbae02777a089dee Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 1 Apr 2024 21:04:21 +0000 Subject: [PATCH 19/20] add dummy value for secret volume --- modules/jupyter/variables.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/jupyter/variables.tf b/modules/jupyter/variables.tf index 010a1b475..cd2ebaca5 100644 --- a/modules/jupyter/variables.tf +++ b/modules/jupyter/variables.tf @@ -157,7 +157,7 @@ variable "db_region" { variable "db_secret_name" { type = string description = "CloudSQL user credentials" - default = "" + default = "dummy_value" } variable "cloudsql_instance_name" { From 3bd07cda68c07d5a9ddc038e13ba0efc8e3cebfe Mon Sep 17 00:00:00 2001 From: Richard Liu Date: Mon, 1 Apr 2024 22:00:27 +0000 Subject: [PATCH 20/20] fix old notebook --- .../rag-kaggle-ray-sql-latest.ipynb | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb index 3134e3287..db5c5f430 100644 --- a/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb +++ b/applications/rag/example_notebooks/rag-kaggle-ray-sql-latest.ipynb @@ -1,5 +1,15 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "afb6fda4-ffde-4831-88a5-ae41144492b2", + "metadata": {}, + "source": [ + "# RAG-on-GKE Application\n", + "\n", + "This is a Python notebook for generating the vector embeddings used by the RAG on GKE application. For full information, please check out the GitHub documentation [here](https://github.com/GoogleCloudPlatform/ai-on-gke/blob/main/applications/rag/README.md).\n" + ] + }, { "cell_type": "code", "execution_count": null, @@ -10,7 +20,7 @@ "# Replace these with your settings\n", "# Navigate to https://www.kaggle.com/settings/account and generate an API token to be used to setup the env variable. See https://www.kaggle.com/docs/api#authentication how to create one.\n", "KAGGLE_USERNAME = \"\"\n", - "KAGGLE_KEY = \"\"\n" + "KAGGLE_KEY = \"\"" ] }, { @@ -36,8 +46,8 @@ "\n", "# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. 
The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n", "!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n", - "!mkdir /persist-data/netflix-shows -p\n", - "!unzip -o ~/data/netflix-shows.zip -d /persist-data/netflix-shows" + "!mkdir /data/netflix-shows -p\n", + "!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows" ] }, {
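For reference, here is a minimal sketch of how a notebook spawned from this chart could consume the two values these patches wire in: the CLOUDSQL_INSTANCE_CONNECTION_NAME environment variable set via extraEnv, and the database credentials secret mounted read-only at /etc/secret-volume. The secret's "username"/"password" key names and the "pgvector-database" database name are illustrative assumptions that the patches themselves do not pin down; the connector and SQLAlchemy packages are assumed to be baked into the notebook image, since the pip-install cells were removed.

import os
from pathlib import Path

import sqlalchemy
from google.cloud.sql.connector import Connector

# Injected via extraEnv in config-selfauth*.yaml; format is "project:region:instance".
INSTANCE_CONNECTION_NAME = os.environ["CLOUDSQL_INSTANCE_CONNECTION_NAME"]

# Each key of the Kubernetes secret mounted at /etc/secret-volume surfaces as a
# file; the "username" and "password" key names are assumed for illustration.
SECRET_DIR = Path("/etc/secret-volume")
DB_USER = (SECRET_DIR / "username").read_text().strip()
DB_PASS = (SECRET_DIR / "password").read_text().strip()

connector = Connector()

def getconn():
    # Open a pg8000 connection through the Cloud SQL Python Connector.
    return connector.connect(
        INSTANCE_CONNECTION_NAME,
        "pg8000",
        user=DB_USER,
        password=DB_PASS,
        db="pgvector-database",  # assumed database name
    )

# Hand the connector to SQLAlchemy through the creator hook, then smoke-test it.
engine = sqlalchemy.create_engine("postgresql+pg8000://", creator=getconn)
with engine.connect() as conn:
    print(conn.execute(sqlalchemy.text("SELECT 1")).scalar())

Routing connections through the creator hook lets SQLAlchemy pool sockets opened by the Cloud SQL Python Connector rather than dialing a plain DSN, which is why no host or port appears in the engine URL.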