diff --git a/applications/rag/example_notebooks/rag-data-ingest-with-kubernetes-docs.ipynb b/applications/rag/example_notebooks/rag-data-ingest-with-kubernetes-docs.ipynb index 7dd40f32d..9cbd84497 100644 --- a/applications/rag/example_notebooks/rag-data-ingest-with-kubernetes-docs.ipynb +++ b/applications/rag/example_notebooks/rag-data-ingest-with-kubernetes-docs.ipynb @@ -24,23 +24,23 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "k8d6_U2sbaJ_", "executionInfo": { + "elapsed": 569, "status": "ok", "timestamp": 1721926267799, - "user_tz": 300, - "elapsed": 569, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": 300 }, + "id": "k8d6_U2sbaJ_", "outputId": "e15c65de-1382-4923-a3ee-15b3f3f21f86" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "fatal: destination path '/data/kubernetes-docs' already exists and is not an empty directory.\n" ] @@ -53,43 +53,38 @@ }, { "cell_type": "markdown", - "source": [ - "- Install the required packages" - ], "metadata": { "id": "iRtu4buBamab" - } + }, + "source": [ + "- Install the required packages" + ] }, { "cell_type": "code", - "source": [ - "!pip install pgvector\n", - "!pip install langchain langchain-community sentence_transformers unstructured[pdf]\n", - "!pip install google cloud-sql-python-connector[pg8000] langchain-google-cloud-sql-pg" - ], + "execution_count": 2, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "collapsed": true, - "id": "xRh2Gn1rcBJY", "executionInfo": { + "elapsed": 35573, "status": "ok", "timestamp": 1721926317024, - "user_tz": 300, - "elapsed": 35573, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": 300 }, + "id": "xRh2Gn1rcBJY", "outputId": "f0deb85d-1d5c-41d0-b6ff-e3ed86bd3042" }, - "execution_count": 2, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Requirement already satisfied: pgvector in /usr/local/lib/python3.10/dist-packages (0.3.2)\n", "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from pgvector) (1.25.2)\n", @@ -294,32 +289,37 @@ "Requirement already satisfied: mypy-extensions>=0.3.0 in /usr/local/lib/python3.10/dist-packages (from typing-inspect<1,>=0.4.0->dataclasses-json<0.7,>=0.5.7->langchain-community<0.3.0,>=0.0.18->langchain-google-cloud-sql-pg) (1.0.0)\n" ] } + ], + "source": [ + "!pip install pgvector\n", + "!pip install langchain langchain-community sentence_transformers unstructured[pdf]\n", + "!pip install google cloud-sql-python-connector[pg8000] langchain-google-cloud-sql-pg" ] }, { "cell_type": "markdown", - "source": [ - " - Import required functions and libraries" - ], "metadata": { "id": "yZybYPPvaqcS" - } + }, + "source": [ + " - Import required functions and libraries" + ] }, { "cell_type": "code", "execution_count": 3, "metadata": { - "id": "FWqsMMdQbaKA", "executionInfo": { + "elapsed": 1322, "status": "ok", "timestamp": 1721926369825, - "user_tz": 300, - "elapsed": 1322, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": 300 + }, + "id": "FWqsMMdQbaKA" }, "outputs": [], "source": [ @@ -345,90 +345,43 @@ "Let's now set up a connection to your CloudSQL database:" ] }, - { - "cell_type": "code", - "source": [ - "%env ENVIRONMENT=development\n", - "%env PROJECT_ID=globant-gke-ai-resources\n", - "%env CLOUDSQL_INSTANCE_REGION=us-west1\n", - "%env CLOUDSQL_INSTANCE=rag-application-test\n", - "%env EMBEDDINGS_TABLE_NAME=kubernetes_docs\n", - "%env DB_USERNAME=main-user\n", - "%env DB_PASS=gSo{I@YMyd8]&\\34" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "DegY7bswdlSB", - "executionInfo": { - "status": "ok", - "timestamp": 1721926389134, - "user_tz": 300, - "elapsed": 338, - "user": { - "displayName": "", - "userId": "" - } - }, - "outputId": "ca5aa526-bace-469d-8808-162d9e934be2" - }, - "execution_count": 4, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "env: ENVIRONMENT=development\n", - "env: PROJECT_ID=globant-gke-ai-resources\n", - "env: CLOUDSQL_INSTANCE_REGION=us-west1\n", - "env: CLOUDSQL_INSTANCE=rag-application-test\n", - "env: EMBEDDINGS_TABLE_NAME=kubernetes_docs\n", - "env: DB_USERNAME=main-user\n", - "env: DB_PASS=gSo{I@YMyd8]&\\34\n" - ] - } - ] - }, { "cell_type": "code", "execution_count": 5, "metadata": { - "id": "rvK19kzwbaKB", "executionInfo": { + "elapsed": 457, "status": "ok", "timestamp": 1721926402495, - "user_tz": 300, - "elapsed": 457, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": 300 + }, + "id": "rvK19kzwbaKB" }, "outputs": [], "source": [ - "ENVIRONMENT = os.environ.get(\"ENVIRONMENT\")\n", - "\n", - "GCP_PROJECT_ID = os.environ.get(\"PROJECT_ID\")\n", - "GCP_CLOUD_SQL_REGION = os.environ.get(\"CLOUDSQL_INSTANCE_REGION\")\n", - "GCP_CLOUD_SQL_INSTANCE = os.environ.get(\"CLOUDSQL_INSTANCE\")\n", + "# initialize parameters\n", + "INSTANCE_CONNECTION_NAME = os.environ.get(\"CLOUDSQL_INSTANCE_CONNECTION_NAME\", \"\")\n", + "print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n", + "cloud_variables = INSTANCE_CONNECTION_NAME.split(\":\")\n", "\n", - "DB_NAME = os.environ.get(\"DB_NAME\", \"pgvector-database\")\n", - "VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get(\"EMBEDDINGS_TABLE_NAME\", \"\")\n", + "GCP_PROJECT_ID = os.environ.get(\"GCP_PROJECT_ID\", cloud_variables[0])\n", + "GCP_CLOUD_SQL_REGION = os.environ.get(\"CLOUDSQL_INSTANCE_REGION\", cloud_variables[1])\n", + "GCP_CLOUD_SQL_INSTANCE = os.environ.get(\"CLOUDSQL_INSTANCE\", cloud_variables[2])\n", "\n", - "try:\n", - " db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", - " DB_USER = db_username_file.read()\n", - " db_username_file.close()\n", + "DB_NAME = os.environ.get(\"INSTANCE_CONNECTION_NAME\", \"pgvector-database\")\n", + "VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get(\"EMBEDDINGS_TABLE_NAME\", \"rag_vector_embeddings\")\n", "\n", - " db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", - " DB_PASS = db_password_file.read()\n", - " db_password_file.close()\n", - "except:\n", - " DB_USER = os.environ.get(\"DB_USERNAME\", \"postgres\")\n", - " DB_PASS = os.environ.get(\"DB_PASS\", \"postgres\")\n", + "db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n", + "DB_USER = db_username_file.read()\n", + "db_username_file.close()\n", "\n", + "db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n", + "DB_PASS = db_password_file.read()\n", + "db_password_file.close()\n", "\n", "# Create Cloud SQL Postgres Engine\n", "pg_engine = PostgresEngine.from_instance(\n", @@ -454,17 +407,17 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "id": "_MydulMdbaKC", "executionInfo": { + "elapsed": 3, "status": "ok", "timestamp": 1721926424908, - "user_tz": 300, - "elapsed": 3, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": 300 + }, + "id": "_MydulMdbaKC" }, "outputs": [], "source": [ @@ -494,17 +447,17 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "id": "0EzU4YhrbaKC", "executionInfo": { + "elapsed": 2350, "status": "ok", "timestamp": 1721926432388, - "user_tz": 300, - "elapsed": 2350, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": 300 + }, + "id": "0EzU4YhrbaKC" }, "outputs": [], "source": [ @@ -517,37 +470,37 @@ }, { "cell_type": "markdown", - "source": [ - "# Initialize Vector Store" - ], "metadata": { "id": "aIAiofJTj4Fh" - } + }, + "source": [ + "# Initialize Vector Store" + ] }, { "cell_type": "code", "execution_count": 8, "metadata": { - "id": "oCbOnnBIbaKD", + "colab": { + "base_uri": "https://localhost:8080/" + }, "executionInfo": { + "elapsed": 14376, "status": "ok", "timestamp": 1721926463546, - "user_tz": 300, - "elapsed": 14376, "user": { "displayName": "", "userId": "" - } - }, - "colab": { - "base_uri": "https://localhost:8080/" + }, + "user_tz": 300 }, + "id": "oCbOnnBIbaKD", "outputId": "ed2702cf-ce04-4711-eaa7-5e644b5290ec" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "/usr/local/lib/python3.10/dist-packages/langchain_core/_api/deprecation.py:139: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 0.3.0. An updated version of the class exists in the langchain-huggingface package and should be used instead. To use it run `pip install -U langchain-huggingface` and import as `from langchain_huggingface import HuggingFaceEmbeddings`.\n", " warn_deprecated(\n", @@ -591,23 +544,23 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "OzXCfwNAbaKD", "executionInfo": { + "elapsed": 696702, "status": "ok", "timestamp": 1721927166341, - "user_tz": 300, - "elapsed": 696702, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": 300 }, + "id": "OzXCfwNAbaKD", "outputId": "d573387c-66df-423f-a000-750334de97a0" }, "outputs": [ { - "output_type": "stream", "name": "stderr", + "output_type": "stream", "text": [ "100%|██████████| 6/6 [11:36<00:00, 116.07s/it]\n" ] @@ -620,66 +573,61 @@ }, { "cell_type": "code", - "source": [ - "splitter = RecursiveCharacterTextSplitter(\n", - " chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len\n", - ")\n", - "\n", - "splits = splitter.split_documents(documents)" - ], + "execution_count": 10, "metadata": { - "id": "O7eBZG7wiWBa", "executionInfo": { + "elapsed": 629, "status": "ok", "timestamp": 1721927196274, - "user_tz": 300, - "elapsed": 629, "user": { "displayName": "", "userId": "" - } - } + }, + "user_tz": 300 + }, + "id": "O7eBZG7wiWBa" }, - "execution_count": 10, - "outputs": [] + "outputs": [], + "source": [ + "splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len\n", + ")\n", + "\n", + "splits = splitter.split_documents(documents)" + ] }, { "cell_type": "markdown", - "source": [ - "### Add the splits on the vector store" - ], "metadata": { "id": "UwCj9x5Jl5iq" - } + }, + "source": [ + "### Add the splits on the vector store" + ] }, { "cell_type": "code", - "source": [ - "ids = [str(uuid.uuid4()) for i in range(len(splits))]\n", - "vector_store.add_documents(splits, ids)" - ], + "execution_count": 11, "metadata": { - "collapsed": true, - "id": "Wqd3cKgntEYw", "colab": { "base_uri": "https://localhost:8080/" }, + "collapsed": true, "executionInfo": { + "elapsed": 2429339, "status": "ok", "timestamp": 1721929629134, - "user_tz": 300, - "elapsed": 2429339, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": 300 }, + "id": "Wqd3cKgntEYw", "outputId": "f9329c53-49fa-488c-caf1-ae74e1db128c" }, - "execution_count": 11, "outputs": [ { - "output_type": "execute_result", "data": { "text/plain": [ "['3db34d89-aca6-4152-a2f5-09e26d932652',\n", @@ -1685,54 +1633,49 @@ " ...]" ] }, + "execution_count": 11, "metadata": {}, - "execution_count": 11 + "output_type": "execute_result" } + ], + "source": [ + "ids = [str(uuid.uuid4()) for i in range(len(splits))]\n", + "vector_store.add_documents(splits, ids)" ] }, { "cell_type": "markdown", - "source": [ - "## Trying the Vector Storage" - ], "metadata": { "id": "7Vnj-Z0pIYDM" - } + }, + "source": [ + "## Trying the Vector Storage" + ] }, { "cell_type": "code", - "source": [ - "query = \"Hello, what's kubernetes\"\n", - "query_vector = embeddings_service.embed_query(query)\n", - "docs = vector_store.similarity_search_by_vector(query_vector, k=4)\n", - "\n", - "for i, document in enumerate(docs):\n", - " print(f\"Result #{i+1}\")\n", - " print(document.page_content)\n", - " print(\"-\" * 100)" - ], + "execution_count": 15, "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, - "id": "tdITIoEAIcfr", "executionInfo": { + "elapsed": 314, "status": "ok", "timestamp": 1721932055248, - "user_tz": 300, - "elapsed": 314, "user": { "displayName": "", "userId": "" - } + }, + "user_tz": 300 }, + "id": "tdITIoEAIcfr", "outputId": "102198aa-ca71-4a4c-a5e7-58afd1a2884f" }, - "execution_count": 15, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", "text": [ "Result #1\n", "Overview\n", @@ -1786,10 +1729,24 @@ "----------------------------------------------------------------------------------------------------\n" ] } + ], + "source": [ + "query = \"Hello, what's kubernetes\"\n", + "query_vector = embeddings_service.embed_query(query)\n", + "docs = vector_store.similarity_search_by_vector(query_vector, k=4)\n", + "\n", + "for i, document in enumerate(docs):\n", + " print(f\"Result #{i+1}\")\n", + " print(document.page_content)\n", + " print(\"-\" * 100)" ] } ], "metadata": { + "colab": { + "name": "rag-data-ingest-with-kubernetes-docs.ipynb", + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -1798,12 +1755,8 @@ "language_info": { "name": "python", "version": "3.9.13" - }, - "colab": { - "provenance": [], - "name": "rag-data-ingest-with-kubernetes-docs.ipynb" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +}