From 1a621565cb7f773f90f10063f7324ecbbbb87651 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:03:37 +0530
Subject: [PATCH 1/9] add: langchain vector search pdf template

---
 .../Langchain_Vector_Search_on_PDF.ipynb      | 344 ++++++++++++++++++
 1 file changed, 344 insertions(+)
 create mode 100644 LangChain/Langchain_Vector_Search_on_PDF.ipynb
diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
new file mode 100644
index 0000000000..ebe4599064
--- /dev/null
+++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
@@ -0,0 +1,344 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "latin-packing",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2021-02-23T14:22:16.610471Z",
+     "iopub.status.busy": "2021-02-23T14:22:16.610129Z",
+     "iopub.status.idle": "2021-02-23T14:22:16.627784Z",
+     "shell.execute_reply": "2021-02-23T14:22:16.626866Z",
+     "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
+    },
+    "papermill": {},
+    "tags": []
+   },
+   "source": [
+    "<img width=\"10%\" alt=\"Naas\" src=\"https://landen.imgix.net/jtci2pxwjczr/assets/5ice39g4.png?w=160\"/>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "compressed-wilson",
+   "metadata": {
+    "papermill": {},
+    "tags": []
+   },
+   "source": [
+    "# Tool - Action of the notebook\n",
+    "<a href=\"https://app.naas.ai/user-redirect/naas/downloader?url=https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/template.ipynb\" target=\"_parent\">\n",
+    "<img src=\"https://naasai-public.s3.eu-west-3.amazonaws.com/open_in_naas.svg\"/>\n",
+    "</a><br><br><a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=template-request.md&title=Tool+-+Action+of+the+notebook+\">Template request</a> | <a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=bug_report.md&title=[ERROR]+Tool+/+Folder+Action+of+the+notebook+\">Bug report</a>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "religious-programmer",
+   "metadata": {},
+   "source": [
+    "**Tags:** #langchain #pdf #weaviate #huggingface"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
+   "metadata": {},
+   "source": [
+    "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+   "metadata": {},
+   "source": [
+    "**Description:** This notebook performs a vector search on your PDF to answer basic questions, returning the content most closely related to the prompt provided.\n",
+    "\n",
+    "It uses:\n",
+    "- PyPDF2 - Get text from PDF\n",
+    "- LangChain - Text splitter, document creation\n",
+    "- HuggingFace - Embeddings\n",
+    "- Weaviate - Vector Database"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
+   "metadata": {},
+   "source": [
+    "### References\n",
+    "\n",
+    "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
+    "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n",
+    "- [Huggingface docs](https://huggingface.co/docs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "distinguished-truth",
+   "metadata": {
+    "papermill": {},
+    "tags": []
+   },
+   "source": [
+    "## Input"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "numeric-mediterranean",
+   "metadata": {},
+   "source": [
+    "### Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "potential-surfing",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "try:\n",
+    "    import langchain\n",
+    "    import PyPDF2\n",
+    "except ModuleNotFoundError:\n",
+    "    !pip install langchain PyPDF2\n",
+    "\n",
+    "!pip install sentence_transformers --user\n",
+    "import naas\n",
+    "import PyPDF2\n",
+    "from langchain.text_splitter import CharacterTextSplitter\n",
+    "from langchain.embeddings import HuggingFaceEmbeddings\n",
+    "from langchain.vectorstores import Weaviate"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "aggressive-trustee",
+   "metadata": {},
+   "source": [
+    "### Setup Variables\n",
+    "\n",
+    "- `pdf_file`: Path to which the PDF file exists.\n",
+    "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
+    "- `query`: The question that you need to ask the pdf\n",
+    "- `response`: The reply for the query from search "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "continuous-melbourne",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "#inputs\n",
+    "pdf_file = \"./SWE NCG JD.pdf\"\n",
+    "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+    "query = \"How much is the base pay?\"\n",
+    "\n",
+    "#outputs\n",
+    "response = \"\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "registered-showcase",
+   "metadata": {},
+   "source": [
+    "## Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "tested-astrology",
+   "metadata": {},
+   "source": [
+    "### Extract text from PDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "crude-louisville",
+   "metadata": {
+    "papermill": {},
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "def extract_text_from_pdf(pdf_path):\n",
+    "    with open(pdf_path, \"rb\") as file:\n",
+    "        pdf = PyPDF2.PdfReader(file)\n",
+    "        text = []\n",
+    "        for page in pdf.pages:\n",
+    "            text.append(page.extract_text())\n",
+    "        return \" \".join(text)\n",
+    "\n",
+    "text = extract_text_from_pdf(pdf_file)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
+   "metadata": {},
+   "source": [
+    "### Split the text scraped from the PDF into chunks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
+   "metadata": {
+    "papermill": {},
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "text_splitter = CharacterTextSplitter(\n",
+    "    separator = \"\\n\",\n",
+    "    chunk_size = 1000,\n",
+    "    chunk_overlap  = 200,\n",
+    "    length_function = len,\n",
+    ")\n",
+    "\n",
+    "texts = text_splitter.create_documents([text])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef1720bf-a28a-4757-b189-7df97947c158",
+   "metadata": {},
+   "source": [
+    "### Create embeddings of the text to make it compatible with storage in the database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "embeddings = HuggingFaceEmbeddings()\n",
+    "\n",
+    "for i in range(len(texts)):\n",
+    "        query_result = embeddings.embed_query(texts[i].page_content)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
+   "metadata": {},
+   "source": [
+    "### Store the embeddings into the weaviate database"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
+   "metadata": {},
+   "source": [
+    "### Get the closest response to the user query on the PDF"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = db.similarity_search(query)\n",
+    "response = docs[0].page_content"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "lonely-pacific",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2021-07-02T23:32:10.789097Z",
+     "iopub.status.busy": "2021-07-02T23:32:10.788829Z",
+     "iopub.status.idle": "2021-07-02T23:32:10.796900Z",
+     "shell.execute_reply": "2021-07-02T23:32:10.796358Z",
+     "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
+    }
+   },
+   "source": [
+    "## Output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
+   "metadata": {},
+   "source": [
+    "### Show the response"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.6"
+  },
+  "papermill": {
+   "default_parameters": {},
+   "environment_variables": {},
+   "parameters": {},
+   "version": "2.3.3"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 932275ea2197072dec69cfe944215b57b32441ae Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:07:20 +0530
Subject: [PATCH 2/9] fix: .env exposure issue

---
 LangChain/Langchain_Vector_Search_on_PDF.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
index ebe4599064..6f023f30e9 100644
--- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
@@ -250,7 +250,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)"
+    "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
    ]
   },
   {

From dea4487b08d641e6b16fd7fb8dad29941385750a Mon Sep 17 00:00:00 2001
From: Florent Ravenel <florent@naas.ai>
Date: Wed, 19 Jul 2023 10:29:51 +0200
Subject: [PATCH 3/9] feat: rename notebook, update title, references typo

---
 ....ipynb => LangChain_Vector_Search_on_PDF.ipynb} | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
 rename LangChain/{Langchain_Vector_Search_on_PDF.ipynb => LangChain_Vector_Search_on_PDF.ipynb} (97%)

diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
similarity index 97%
rename from LangChain/Langchain_Vector_Search_on_PDF.ipynb
rename to LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 6f023f30e9..9bbb385792 100644
--- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -26,7 +26,7 @@
     "tags": []
    },
    "source": [
-    "# Tool - Action of the notebook\n",
+    "# LangChain - Vector Search on PDF\n",
     "<a href=\"https://app.naas.ai/user-redirect/naas/downloader?url=https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/template.ipynb\" target=\"_parent\">\n",
     "<img src=\"https://naasai-public.s3.eu-west-3.amazonaws.com/open_in_naas.svg\"/>\n",
     "</a><br><br><a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=template-request.md&title=Tool+-+Action+of+the+notebook+\">Template request</a> | <a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=bug_report.md&title=[ERROR]+Tool+/+Folder+Action+of+the+notebook+\">Bug report</a>"
@@ -67,8 +67,7 @@
    "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
    "metadata": {},
    "source": [
-    "### References\n",
-    "\n",
+    "**References:**\n",
     "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
     "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n",
     "- [Huggingface docs](https://huggingface.co/docs)"
@@ -98,6 +97,10 @@
    "execution_count": null,
    "id": "potential-surfing",
    "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-19T08:29:06.271389Z",
+     "iopub.status.busy": "2023-07-19T08:29:06.271116Z"
+    },
     "tags": []
    },
    "outputs": [],
@@ -107,7 +110,6 @@
     "    import PyPDF2\n",
     "except ModuleNotFoundError:\n",
     "    !pip install langchain PyPDF2\n",
-    "\n",
     "!pip install sentence_transformers --user\n",
     "import naas\n",
     "import PyPDF2\n",
@@ -138,12 +140,12 @@
    },
    "outputs": [],
    "source": [
-    "#inputs\n",
+    "# Inputs\n",
     "pdf_file = \"./SWE NCG JD.pdf\"\n",
     "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
     "query = \"How much is the base pay?\"\n",
     "\n",
-    "#outputs\n",
+    "# Outputs\n",
     "response = \"\""
    ]
   },

From 5399ff33aded3d9d23e4db4ceae84280b976c51f Mon Sep 17 00:00:00 2001
From: Florent Ravenel <florent@naas.ai>
Date: Wed, 19 Jul 2023 10:49:55 +0200
Subject: [PATCH 4/9] feat: test

---
 .../LangChain_Vector_Search_on_PDF.ipynb      | 121 ++++++++++++++++--
 1 file changed, 110 insertions(+), 11 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 9bbb385792..93054e98da 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -94,16 +94,52 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
    "id": "potential-surfing",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2023-07-19T08:29:06.271389Z",
-     "iopub.status.busy": "2023-07-19T08:29:06.271116Z"
+     "iopub.execute_input": "2023-07-19T08:41:33.179611Z",
+     "iopub.status.busy": "2023-07-19T08:41:33.179183Z",
+     "iopub.status.idle": "2023-07-19T08:42:16.496262Z",
+     "shell.execute_reply": "2023-07-19T08:42:16.491276Z",
+     "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z"
     },
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Collecting sentence_transformers\n",
+      "  Using cached sentence_transformers-2.2.2-py3-none-any.whl\n",
+      "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n",
+      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n",
+      "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n",
+      "Collecting torchvision (from sentence_transformers)\n",
+      "  Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n",
+      "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n",
+      "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n",
+      "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n",
+      "Requirement already satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n",
+      "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n",
+      "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n",
+      "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n",
+      "Requirement already satisfied: fsspec in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n",
+      "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n",
+      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n",
+      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n",
+      "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n",
+      "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n",
+      "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n",
+      "Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n",
+      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n",
+      "Collecting torch>=1.6.0 (from sentence_transformers)\n"
+     ]
+    }
+   ],
    "source": [
     "try:\n",
     "    import langchain\n",
@@ -133,16 +169,23 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "continuous-melbourne",
    "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-19T08:42:16.501431Z",
+     "iopub.status.busy": "2023-07-19T08:42:16.500987Z",
+     "iopub.status.idle": "2023-07-19T08:42:16.675988Z",
+     "shell.execute_reply": "2023-07-19T08:42:16.667851Z",
+     "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z"
+    },
     "tags": []
    },
    "outputs": [],
    "source": [
     "# Inputs\n",
     "pdf_file = \"./SWE NCG JD.pdf\"\n",
-    "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+    "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
     "query = \"How much is the base pay?\"\n",
     "\n",
     "# Outputs\n",
@@ -167,13 +210,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "crude-louisville",
    "metadata": {
+    "execution": {
+     "iopub.execute_input": "2023-07-19T08:42:16.681433Z",
+     "iopub.status.busy": "2023-07-19T08:42:16.679730Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.072060Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.070305Z",
+     "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z"
+    },
     "papermill": {},
     "tags": []
    },
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-4522d8749e9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m<ipython-input-3-4522d8749e9f>\u001b[0m in \u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m         \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'"
+     ]
+    }
+   ],
    "source": [
     "def extract_text_from_pdf(pdf_path):\n",
     "    with open(pdf_path, \"rb\") as file:\n",
@@ -199,6 +262,11 @@
    "execution_count": null,
    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
    "metadata": {
+    "execution": {
+     "iopub.status.busy": "2023-07-19T08:42:20.072988Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.073335Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.073164Z"
+    },
     "papermill": {},
     "tags": []
    },
@@ -227,6 +295,11 @@
    "execution_count": null,
    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
    "metadata": {
+    "execution": {
+     "iopub.status.busy": "2023-07-19T08:42:20.074098Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.074423Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.074257Z"
+    },
     "tags": []
    },
    "outputs": [],
@@ -249,7 +322,13 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.status.busy": "2023-07-19T08:42:20.075300Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.075641Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.075473Z"
+    }
+   },
    "outputs": [],
    "source": [
     "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
@@ -267,7 +346,13 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.status.busy": "2023-07-19T08:42:20.076441Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.076769Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.076600Z"
+    }
+   },
    "outputs": [],
    "source": [
     "docs = db.similarity_search(query)\n",
@@ -302,11 +387,25 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
-   "metadata": {},
+   "metadata": {
+    "execution": {
+     "iopub.status.busy": "2023-07-19T08:42:20.077561Z",
+     "iopub.status.idle": "2023-07-19T08:42:20.077926Z",
+     "shell.execute_reply": "2023-07-19T08:42:20.077754Z"
+    }
+   },
    "outputs": [],
    "source": [
     "response"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {

From e737ecb5ab9ccf2b7a2ed3f05d9066fd7e6ecc3a Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Fri, 21 Jul 2023 21:13:38 +0530
Subject: [PATCH 5/9] fix: pdf file_path issue

---
 .../LangChain_Vector_Search_on_PDF.ipynb      | 157 ++++--------------
 1 file changed, 34 insertions(+), 123 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 93054e98da..17b3c7fc6f 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -37,7 +37,7 @@
    "id": "religious-programmer",
    "metadata": {},
    "source": [
-    "**Tags:** #langchain #pdf #weaviate #huggingface"
+    "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
    ]
   },
   {
@@ -94,99 +94,46 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "potential-surfing",
    "metadata": {
-    "execution": {
-     "iopub.execute_input": "2023-07-19T08:41:33.179611Z",
-     "iopub.status.busy": "2023-07-19T08:41:33.179183Z",
-     "iopub.status.idle": "2023-07-19T08:42:16.496262Z",
-     "shell.execute_reply": "2023-07-19T08:42:16.491276Z",
-     "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z"
-    },
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Collecting sentence_transformers\n",
-      "  Using cached sentence_transformers-2.2.2-py3-none-any.whl\n",
-      "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n",
-      "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n",
-      "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n",
-      "Collecting torchvision (from sentence_transformers)\n",
-      "  Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n",
-      "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n",
-      "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n",
-      "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n",
-      "Requirement already satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n",
-      "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n",
-      "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n",
-      "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n",
-      "Requirement already satisfied: fsspec in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n",
-      "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n",
-      "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n",
-      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n",
-      "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n",
-      "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n",
-      "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n",
-      "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n",
-      "Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n",
-      "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n",
-      "Collecting torch>=1.6.0 (from sentence_transformers)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "try:\n",
     "    import langchain\n",
     "    import PyPDF2\n",
+    "    import weaviate\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install langchain PyPDF2\n",
-    "!pip install sentence_transformers --user\n",
+    "    !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+    "    \n",
+    "# Note: This installation may take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+    "# !pip install sentence_transformers --user\n",
+    "\n",
     "import naas\n",
+    "import io\n",
+    "import requests\n",
     "import PyPDF2\n",
+    "import weaviate\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
     "from langchain.vectorstores import Weaviate"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "aggressive-trustee",
-   "metadata": {},
-   "source": [
-    "### Setup Variables\n",
-    "\n",
-    "- `pdf_file`: Path to which the PDF file exists.\n",
-    "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
-    "- `query`: The question that you need to ask the pdf\n",
-    "- `response`: The reply for the query from search "
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "continuous-melbourne",
    "metadata": {
-    "execution": {
-     "iopub.execute_input": "2023-07-19T08:42:16.501431Z",
-     "iopub.status.busy": "2023-07-19T08:42:16.500987Z",
-     "iopub.status.idle": "2023-07-19T08:42:16.675988Z",
-     "shell.execute_reply": "2023-07-19T08:42:16.667851Z",
-     "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z"
-    },
     "tags": []
    },
    "outputs": [],
    "source": [
     "# Inputs\n",
-    "pdf_file = \"./SWE NCG JD.pdf\"\n",
-    "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
-    "query = \"How much is the base pay?\"\n",
+    "pdf_file = \"\"\n",
+    "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+    "query = \"Enter your own\" or \"Summarize the PDF...\"\n",
     "\n",
     "# Outputs\n",
     "response = \"\""
@@ -210,41 +157,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": null,
    "id": "crude-louisville",
    "metadata": {
-    "execution": {
-     "iopub.execute_input": "2023-07-19T08:42:16.681433Z",
-     "iopub.status.busy": "2023-07-19T08:42:16.679730Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.072060Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.070305Z",
-     "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z"
-    },
     "papermill": {},
     "tags": []
    },
-   "outputs": [
-    {
-     "ename": "FileNotFoundError",
-     "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
-      "\u001b[0;32m<ipython-input-3-4522d8749e9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      7\u001b[0m         \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-      "\u001b[0;32m<ipython-input-3-4522d8749e9f>\u001b[0m in \u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m     \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m         \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m         \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m         \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "def extract_text_from_pdf(pdf_path):\n",
-    "    with open(pdf_path, \"rb\") as file:\n",
-    "        pdf = PyPDF2.PdfReader(file)\n",
-    "        text = []\n",
-    "        for page in pdf.pages:\n",
-    "            text.append(page.extract_text())\n",
-    "        return \" \".join(text)\n",
+    "    r = requests.get(pdf_path)\n",
+    "    f = io.BytesIO(r.content)\n",
+    "\n",
+    "    reader = PyPDF2.PdfReader(f)\n",
+    "    contents = []\n",
+    "    for page in reader.pages:\n",
+    "        content = page.extract_text()\n",
+    "        contents.append(content)\n",
+    "        \n",
+    "    contents = ' '.join(contents)\n",
+    "    return contents\n",
+    "    \n",
     "\n",
     "text = extract_text_from_pdf(pdf_file)"
    ]
@@ -262,11 +195,6 @@
    "execution_count": null,
    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
    "metadata": {
-    "execution": {
-     "iopub.status.busy": "2023-07-19T08:42:20.072988Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.073335Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.073164Z"
-    },
     "papermill": {},
     "tags": []
    },
@@ -295,11 +223,6 @@
    "execution_count": null,
    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
    "metadata": {
-    "execution": {
-     "iopub.status.busy": "2023-07-19T08:42:20.074098Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.074423Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.074257Z"
-    },
     "tags": []
    },
    "outputs": [],
@@ -323,15 +246,11 @@
    "execution_count": null,
    "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
    "metadata": {
-    "execution": {
-     "iopub.status.busy": "2023-07-19T08:42:20.075300Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.075641Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.075473Z"
-    }
+    "tags": []
    },
    "outputs": [],
    "source": [
-    "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
+    "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
    ]
   },
   {
@@ -347,11 +266,7 @@
    "execution_count": null,
    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
    "metadata": {
-    "execution": {
-     "iopub.status.busy": "2023-07-19T08:42:20.076441Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.076769Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.076600Z"
-    }
+    "tags": []
    },
    "outputs": [],
    "source": [
@@ -388,11 +303,7 @@
    "execution_count": null,
    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
    "metadata": {
-    "execution": {
-     "iopub.status.busy": "2023-07-19T08:42:20.077561Z",
-     "iopub.status.idle": "2023-07-19T08:42:20.077926Z",
-     "shell.execute_reply": "2023-07-19T08:42:20.077754Z"
-    }
+    "tags": []
    },
    "outputs": [],
    "source": [

From aa4b8a4a03c5d74402f8fed4f75db14a3e0adea8 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Thu, 27 Jul 2023 18:09:49 +0530
Subject: [PATCH 6/9] update: pdf url

---
 LangChain/LangChain_Vector_Search_on_PDF.ipynb | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 17b3c7fc6f..79a0ee81ee 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -29,7 +29,10 @@
     "# LangChain - Vector Search on PDF\n",
     "<a href=\"https://app.naas.ai/user-redirect/naas/downloader?url=https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/template.ipynb\" target=\"_parent\">\n",
     "<img src=\"https://naasai-public.s3.eu-west-3.amazonaws.com/open_in_naas.svg\"/>\n",
-    "</a><br><br><a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=template-request.md&title=Tool+-+Action+of+the+notebook+\">Template request</a> | <a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=bug_report.md&title=[ERROR]+Tool+/+Folder+Action+of+the+notebook+\">Bug report</a>"
+    "</a><a target=\"_blank\" href=\"https://colab.research.google.com/drive/1BhiqnWyHZxNfdD733QEvZIKpaz3ND663?usp=sharing\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>\n",
+    "<br><br><a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=template-request.md&title=Tool+-+Action+of+the+notebook+\">Template request</a> | <a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=bug_report.md&title=[ERROR]+Tool+/+Folder+Action+of+the+notebook+\">Bug report</a>"
    ]
   },
   {
@@ -131,9 +134,9 @@
    "outputs": [],
    "source": [
     "# Inputs\n",
-    "pdf_file = \"\"\n",
-    "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
-    "query = \"Enter your own\" or \"Summarize the PDF...\"\n",
+    "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n",
+    "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+    "query = \"\" or \"Summarize the PDF...\"\n",
     "\n",
     "# Outputs\n",
     "response = \"\""

From 1baf7586a396a03ed1b2a1a3789b4a5d0acf5e82 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:49:08 +0530
Subject: [PATCH 7/9] refactor code

---
 .../LangChain_Vector_Search_on_PDF.ipynb      | 144 +++++++++++++-----
 1 file changed, 105 insertions(+), 39 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 79a0ee81ee..9cf376788e 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -11,6 +11,7 @@
      "shell.execute_reply": "2021-02-23T14:22:16.626866Z",
      "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
     },
+    "id": "latin-packing",
     "papermill": {},
     "tags": []
    },
@@ -22,6 +23,7 @@
    "cell_type": "markdown",
    "id": "compressed-wilson",
    "metadata": {
+    "id": "compressed-wilson",
     "papermill": {},
     "tags": []
    },
@@ -38,7 +40,9 @@
   {
    "cell_type": "markdown",
    "id": "religious-programmer",
-   "metadata": {},
+   "metadata": {
+    "id": "religious-programmer"
+   },
    "source": [
     "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
    ]
@@ -46,7 +50,9 @@
   {
    "cell_type": "markdown",
    "id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
-   "metadata": {},
+   "metadata": {
+    "id": "1fe9f56e-561c-4f52-aef8-b861c9462107"
+   },
    "source": [
     "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
    ]
@@ -54,7 +60,9 @@
   {
    "cell_type": "markdown",
    "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
-   "metadata": {},
+   "metadata": {
+    "id": "31ea7cdb-e10d-43fc-b026-f69249a59736"
+   },
    "source": [
     "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n",
     "\n",
@@ -68,7 +76,9 @@
   {
    "cell_type": "markdown",
    "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
-   "metadata": {},
+   "metadata": {
+    "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0"
+   },
    "source": [
     "**References:**\n",
     "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
@@ -80,6 +90,7 @@
    "cell_type": "markdown",
    "id": "distinguished-truth",
    "metadata": {
+    "id": "distinguished-truth",
     "papermill": {},
     "tags": []
    },
@@ -90,7 +101,9 @@
   {
    "cell_type": "markdown",
    "id": "numeric-mediterranean",
-   "metadata": {},
+   "metadata": {
+    "id": "numeric-mediterranean"
+   },
    "source": [
     "### Import libraries"
    ]
@@ -100,6 +113,7 @@
    "execution_count": null,
    "id": "potential-surfing",
    "metadata": {
+    "id": "potential-surfing",
     "tags": []
    },
    "outputs": [],
@@ -108,20 +122,35 @@
     "    import langchain\n",
     "    import PyPDF2\n",
     "    import weaviate\n",
+    "    import openai\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
-    "    \n",
-    "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
-    "# !pip install sentence_transformers --user\n",
+    "    !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n",
     "\n",
     "import naas\n",
     "import io\n",
     "import requests\n",
     "import PyPDF2\n",
-    "import weaviate\n",
+    "import openai\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
-    "from langchain.vectorstores import Weaviate"
+    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+    "from langchain.vectorstores import Weaviate\n",
+    "from langchain.llms import OpenAI\n",
+    "from langchain.chains import RetrievalQA\n",
+    "from langchain.document_loaders import TextLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "kDfGd3KRPDP8",
+   "metadata": {
+    "id": "kDfGd3KRPDP8"
+   },
+   "outputs": [],
+   "source": [
+    "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+    "# !pip install -U sentence-transformers --user"
    ]
   },
   {
@@ -129,14 +158,16 @@
    "execution_count": null,
    "id": "continuous-melbourne",
    "metadata": {
+    "id": "continuous-melbourne",
     "tags": []
    },
    "outputs": [],
    "source": [
     "# Inputs\n",
-    "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n",
+    "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n",
     "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
-    "query = \"\" or \"Summarize the PDF...\"\n",
+    "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n",
+    "query = \"\" or \"Summarize the PDF\"\n",
     "\n",
     "# Outputs\n",
     "response = \"\""
@@ -145,7 +176,9 @@
   {
    "cell_type": "markdown",
    "id": "registered-showcase",
-   "metadata": {},
+   "metadata": {
+    "id": "registered-showcase"
+   },
    "source": [
     "## Model"
    ]
@@ -153,7 +186,9 @@
   {
    "cell_type": "markdown",
    "id": "tested-astrology",
-   "metadata": {},
+   "metadata": {
+    "id": "tested-astrology"
+   },
    "source": [
     "### Extract text from PDF"
    ]
@@ -163,6 +198,7 @@
    "execution_count": null,
    "id": "crude-louisville",
    "metadata": {
+    "id": "crude-louisville",
     "papermill": {},
     "tags": []
    },
@@ -177,10 +213,10 @@
     "    for page in reader.pages:\n",
     "        content = page.extract_text()\n",
     "        contents.append(content)\n",
-    "        \n",
+    "\n",
     "    contents = ' '.join(contents)\n",
     "    return contents\n",
-    "    \n",
+    "\n",
     "\n",
     "text = extract_text_from_pdf(pdf_file)"
    ]
@@ -188,7 +224,9 @@
   {
    "cell_type": "markdown",
    "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
-   "metadata": {},
+   "metadata": {
+    "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84"
+   },
    "source": [
     "### Split the text into chunks scraped from the PDF"
    ]
@@ -198,17 +236,13 @@
    "execution_count": null,
    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
    "metadata": {
+    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
     "papermill": {},
     "tags": []
    },
    "outputs": [],
    "source": [
-    "text_splitter = CharacterTextSplitter(\n",
-    "    separator = \"\\n\",\n",
-    "    chunk_size = 1000,\n",
-    "    chunk_overlap  = 200,\n",
-    "    length_function = len,\n",
-    ")\n",
+    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
     "\n",
     "texts = text_splitter.create_documents([text])"
    ]
@@ -216,7 +250,9 @@
   {
    "cell_type": "markdown",
    "id": "ef1720bf-a28a-4757-b189-7df97947c158",
-   "metadata": {},
+   "metadata": {
+    "id": "ef1720bf-a28a-4757-b189-7df97947c158"
+   },
    "source": [
     "### Create embeddings of the text make it compatible to store it in the database"
    ]
@@ -226,6 +262,7 @@
    "execution_count": null,
    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
    "metadata": {
+    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
     "tags": []
    },
    "outputs": [],
@@ -239,7 +276,9 @@
   {
    "cell_type": "markdown",
    "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
-   "metadata": {},
+   "metadata": {
+    "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89"
+   },
    "source": [
     "### Store the embeddings into the weaviate database"
    ]
@@ -249,17 +288,34 @@
    "execution_count": null,
    "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
    "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+    "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68",
     "tags": []
    },
    "outputs": [],
    "source": [
+    "# Delete existing schema if any present\n",
+    "client = weaviate.Client(url=weaviate_cluster_url )\n",
+    "\n",
+    "try:\n",
+    "    client.schema.delete_all()\n",
+    "    print(\"Schema deleted successfully...\")\n",
+    "except:\n",
+    "    print(\"Schema not deleted...\")\n",
+    "\n",
+    "# Store in the weaviate vector database\n",
     "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
-   "metadata": {},
+   "metadata": {
+    "id": "981fac74-2e1e-4b62-8b91-09d51d344bba"
+   },
    "source": [
     "### Get the closest response to the user query on the PDF"
    ]
@@ -269,12 +325,13 @@
    "execution_count": null,
    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
    "metadata": {
+    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
     "tags": []
    },
    "outputs": [],
    "source": [
-    "docs = db.similarity_search(query)\n",
-    "response = docs[0].page_content"
+    "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n",
+    "response = qa.run(query)"
    ]
   },
   {
@@ -287,7 +344,8 @@
      "iopub.status.idle": "2021-07-02T23:32:10.796900Z",
      "shell.execute_reply": "2021-07-02T23:32:10.796358Z",
      "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
-    }
+    },
+    "id": "lonely-pacific"
    },
    "source": [
     "## Output"
@@ -296,7 +354,9 @@
   {
    "cell_type": "markdown",
    "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
-   "metadata": {},
+   "metadata": {
+    "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd"
+   },
    "source": [
     "### Show the response"
    ]
@@ -306,6 +366,12 @@
    "execution_count": null,
    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
    "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 69
+    },
+    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
+    "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30",
     "tags": []
    },
    "outputs": [],
@@ -317,12 +383,19 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
-   "metadata": {},
+   "metadata": {
+    "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f"
+   },
    "outputs": [],
    "source": []
   }
  ],
  "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "T4",
+   "provenance": []
+  },
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
@@ -345,13 +418,6 @@
    "environment_variables": {},
    "parameters": {},
    "version": "2.3.3"
-  },
-  "widgets": {
-   "application/vnd.jupyter.widget-state+json": {
-    "state": {},
-    "version_major": 2,
-    "version_minor": 0
-   }
   }
  },
  "nbformat": 4,

From dd2a8ba14d1b66802a24f52dbba6fac7ff9e1130 Mon Sep 17 00:00:00 2001
From: Florent Ravenel <florent@naas.ai>
Date: Tue, 1 Aug 2023 09:06:01 +0200
Subject: [PATCH 8/9] feat: update PDF and test

---
 .../LangChain_Vector_Search_on_PDF.ipynb      | 206 ++++++++----------
 1 file changed, 89 insertions(+), 117 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 9cf376788e..a192b7a495 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -11,7 +11,6 @@
      "shell.execute_reply": "2021-02-23T14:22:16.626866Z",
      "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
     },
-    "id": "latin-packing",
     "papermill": {},
     "tags": []
    },
@@ -23,7 +22,6 @@
    "cell_type": "markdown",
    "id": "compressed-wilson",
    "metadata": {
-    "id": "compressed-wilson",
     "papermill": {},
     "tags": []
    },
@@ -31,8 +29,6 @@
     "# LangChain - Vector Search on PDF\n",
     "<a href=\"https://app.naas.ai/user-redirect/naas/downloader?url=https://raw.githubusercontent.com/jupyter-naas/awesome-notebooks/master/template.ipynb\" target=\"_parent\">\n",
     "<img src=\"https://naasai-public.s3.eu-west-3.amazonaws.com/open_in_naas.svg\"/>\n",
-    "</a><a target=\"_blank\" href=\"https://colab.research.google.com/drive/1BhiqnWyHZxNfdD733QEvZIKpaz3ND663?usp=sharing\">\n",
-    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
     "</a>\n",
     "<br><br><a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=template-request.md&title=Tool+-+Action+of+the+notebook+\">Template request</a> | <a href=\"https://github.com/jupyter-naas/awesome-notebooks/issues/new?assignees=&labels=&template=bug_report.md&title=[ERROR]+Tool+/+Folder+Action+of+the+notebook+\">Bug report</a>"
    ]
@@ -40,9 +36,7 @@
   {
    "cell_type": "markdown",
    "id": "religious-programmer",
-   "metadata": {
-    "id": "religious-programmer"
-   },
+   "metadata": {},
    "source": [
     "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
    ]
@@ -50,19 +44,26 @@
   {
    "cell_type": "markdown",
    "id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
-   "metadata": {
-    "id": "1fe9f56e-561c-4f52-aef8-b861c9462107"
-   },
+   "metadata": {},
    "source": [
     "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+   "id": "68c33d85-f522-44bb-9b2c-dec47a414f54",
    "metadata": {
-    "id": "31ea7cdb-e10d-43fc-b026-f69249a59736"
+    "papermill": {},
+    "tags": []
    },
+   "source": [
+    "**Last update:** 2023-07-31 (Created: 2023-07-10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+   "metadata": {},
    "source": [
     "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n",
     "\n",
@@ -70,15 +71,17 @@
     "- PyPDF2 - Get text from PDF\n",
     "- LangChain - Text splitter, document creation\n",
     "- HuggingFace - Embeddings\n",
-    "- Weaviate - Vector Database"
+    "- Weaviate - Vector Database\n",
+    "\n",
+    "<a target=\"_blank\" href=\"https://colab.research.google.com/drive/1BhiqnWyHZxNfdD733QEvZIKpaz3ND663?usp=sharing\">\n",
+    "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
+    "</a>"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
-   "metadata": {
-    "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0"
-   },
+   "metadata": {},
    "source": [
     "**References:**\n",
     "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
@@ -90,7 +93,6 @@
    "cell_type": "markdown",
    "id": "distinguished-truth",
    "metadata": {
-    "id": "distinguished-truth",
     "papermill": {},
     "tags": []
    },
@@ -101,9 +103,7 @@
   {
    "cell_type": "markdown",
    "id": "numeric-mediterranean",
-   "metadata": {
-    "id": "numeric-mediterranean"
-   },
+   "metadata": {},
    "source": [
     "### Import libraries"
    ]
@@ -113,7 +113,6 @@
    "execution_count": null,
    "id": "potential-surfing",
    "metadata": {
-    "id": "potential-surfing",
     "tags": []
    },
    "outputs": [],
@@ -122,35 +121,33 @@
     "    import langchain\n",
     "    import PyPDF2\n",
     "    import weaviate\n",
-    "    import openai\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n",
+    "    !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+    "    \n",
+    "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+    "# !pip install sentence_transformers --user\n",
     "\n",
+    "import os\n",
     "import naas\n",
     "import io\n",
     "import requests\n",
     "import PyPDF2\n",
-    "import openai\n",
+    "import weaviate\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
-    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
-    "from langchain.vectorstores import Weaviate\n",
-    "from langchain.llms import OpenAI\n",
-    "from langchain.chains import RetrievalQA\n",
-    "from langchain.document_loaders import TextLoader"
+    "from langchain.vectorstores import Weaviate"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "kDfGd3KRPDP8",
-   "metadata": {
-    "id": "kDfGd3KRPDP8"
-   },
-   "outputs": [],
+   "cell_type": "markdown",
+   "id": "64db5ac5-046f-4203-8503-990002927075",
+   "metadata": {},
    "source": [
-    "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
-    "# !pip install -U sentence-transformers --user"
+    "### Setup variables\n",
+    "- `pdf_file`: URL of the PDF file to download and search.\n",
+    "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
+    "- `weaviate_api_key`: Get your API key from your weaviate dashboard [here](https://console.weaviate.cloud/dashboard#)\n",
+    "- `query`: The question that you need to ask the pdf"
    ]
   },
   {
@@ -158,37 +155,48 @@
    "execution_count": null,
    "id": "continuous-melbourne",
    "metadata": {
-    "id": "continuous-melbourne",
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# Inputs\n",
-    "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n",
-    "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
-    "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n",
-    "query = \"\" or \"Summarize the PDF\"\n",
-    "\n",
-    "# Outputs\n",
-    "response = \"\""
+    "pdf_file = \"https://tesla-cdn.thron.com/static/SVCPTV_2022_Q4_Quarterly_Update_6UDS97.pdf?xseo=&response-content-disposition=inline%3Bfilename%3D%22b7871185-dd6a-4d79-9c3b-19b497227f2a.pdf%22\"\n",
+    "weaviate_api_key = naas.secret.get(\"WEAVIATE_API_KEY\")\n",
+    "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+    "query = \"What's the total revenue on Q4 2022?\""
    ]
   },
   {
    "cell_type": "markdown",
    "id": "registered-showcase",
-   "metadata": {
-    "id": "registered-showcase"
-   },
+   "metadata": {},
    "source": [
     "## Model"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "tested-astrology",
+   "id": "8ae9725c-161a-47f6-a115-7d74cee3bd2f",
+   "metadata": {},
+   "source": [
+    "### Setup environ"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd067008-9cf1-45b1-a6d1-c37627dc4976",
    "metadata": {
-    "id": "tested-astrology"
+    "tags": []
    },
+   "outputs": [],
+   "source": [
+    "os.environ[\"WEAVIATE_API_KEY\"] = weaviate_api_key"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "tested-astrology",
+   "metadata": {},
    "source": [
     "### Extract text from PDF"
    ]
@@ -198,7 +206,6 @@
    "execution_count": null,
    "id": "crude-louisville",
    "metadata": {
-    "id": "crude-louisville",
     "papermill": {},
     "tags": []
    },
@@ -213,20 +220,17 @@
     "    for page in reader.pages:\n",
     "        content = page.extract_text()\n",
     "        contents.append(content)\n",
-    "\n",
+    "        \n",
     "    contents = ' '.join(contents)\n",
     "    return contents\n",
-    "\n",
-    "\n",
+    "    \n",
     "text = extract_text_from_pdf(pdf_file)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
-   "metadata": {
-    "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84"
-   },
+   "metadata": {},
    "source": [
     "### Split the text into chunks scraped from the PDF"
    ]
@@ -236,23 +240,27 @@
    "execution_count": null,
    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
    "metadata": {
-    "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
     "papermill": {},
     "tags": []
    },
    "outputs": [],
    "source": [
-    "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+    "text_splitter = CharacterTextSplitter(\n",
+    "    separator = \"\\n\",\n",
+    "    chunk_size = 1000,\n",
+    "    chunk_overlap  = 200,\n",
+    "    length_function = len,\n",
+    ")\n",
     "\n",
-    "texts = text_splitter.create_documents([text])"
+    "texts = text_splitter.create_documents([text])\n",
+    "print(len(texts))\n",
+    "texts[0]"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "ef1720bf-a28a-4757-b189-7df97947c158",
-   "metadata": {
-    "id": "ef1720bf-a28a-4757-b189-7df97947c158"
-   },
+   "metadata": {},
    "source": [
     "### Create embeddings of the text make it compatible to store it in the database"
    ]
@@ -262,7 +270,6 @@
    "execution_count": null,
    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
    "metadata": {
-    "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
     "tags": []
    },
    "outputs": [],
@@ -270,15 +277,13 @@
     "embeddings = HuggingFaceEmbeddings()\n",
     "\n",
     "for i in range(len(texts)):\n",
-    "        query_result = embeddings.embed_query(texts[i].page_content)"
+    "    query_result = embeddings.embed_query(texts[i].page_content)"
    ]
   },
   {
    "cell_type": "markdown",
    "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
-   "metadata": {
-    "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89"
-   },
+   "metadata": {},
    "source": [
     "### Store the embeddings into the weaviate database"
    ]
@@ -286,26 +291,12 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+   "id": "6922b1d4-e394-493a-8549-07ba3c947e7d",
    "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/"
-    },
-    "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
-    "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68",
     "tags": []
    },
    "outputs": [],
    "source": [
-    "# Delete existing schema if any present\n",
-    "client = weaviate.Client(url=weaviate_cluster_url )\n",
-    "\n",
-    "try:\n",
-    "    client.schema.delete_all()\n",
-    "    print(\"Schema deleted successfully...\")\n",
-    "except:\n",
-    "    print(\"Schema not deleted...\")\n",
-    "\n",
     "# Store in the weaviate vector database\n",
     "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
    ]
@@ -313,9 +304,7 @@
   {
    "cell_type": "markdown",
    "id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
-   "metadata": {
-    "id": "981fac74-2e1e-4b62-8b91-09d51d344bba"
-   },
+   "metadata": {},
    "source": [
     "### Get the closest response to the user query on the PDF"
    ]
@@ -325,13 +314,12 @@
    "execution_count": null,
    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
    "metadata": {
-    "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
     "tags": []
    },
    "outputs": [],
    "source": [
-    "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n",
-    "response = qa.run(query)"
+    "docs = db.similarity_search(query)\n",
+    "docs"
    ]
   },
   {
@@ -344,8 +332,7 @@
      "iopub.status.idle": "2021-07-02T23:32:10.796900Z",
      "shell.execute_reply": "2021-07-02T23:32:10.796358Z",
      "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
-    },
-    "id": "lonely-pacific"
+    }
    },
    "source": [
     "## Output"
@@ -354,9 +341,7 @@
   {
    "cell_type": "markdown",
    "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
-   "metadata": {
-    "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd"
-   },
+   "metadata": {},
    "source": [
     "### Show the response"
    ]
@@ -366,36 +351,16 @@
    "execution_count": null,
    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
    "metadata": {
-    "colab": {
-     "base_uri": "https://localhost:8080/",
-     "height": 69
-    },
-    "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
-    "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30",
     "tags": []
    },
    "outputs": [],
    "source": [
+    "response = docs[0].page_content\n",
     "response"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
-   "metadata": {
-    "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f"
-   },
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
-  "accelerator": "GPU",
-  "colab": {
-   "gpuType": "T4",
-   "provenance": []
-  },
   "kernelspec": {
    "display_name": "Python 3",
    "language": "python",
@@ -418,6 +383,13 @@
    "environment_variables": {},
    "parameters": {},
    "version": "2.3.3"
+  },
+  "widgets": {
+   "application/vnd.jupyter.widget-state+json": {
+    "state": {},
+    "version_major": 2,
+    "version_minor": 0
+   }
   }
  },
  "nbformat": 4,

From 4840802b5c50e6e1b1dd48c7a63c365fc32b02b2 Mon Sep 17 00:00:00 2001
From: Florent Ravenel <florent@naas.ai>
Date: Wed, 27 Sep 2023 14:31:22 +0200
Subject: [PATCH 9/9] fix: install lib

---
 .../LangChain_Vector_Search_on_PDF.ipynb      | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index a192b7a495..f6e25ed743 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -57,7 +57,7 @@
     "tags": []
    },
    "source": [
-    "**Last update:** 2023-07-31 (Created: 2023-07-10)"
+    "**Last update:** 2023-09-27 (Created: 2023-07-10)"
    ]
   },
   {
@@ -119,10 +119,19 @@
    "source": [
     "try:\n",
     "    import langchain\n",
+    "except ModuleNotFoundError:\n",
+    "    !pip install langchain --user\n",
+    "    import langchain\n",
+    "try:\n",
     "    import PyPDF2\n",
+    "except ModuleNotFoundError:\n",
+    "    !pip install PyPDF2 --user\n",
+    "    import PyPDF2\n",
+    "try:\n",
     "    import weaviate\n",
     "except ModuleNotFoundError:\n",
-    "    !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+    "    !pip install weaviate-client==3.20.0 --user\n",
+    "    import weaviate\n",
     "    \n",
     "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
     "# !pip install sentence_transformers --user\n",
@@ -131,8 +140,6 @@
     "import naas\n",
     "import io\n",
     "import requests\n",
-    "import PyPDF2\n",
-    "import weaviate\n",
     "from langchain.text_splitter import CharacterTextSplitter\n",
     "from langchain.embeddings import HuggingFaceEmbeddings\n",
     "from langchain.vectorstores import Weaviate"
@@ -358,6 +365,14 @@
     "response = docs[0].page_content\n",
     "response"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e2bc7f1-acf9-402b-b0aa-93de14764f8b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {