From 1a621565cb7f773f90f10063f7324ecbbbb87651 Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Wed, 12 Jul 2023 22:03:37 +0530 Subject: [PATCH 1/9] add: langchain vector search pdf template --- .../Langchain_Vector_Search_on_PDF.ipynb | 344 ++++++++++++++++++ 1 file changed, 344 insertions(+) create mode 100644 LangChain/Langchain_Vector_Search_on_PDF.ipynb diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb new file mode 100644 index 0000000000..ebe4599064 --- /dev/null +++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb @@ -0,0 +1,344 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "latin-packing", + "metadata": { + "execution": { + "iopub.execute_input": "2021-02-23T14:22:16.610471Z", + "iopub.status.busy": "2021-02-23T14:22:16.610129Z", + "iopub.status.idle": "2021-02-23T14:22:16.627784Z", + "shell.execute_reply": "2021-02-23T14:22:16.626866Z", + "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z" + }, + "papermill": {}, + "tags": [] + }, + "source": [ + "\"Naas\"" + ] + }, + { + "cell_type": "markdown", + "id": "compressed-wilson", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# Tool - Action of the notebook\n", + "\n", + "\n", + "

Template request | Bug report" + ] + }, + { + "cell_type": "markdown", + "id": "religious-programmer", + "metadata": {}, + "source": [ + "**Tags:** #langchain #pdf #weaviate #huggingface" + ] + }, + { + "cell_type": "markdown", + "id": "1fe9f56e-561c-4f52-aef8-b861c9462107", + "metadata": {}, + "source": [ + "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)" + ] + }, + { + "cell_type": "markdown", + "id": "31ea7cdb-e10d-43fc-b026-f69249a59736", + "metadata": {}, + "source": [ + "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n", + "\n", + "It uses:\n", + "- PyPDF2 - Get text from PDF\n", + "- LangChain - Text splitter, document creation\n", + "- HuggingFace - Embeddings\n", + "- Weaviate - Vector Database" + ] + }, + { + "cell_type": "markdown", + "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0", + "metadata": {}, + "source": [ + "### References\n", + "\n", + "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n", + "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n", + "- [Huggingface docs](https://huggingface.co/docs)" + ] + }, + { + "cell_type": "markdown", + "id": "distinguished-truth", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "numeric-mediterranean", + "metadata": {}, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "potential-surfing", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "try:\n", + " import langchain\n", + " import PyPDF2\n", + "except ModuleNotFoundError:\n", + " !pip install langchain PyPDF2\n", + "\n", + "!pip install sentence_transformers --user\n", + "import naas\n", + "import PyPDF2\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.embeddings import 
HuggingFaceEmbeddings\n", + "from langchain.vectorstores import Weaviate" + ] + }, + { + "cell_type": "markdown", + "id": "aggressive-trustee", + "metadata": {}, + "source": [ + "### Setup Variables\n", + "\n", + "- `pdf_file`: Path to which the PDF file exists.\n", + "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n", + "- `query`: The question that you need to ask the pdf\n", + "- `response`: The reply for the query from search " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "continuous-melbourne", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#inputs\n", + "pdf_file = \"./SWE NCG JD.pdf\"\n", + "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "query = \"How much is the base pay?\"\n", + "\n", + "#outputs\n", + "response = \"\"" + ] + }, + { + "cell_type": "markdown", + "id": "registered-showcase", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "tested-astrology", + "metadata": {}, + "source": [ + "### Extract text from PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "crude-louisville", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "def extract_text_from_pdf(pdf_path):\n", + " with open(pdf_path, \"rb\") as file:\n", + " pdf = PyPDF2.PdfReader(file)\n", + " text = []\n", + " for page in pdf.pages:\n", + " text.append(page.extract_text())\n", + " return \" \".join(text)\n", + "\n", + "text = extract_text_from_pdf(pdf_file)" + ] + }, + { + "cell_type": "markdown", + "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84", + "metadata": {}, + "source": [ + "### Split the text into chunks scraped from the PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + 
"source": [ + "text_splitter = CharacterTextSplitter(\n", + " separator = \"\\n\",\n", + " chunk_size = 1000,\n", + " chunk_overlap = 200,\n", + " length_function = len,\n", + ")\n", + "\n", + "texts = text_splitter.create_documents([text])" + ] + }, + { + "cell_type": "markdown", + "id": "ef1720bf-a28a-4757-b189-7df97947c158", + "metadata": {}, + "source": [ + "### Create embeddings of the text make it compatible to store it in the database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "embeddings = HuggingFaceEmbeddings()\n", + "\n", + "for i in range(len(texts)):\n", + " query_result = embeddings.embed_query(texts[i].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89", + "metadata": {}, + "source": [ + "### Store the embeddings into the weaviate database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", + "metadata": {}, + "outputs": [], + "source": [ + "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)" + ] + }, + { + "cell_type": "markdown", + "id": "981fac74-2e1e-4b62-8b91-09d51d344bba", + "metadata": {}, + "source": [ + "### Get the closest response to the user query on the PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", + "metadata": {}, + "outputs": [], + "source": [ + "docs = db.similarity_search(query)\n", + "response = docs[0].page_content" + ] + }, + { + "cell_type": "markdown", + "id": "lonely-pacific", + "metadata": { + "execution": { + "iopub.execute_input": "2021-07-02T23:32:10.789097Z", + "iopub.status.busy": "2021-07-02T23:32:10.788829Z", + "iopub.status.idle": "2021-07-02T23:32:10.796900Z", + "shell.execute_reply": "2021-07-02T23:32:10.796358Z", + 
"shell.execute_reply.started": "2021-07-02T23:32:10.789033Z" + } + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd", + "metadata": {}, + "source": [ + "### Show the response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", + "metadata": {}, + "outputs": [], + "source": [ + "response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "papermill": { + "default_parameters": {}, + "environment_variables": {}, + "parameters": {}, + "version": "2.3.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 932275ea2197072dec69cfe944215b57b32441ae Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Wed, 12 Jul 2023 22:07:20 +0530 Subject: [PATCH 2/9] fix: .env exposure issue --- LangChain/Langchain_Vector_Search_on_PDF.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb index ebe4599064..6f023f30e9 100644 --- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb +++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb @@ -250,7 +250,7 @@ "metadata": {}, "outputs": [], "source": [ - "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)" + "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)" ] }, { From 
dea4487b08d641e6b16fd7fb8dad29941385750a Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Wed, 19 Jul 2023 10:29:51 +0200 Subject: [PATCH 3/9] feat: rename notebook, update title, references tipo --- ....ipynb => LangChain_Vector_Search_on_PDF.ipynb} | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) rename LangChain/{Langchain_Vector_Search_on_PDF.ipynb => LangChain_Vector_Search_on_PDF.ipynb} (97%) diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb similarity index 97% rename from LangChain/Langchain_Vector_Search_on_PDF.ipynb rename to LangChain/LangChain_Vector_Search_on_PDF.ipynb index 6f023f30e9..9bbb385792 100644 --- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -26,7 +26,7 @@ "tags": [] }, "source": [ - "# Tool - Action of the notebook\n", + "# LangChain - Vector Search on PDF\n", "\n", "\n", "

Template request | Bug report" @@ -67,8 +67,7 @@ "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0", "metadata": {}, "source": [ - "### References\n", - "\n", + "**References:**\n", "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n", "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n", "- [Huggingface docs](https://huggingface.co/docs)" @@ -98,6 +97,10 @@ "execution_count": null, "id": "potential-surfing", "metadata": { + "execution": { + "iopub.execute_input": "2023-07-19T08:29:06.271389Z", + "iopub.status.busy": "2023-07-19T08:29:06.271116Z" + }, "tags": [] }, "outputs": [], @@ -107,7 +110,6 @@ " import PyPDF2\n", "except ModuleNotFoundError:\n", " !pip install langchain PyPDF2\n", - "\n", "!pip install sentence_transformers --user\n", "import naas\n", "import PyPDF2\n", @@ -138,12 +140,12 @@ }, "outputs": [], "source": [ - "#inputs\n", + "# Inputs\n", "pdf_file = \"./SWE NCG JD.pdf\"\n", "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", "query = \"How much is the base pay?\"\n", "\n", - "#outputs\n", + "# Outputs\n", "response = \"\"" ] }, From 5399ff33aded3d9d23e4db4ceae84280b976c51f Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Wed, 19 Jul 2023 10:49:55 +0200 Subject: [PATCH 4/9] feat: test --- .../LangChain_Vector_Search_on_PDF.ipynb | 121 ++++++++++++++++-- 1 file changed, 110 insertions(+), 11 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index 9bbb385792..93054e98da 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -94,16 +94,52 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "potential-surfing", "metadata": { "execution": { - "iopub.execute_input": "2023-07-19T08:29:06.271389Z", - "iopub.status.busy": "2023-07-19T08:29:06.271116Z" + "iopub.execute_input": "2023-07-19T08:41:33.179611Z", + 
"iopub.status.busy": "2023-07-19T08:41:33.179183Z", + "iopub.status.idle": "2023-07-19T08:42:16.496262Z", + "shell.execute_reply": "2023-07-19T08:42:16.491276Z", + "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z" }, "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sentence_transformers\n", + " Using cached sentence_transformers-2.2.2-py3-none-any.whl\n", + "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n", + "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n", + "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n", + "Collecting torchvision (from sentence_transformers)\n", + " Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n", + "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n", + "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n", + "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n", + "Requirement already satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n", + "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n", + "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n", + "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n", + "Requirement already satisfied: fsspec in 
/opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n", + "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n", + "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n", + "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n", + "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n", + "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n", + "Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n", + "Collecting torch>=1.6.0 (from sentence_transformers)\n" + ] + } + ], "source": [ "try:\n", " import langchain\n", @@ -133,16 +169,23 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "continuous-melbourne", "metadata": { + "execution": { + "iopub.execute_input": "2023-07-19T08:42:16.501431Z", + "iopub.status.busy": 
"2023-07-19T08:42:16.500987Z", + "iopub.status.idle": "2023-07-19T08:42:16.675988Z", + "shell.execute_reply": "2023-07-19T08:42:16.667851Z", + "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z" + }, "tags": [] }, "outputs": [], "source": [ "# Inputs\n", "pdf_file = \"./SWE NCG JD.pdf\"\n", - "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", "query = \"How much is the base pay?\"\n", "\n", "# Outputs\n", @@ -167,13 +210,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "crude-louisville", "metadata": { + "execution": { + "iopub.execute_input": "2023-07-19T08:42:16.681433Z", + "iopub.status.busy": "2023-07-19T08:42:16.679730Z", + "iopub.status.idle": "2023-07-19T08:42:20.072060Z", + "shell.execute_reply": "2023-07-19T08:42:20.070305Z", + "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z" + }, "papermill": {}, "tags": [] }, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m\u001b[0m in 
\u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'" + ] + } + ], "source": [ "def extract_text_from_pdf(pdf_path):\n", " with open(pdf_path, \"rb\") as file:\n", @@ -199,6 +262,11 @@ "execution_count": null, "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", "metadata": { + "execution": { + "iopub.status.busy": "2023-07-19T08:42:20.072988Z", + "iopub.status.idle": "2023-07-19T08:42:20.073335Z", + "shell.execute_reply": "2023-07-19T08:42:20.073164Z" + }, "papermill": {}, "tags": [] }, @@ -227,6 +295,11 @@ "execution_count": null, "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "metadata": { + "execution": { + "iopub.status.busy": 
"2023-07-19T08:42:20.074098Z", + "iopub.status.idle": "2023-07-19T08:42:20.074423Z", + "shell.execute_reply": "2023-07-19T08:42:20.074257Z" + }, "tags": [] }, "outputs": [], @@ -249,7 +322,13 @@ "cell_type": "code", "execution_count": null, "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", - "metadata": {}, + "metadata": { + "execution": { + "iopub.status.busy": "2023-07-19T08:42:20.075300Z", + "iopub.status.idle": "2023-07-19T08:42:20.075641Z", + "shell.execute_reply": "2023-07-19T08:42:20.075473Z" + } + }, "outputs": [], "source": [ "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)" @@ -267,7 +346,13 @@ "cell_type": "code", "execution_count": null, "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", - "metadata": {}, + "metadata": { + "execution": { + "iopub.status.busy": "2023-07-19T08:42:20.076441Z", + "iopub.status.idle": "2023-07-19T08:42:20.076769Z", + "shell.execute_reply": "2023-07-19T08:42:20.076600Z" + } + }, "outputs": [], "source": [ "docs = db.similarity_search(query)\n", @@ -302,11 +387,25 @@ "cell_type": "code", "execution_count": null, "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", - "metadata": {}, + "metadata": { + "execution": { + "iopub.status.busy": "2023-07-19T08:42:20.077561Z", + "iopub.status.idle": "2023-07-19T08:42:20.077926Z", + "shell.execute_reply": "2023-07-19T08:42:20.077754Z" + } + }, "outputs": [], "source": [ "response" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From e737ecb5ab9ccf2b7a2ed3f05d9066fd7e6ecc3a Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Fri, 21 Jul 2023 21:13:38 +0530 Subject: [PATCH 5/9] fix: pdf file_path issue --- .../LangChain_Vector_Search_on_PDF.ipynb | 157 ++++-------------- 1 file changed, 34 insertions(+), 123 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb 
b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index 93054e98da..17b3c7fc6f 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -37,7 +37,7 @@ "id": "religious-programmer", "metadata": {}, "source": [ - "**Tags:** #langchain #pdf #weaviate #huggingface" + "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings" ] }, { @@ -94,99 +94,46 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "potential-surfing", "metadata": { - "execution": { - "iopub.execute_input": "2023-07-19T08:41:33.179611Z", - "iopub.status.busy": "2023-07-19T08:41:33.179183Z", - "iopub.status.idle": "2023-07-19T08:42:16.496262Z", - "shell.execute_reply": "2023-07-19T08:42:16.491276Z", - "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z" - }, "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting sentence_transformers\n", - " Using cached sentence_transformers-2.2.2-py3-none-any.whl\n", - "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n", - "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n", - "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n", - "Collecting torchvision (from sentence_transformers)\n", - " Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n", - "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n", - "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n", - "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n", - "Requirement already 
satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n", - "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n", - "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n", - "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n", - "Requirement already satisfied: fsspec in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n", - "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n", - "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n", - "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n", - "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n", - "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n", - "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n", - "Requirement already satisfied: joblib in 
/opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n", - "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n", - "Collecting torch>=1.6.0 (from sentence_transformers)\n" - ] - } - ], + "outputs": [], "source": [ "try:\n", " import langchain\n", " import PyPDF2\n", + " import weaviate\n", "except ModuleNotFoundError:\n", - " !pip install langchain PyPDF2\n", - "!pip install sentence_transformers --user\n", + " !pip install langchain PyPDF2 weaviate-client==3.20.0\n", + " \n", + "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", + "# !pip install sentence_transformers --user\n", + "\n", "import naas\n", + "import io\n", + "import requests\n", "import PyPDF2\n", + "import weaviate\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "from langchain.vectorstores import Weaviate" ] }, - { - "cell_type": "markdown", - "id": "aggressive-trustee", - "metadata": {}, - "source": [ - "### Setup Variables\n", - "\n", - "- `pdf_file`: Path to which the PDF file exists.\n", - "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n", - "- `query`: The question that you need to ask the pdf\n", - "- `response`: The reply for the query from search " - ] - }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "continuous-melbourne", "metadata": { - "execution": { - "iopub.execute_input": "2023-07-19T08:42:16.501431Z", - "iopub.status.busy": "2023-07-19T08:42:16.500987Z", - "iopub.status.idle": "2023-07-19T08:42:16.675988Z", - "shell.execute_reply": "2023-07-19T08:42:16.667851Z", - "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z" - }, "tags": [] }, 
"outputs": [], "source": [ "# Inputs\n", - "pdf_file = \"./SWE NCG JD.pdf\"\n", - "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", - "query = \"How much is the base pay?\"\n", + "pdf_file = \"\"\n", + "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "query = \"Enter your own\" or \"Summarize the PDF...\"\n", "\n", "# Outputs\n", "response = \"\"" @@ -210,41 +157,27 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "crude-louisville", "metadata": { - "execution": { - "iopub.execute_input": "2023-07-19T08:42:16.681433Z", - "iopub.status.busy": "2023-07-19T08:42:16.679730Z", - "iopub.status.idle": "2023-07-19T08:42:20.072060Z", - "shell.execute_reply": "2023-07-19T08:42:20.070305Z", - "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z" - }, "papermill": {}, "tags": [] }, - "outputs": [ - { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m\u001b[0m in \u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m 
\u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'" - ] - } - ], + "outputs": [], "source": [ "def extract_text_from_pdf(pdf_path):\n", - " with open(pdf_path, \"rb\") as file:\n", - " pdf = PyPDF2.PdfReader(file)\n", - " text = []\n", - " for page in pdf.pages:\n", - " text.append(page.extract_text())\n", - " return \" \".join(text)\n", + " r = requests.get(pdf_path)\n", + " f = io.BytesIO(r.content)\n", + "\n", + " reader = PyPDF2.PdfReader(f)\n", + " contents = []\n", + " for page in reader.pages:\n", + " content = page.extract_text()\n", + " contents.append(content)\n", + " \n", + " contents = ' '.join(contents)\n", + " return contents\n", + " \n", "\n", "text = extract_text_from_pdf(pdf_file)" ] @@ -262,11 +195,6 @@ "execution_count": null, "id": 
"f9e8e197-e965-441c-9512-9b28ed079ee6", "metadata": { - "execution": { - "iopub.status.busy": "2023-07-19T08:42:20.072988Z", - "iopub.status.idle": "2023-07-19T08:42:20.073335Z", - "shell.execute_reply": "2023-07-19T08:42:20.073164Z" - }, "papermill": {}, "tags": [] }, @@ -295,11 +223,6 @@ "execution_count": null, "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "metadata": { - "execution": { - "iopub.status.busy": "2023-07-19T08:42:20.074098Z", - "iopub.status.idle": "2023-07-19T08:42:20.074423Z", - "shell.execute_reply": "2023-07-19T08:42:20.074257Z" - }, "tags": [] }, "outputs": [], @@ -323,15 +246,11 @@ "execution_count": null, "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", "metadata": { - "execution": { - "iopub.status.busy": "2023-07-19T08:42:20.075300Z", - "iopub.status.idle": "2023-07-19T08:42:20.075641Z", - "shell.execute_reply": "2023-07-19T08:42:20.075473Z" - } + "tags": [] }, "outputs": [], "source": [ - "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)" + "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)" ] }, { @@ -347,11 +266,7 @@ "execution_count": null, "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", "metadata": { - "execution": { - "iopub.status.busy": "2023-07-19T08:42:20.076441Z", - "iopub.status.idle": "2023-07-19T08:42:20.076769Z", - "shell.execute_reply": "2023-07-19T08:42:20.076600Z" - } + "tags": [] }, "outputs": [], "source": [ @@ -388,11 +303,7 @@ "execution_count": null, "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", "metadata": { - "execution": { - "iopub.status.busy": "2023-07-19T08:42:20.077561Z", - "iopub.status.idle": "2023-07-19T08:42:20.077926Z", - "shell.execute_reply": "2023-07-19T08:42:20.077754Z" - } + "tags": [] }, "outputs": [], "source": [ From aa4b8a4a03c5d74402f8fed4f75db14a3e0adea8 Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Thu, 27 Jul 2023 18:09:49 +0530 Subject: [PATCH 6/9] update: pdf url --- 
LangChain/LangChain_Vector_Search_on_PDF.ipynb | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index 17b3c7fc6f..79a0ee81ee 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -29,7 +29,10 @@ "# LangChain - Vector Search on PDF\n", "\n", "\n", - "

Template request | Bug report" + "\n", + " \"Open\n", + "\n", + "

Template request | Bug report" ] }, { @@ -131,9 +134,9 @@ "outputs": [], "source": [ "# Inputs\n", - "pdf_file = \"\"\n", - "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", - "query = \"Enter your own\" or \"Summarize the PDF...\"\n", + "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n", + "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "query = \"\" or \"Summarize the PDF...\"\n", "\n", "# Outputs\n", "response = \"\"" From 1baf7586a396a03ed1b2a1a3789b4a5d0acf5e82 Mon Sep 17 00:00:00 2001 From: Sriniketh J <81156510+srini047@users.noreply.github.com> Date: Fri, 28 Jul 2023 17:49:08 +0530 Subject: [PATCH 7/9] refactor code --- .../LangChain_Vector_Search_on_PDF.ipynb | 144 +++++++++++++----- 1 file changed, 105 insertions(+), 39 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index 79a0ee81ee..9cf376788e 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -11,6 +11,7 @@ "shell.execute_reply": "2021-02-23T14:22:16.626866Z", "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z" }, + "id": "latin-packing", "papermill": {}, "tags": [] }, @@ -22,6 +23,7 @@ "cell_type": "markdown", "id": "compressed-wilson", "metadata": { + "id": "compressed-wilson", "papermill": {}, "tags": [] }, @@ -38,7 +40,9 @@ { "cell_type": "markdown", "id": "religious-programmer", - "metadata": {}, + "metadata": { + "id": "religious-programmer" + }, "source": [ "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings" ] @@ -46,7 +50,9 @@ { "cell_type": "markdown", "id": "1fe9f56e-561c-4f52-aef8-b861c9462107", - "metadata": {}, + "metadata": { + "id": "1fe9f56e-561c-4f52-aef8-b861c9462107" + }, "source": [ "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)" ] @@ -54,7 +60,9 @@ { "cell_type": "markdown", 
"id": "31ea7cdb-e10d-43fc-b026-f69249a59736", - "metadata": {}, + "metadata": { + "id": "31ea7cdb-e10d-43fc-b026-f69249a59736" + }, "source": [ "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n", "\n", @@ -68,7 +76,9 @@ { "cell_type": "markdown", "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0", - "metadata": {}, + "metadata": { + "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0" + }, "source": [ "**References:**\n", "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n", @@ -80,6 +90,7 @@ "cell_type": "markdown", "id": "distinguished-truth", "metadata": { + "id": "distinguished-truth", "papermill": {}, "tags": [] }, @@ -90,7 +101,9 @@ { "cell_type": "markdown", "id": "numeric-mediterranean", - "metadata": {}, + "metadata": { + "id": "numeric-mediterranean" + }, "source": [ "### Import libraries" ] @@ -100,6 +113,7 @@ "execution_count": null, "id": "potential-surfing", "metadata": { + "id": "potential-surfing", "tags": [] }, "outputs": [], @@ -108,20 +122,35 @@ " import langchain\n", " import PyPDF2\n", " import weaviate\n", + " import openai\n", "except ModuleNotFoundError:\n", - " !pip install langchain PyPDF2 weaviate-client==3.20.0\n", - " \n", - "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", - "# !pip install sentence_transformers --user\n", + " !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n", "\n", "import naas\n", "import io\n", "import requests\n", "import PyPDF2\n", - "import weaviate\n", + "import openai\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", - "from langchain.vectorstores import Weaviate" + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import Weaviate\n", + "from 
langchain.llms import OpenAI\n", + "from langchain.chains import RetrievalQA\n", + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "kDfGd3KRPDP8", + "metadata": { + "id": "kDfGd3KRPDP8" + }, + "outputs": [], + "source": [ + "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", + "# !pip install -U sentence-transformers --user" ] }, { @@ -129,14 +158,16 @@ "execution_count": null, "id": "continuous-melbourne", "metadata": { + "id": "continuous-melbourne", "tags": [] }, "outputs": [], "source": [ "# Inputs\n", - "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n", + "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n", "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", - "query = \"\" or \"Summarize the PDF...\"\n", + "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n", + "query = \"\" or \"Summarize the PDF\"\n", "\n", "# Outputs\n", "response = \"\"" @@ -145,7 +176,9 @@ { "cell_type": "markdown", "id": "registered-showcase", - "metadata": {}, + "metadata": { + "id": "registered-showcase" + }, "source": [ "## Model" ] @@ -153,7 +186,9 @@ { "cell_type": "markdown", "id": "tested-astrology", - "metadata": {}, + "metadata": { + "id": "tested-astrology" + }, "source": [ "### Extract text from PDF" ] @@ -163,6 +198,7 @@ "execution_count": null, "id": "crude-louisville", "metadata": { + "id": "crude-louisville", "papermill": {}, "tags": [] }, @@ -177,10 +213,10 @@ " for page in reader.pages:\n", " content = page.extract_text()\n", " contents.append(content)\n", - " \n", + "\n", " contents = ' '.join(contents)\n", " return contents\n", - " \n", + "\n", "\n", "text = extract_text_from_pdf(pdf_file)" ] @@ -188,7 +224,9 @@ { "cell_type": "markdown", "id": 
"8daa42c1-3a2b-4f96-a7dd-fb1deb395a84", - "metadata": {}, + "metadata": { + "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84" + }, "source": [ "### Split the text into chunks scraped from the PDF" ] @@ -198,17 +236,13 @@ "execution_count": null, "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", "metadata": { + "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", "papermill": {}, "tags": [] }, "outputs": [], "source": [ - "text_splitter = CharacterTextSplitter(\n", - " separator = \"\\n\",\n", - " chunk_size = 1000,\n", - " chunk_overlap = 200,\n", - " length_function = len,\n", - ")\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", "\n", "texts = text_splitter.create_documents([text])" ] @@ -216,7 +250,9 @@ { "cell_type": "markdown", "id": "ef1720bf-a28a-4757-b189-7df97947c158", - "metadata": {}, + "metadata": { + "id": "ef1720bf-a28a-4757-b189-7df97947c158" + }, "source": [ "### Create embeddings of the text make it compatible to store it in the database" ] @@ -226,6 +262,7 @@ "execution_count": null, "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "metadata": { + "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "tags": [] }, "outputs": [], @@ -239,7 +276,9 @@ { "cell_type": "markdown", "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89", - "metadata": {}, + "metadata": { + "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89" + }, "source": [ "### Store the embeddings into the weaviate database" ] @@ -249,17 +288,34 @@ "execution_count": null, "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", + "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68", "tags": [] }, "outputs": [], "source": [ + "# Delete existing schema if any present\n", + "client = weaviate.Client(url=weaviate_cluster_url )\n", + "\n", + "try:\n", + " client.schema.delete_all()\n", + " print(\"Schema deleted successfully...\")\n", + "except:\n", + " print(\"Schema not deleted...\")\n", + 
"\n", + "# Store in the weaviate vector database\n", "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)" ] }, { "cell_type": "markdown", "id": "981fac74-2e1e-4b62-8b91-09d51d344bba", - "metadata": {}, + "metadata": { + "id": "981fac74-2e1e-4b62-8b91-09d51d344bba" + }, "source": [ "### Get the closest response to the user query on the PDF" ] @@ -269,12 +325,13 @@ "execution_count": null, "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", "metadata": { + "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", "tags": [] }, "outputs": [], "source": [ - "docs = db.similarity_search(query)\n", - "response = docs[0].page_content" + "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n", + "response = qa.run(query)" ] }, { @@ -287,7 +344,8 @@ "iopub.status.idle": "2021-07-02T23:32:10.796900Z", "shell.execute_reply": "2021-07-02T23:32:10.796358Z", "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z" - } + }, + "id": "lonely-pacific" }, "source": [ "## Output" @@ -296,7 +354,9 @@ { "cell_type": "markdown", "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd", - "metadata": {}, + "metadata": { + "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd" + }, "source": [ "### Show the response" ] @@ -306,6 +366,12 @@ "execution_count": null, "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", + "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30", "tags": [] }, "outputs": [], @@ -317,12 +383,19 @@ "cell_type": "code", "execution_count": null, "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f", - "metadata": {}, + "metadata": { + "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f" + }, "outputs": [], "source": [] } ], "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, "kernelspec": { "display_name": "Python 3", 
"language": "python", @@ -345,13 +418,6 @@ "environment_variables": {}, "parameters": {}, "version": "2.3.3" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "state": {}, - "version_major": 2, - "version_minor": 0 - } } }, "nbformat": 4, From dd2a8ba14d1b66802a24f52dbba6fac7ff9e1130 Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Tue, 1 Aug 2023 09:06:01 +0200 Subject: [PATCH 8/9] feat: update PDF and test --- .../LangChain_Vector_Search_on_PDF.ipynb | 206 ++++++++---------- 1 file changed, 89 insertions(+), 117 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index 9cf376788e..a192b7a495 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -11,7 +11,6 @@ "shell.execute_reply": "2021-02-23T14:22:16.626866Z", "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z" }, - "id": "latin-packing", "papermill": {}, "tags": [] }, @@ -23,7 +22,6 @@ "cell_type": "markdown", "id": "compressed-wilson", "metadata": { - "id": "compressed-wilson", "papermill": {}, "tags": [] }, @@ -31,8 +29,6 @@ "# LangChain - Vector Search on PDF\n", "\n", "\n", - "\n", - " \"Open\n", "\n", "

Template request | Bug report" ] @@ -40,9 +36,7 @@ { "cell_type": "markdown", "id": "religious-programmer", - "metadata": { - "id": "religious-programmer" - }, + "metadata": {}, "source": [ "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings" ] @@ -50,19 +44,26 @@ { "cell_type": "markdown", "id": "1fe9f56e-561c-4f52-aef8-b861c9462107", - "metadata": { - "id": "1fe9f56e-561c-4f52-aef8-b861c9462107" - }, + "metadata": {}, "source": [ "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)" ] }, { "cell_type": "markdown", - "id": "31ea7cdb-e10d-43fc-b026-f69249a59736", + "id": "68c33d85-f522-44bb-9b2c-dec47a414f54", "metadata": { - "id": "31ea7cdb-e10d-43fc-b026-f69249a59736" + "papermill": {}, + "tags": [] }, + "source": [ + "**Last update:** 2023-07-31 (Created: 2023-07-10)" + ] + }, + { + "cell_type": "markdown", + "id": "31ea7cdb-e10d-43fc-b026-f69249a59736", + "metadata": {}, "source": [ "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n", "\n", @@ -70,15 +71,17 @@ "- PyPDF2 - Get text from PDF\n", "- LangChain - Text splitter, document creation\n", "- HuggingFace - Embeddings\n", - "- Weaviate - Vector Database" + "- Weaviate - Vector Database\n", + "\n", + "\n", + " \"Open\n", + "" ] }, { "cell_type": "markdown", "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0", - "metadata": { - "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0" - }, + "metadata": {}, "source": [ "**References:**\n", "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n", @@ -90,7 +93,6 @@ "cell_type": "markdown", "id": "distinguished-truth", "metadata": { - "id": "distinguished-truth", "papermill": {}, "tags": [] }, @@ -101,9 +103,7 @@ { "cell_type": "markdown", "id": "numeric-mediterranean", - "metadata": { - "id": "numeric-mediterranean" - }, + "metadata": {}, "source": [ "### Import libraries" ] 
@@ -113,7 +113,6 @@ "execution_count": null, "id": "potential-surfing", "metadata": { - "id": "potential-surfing", "tags": [] }, "outputs": [], @@ -122,35 +121,33 @@ " import langchain\n", " import PyPDF2\n", " import weaviate\n", - " import openai\n", "except ModuleNotFoundError:\n", - " !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n", + " !pip install langchain PyPDF2 weaviate-client==3.20.0\n", + " \n", + "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", + "# !pip install sentence_transformers --user\n", "\n", + "import os\n", "import naas\n", "import io\n", "import requests\n", "import PyPDF2\n", - "import openai\n", + "import weaviate\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", - "from langchain.embeddings.openai import OpenAIEmbeddings\n", - "from langchain.vectorstores import Weaviate\n", - "from langchain.llms import OpenAI\n", - "from langchain.chains import RetrievalQA\n", - "from langchain.document_loaders import TextLoader" + "from langchain.vectorstores import Weaviate" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "kDfGd3KRPDP8", - "metadata": { - "id": "kDfGd3KRPDP8" - }, - "outputs": [], + "cell_type": "markdown", + "id": "64db5ac5-046f-4203-8503-990002927075", + "metadata": {}, "source": [ - "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", - "# !pip install -U sentence-transformers --user" + "### Setup variables\n", + "- `pdf_file`: URL of the PDF file to download and query.\n", + "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n", + "- `weaviate_api_key`: Get your API key from your weaviate dashboard
[here](https://console.weaviate.cloud/dashboard#)\n", + "- `query`: The question that you need to ask the pdf" ] }, { @@ -158,37 +155,48 @@ "execution_count": null, "id": "continuous-melbourne", "metadata": { - "id": "continuous-melbourne", "tags": [] }, "outputs": [], "source": [ - "# Inputs\n", - "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n", - "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", - "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n", - "query = \"\" or \"Summarize the PDF\"\n", - "\n", - "# Outputs\n", - "response = \"\"" + "pdf_file = \"https://tesla-cdn.thron.com/static/SVCPTV_2022_Q4_Quarterly_Update_6UDS97.pdf?xseo=&response-content-disposition=inline%3Bfilename%3D%22b7871185-dd6a-4d79-9c3b-19b497227f2a.pdf%22\"\n", + "weaviate_api_key = naas.secret.get(\"WEAVIATE_API_KEY\")\n", + "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "query = \"What's the total revenue on Q4 2022?\"" ] }, { "cell_type": "markdown", "id": "registered-showcase", - "metadata": { - "id": "registered-showcase" - }, + "metadata": {}, "source": [ "## Model" ] }, { "cell_type": "markdown", - "id": "tested-astrology", + "id": "8ae9725c-161a-47f6-a115-7d74cee3bd2f", + "metadata": {}, + "source": [ + "### Setup environ" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd067008-9cf1-45b1-a6d1-c37627dc4976", "metadata": { - "id": "tested-astrology" + "tags": [] }, + "outputs": [], + "source": [ + "os.environ[\"WEAVIATE_API_KEY\"] = weaviate_api_key" + ] + }, + { + "cell_type": "markdown", + "id": "tested-astrology", + "metadata": {}, "source": [ "### Extract text from PDF" ] @@ -198,7 +206,6 @@ "execution_count": null, "id": "crude-louisville", "metadata": { - "id": "crude-louisville", "papermill": {}, "tags": [] }, @@ -213,20 +220,17 @@ " for page in reader.pages:\n", " content = page.extract_text()\n", 
" contents.append(content)\n", - "\n", + " \n", " contents = ' '.join(contents)\n", " return contents\n", - "\n", - "\n", + " \n", "text = extract_text_from_pdf(pdf_file)" ] }, { "cell_type": "markdown", "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84", - "metadata": { - "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84" - }, + "metadata": {}, "source": [ "### Split the text into chunks scraped from the PDF" ] @@ -236,23 +240,27 @@ "execution_count": null, "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", "metadata": { - "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", "papermill": {}, "tags": [] }, "outputs": [], "source": [ - "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "text_splitter = CharacterTextSplitter(\n", + " separator = \"\\n\",\n", + " chunk_size = 1000,\n", + " chunk_overlap = 200,\n", + " length_function = len,\n", + ")\n", "\n", - "texts = text_splitter.create_documents([text])" + "texts = text_splitter.create_documents([text])\n", + "print(len(texts))\n", + "texts[0]" ] }, { "cell_type": "markdown", "id": "ef1720bf-a28a-4757-b189-7df97947c158", - "metadata": { - "id": "ef1720bf-a28a-4757-b189-7df97947c158" - }, + "metadata": {}, "source": [ "### Create embeddings of the text make it compatible to store it in the database" ] @@ -262,7 +270,6 @@ "execution_count": null, "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "metadata": { - "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", "tags": [] }, "outputs": [], @@ -270,15 +277,13 @@ "embeddings = HuggingFaceEmbeddings()\n", "\n", "for i in range(len(texts)):\n", - " query_result = embeddings.embed_query(texts[i].page_content)" + " query_result = embeddings.embed_query(texts[i].page_content)" ] }, { "cell_type": "markdown", "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89", - "metadata": { - "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89" - }, + "metadata": {}, "source": [ "### Store the embeddings into the weaviate database" ] @@ -286,26 +291,12 @@ { "cell_type": "code", "execution_count": null, - 
"id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", + "id": "6922b1d4-e394-493a-8549-07ba3c947e7d", "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3", - "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68", "tags": [] }, "outputs": [], "source": [ - "# Delete existing schema if any present\n", - "client = weaviate.Client(url=weaviate_cluster_url )\n", - "\n", - "try:\n", - " client.schema.delete_all()\n", - " print(\"Schema deleted successfully...\")\n", - "except:\n", - " print(\"Schema not deleted...\")\n", - "\n", "# Store in the weaviate vector database\n", "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)" ] @@ -313,9 +304,7 @@ { "cell_type": "markdown", "id": "981fac74-2e1e-4b62-8b91-09d51d344bba", - "metadata": { - "id": "981fac74-2e1e-4b62-8b91-09d51d344bba" - }, + "metadata": {}, "source": [ "### Get the closest response to the user query on the PDF" ] @@ -325,13 +314,12 @@ "execution_count": null, "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", "metadata": { - "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", "tags": [] }, "outputs": [], "source": [ - "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n", - "response = qa.run(query)" + "docs = db.similarity_search(query)\n", + "docs" ] }, { @@ -344,8 +332,7 @@ "iopub.status.idle": "2021-07-02T23:32:10.796900Z", "shell.execute_reply": "2021-07-02T23:32:10.796358Z", "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z" - }, - "id": "lonely-pacific" + } }, "source": [ "## Output" @@ -354,9 +341,7 @@ { "cell_type": "markdown", "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd", - "metadata": { - "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd" - }, + "metadata": {}, "source": [ "### Show the response" ] @@ -366,36 +351,16 @@ "execution_count": null, "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", "metadata": { - "colab": 
{ - "base_uri": "https://localhost:8080/", - "height": 69 - }, - "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", - "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30", "tags": [] }, "outputs": [], "source": [ + "response = docs[0].page_content\n", "response" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f", - "metadata": { - "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f" - }, - "outputs": [], - "source": [] } ], "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, "kernelspec": { "display_name": "Python 3", "language": "python", @@ -418,6 +383,13 @@ "environment_variables": {}, "parameters": {}, "version": "2.3.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } } }, "nbformat": 4, From 4840802b5c50e6e1b1dd48c7a63c365fc32b02b2 Mon Sep 17 00:00:00 2001 From: Florent Ravenel Date: Wed, 27 Sep 2023 14:31:22 +0200 Subject: [PATCH 9/9] fix: install lib --- .../LangChain_Vector_Search_on_PDF.ipynb | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb index a192b7a495..f6e25ed743 100644 --- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -57,7 +57,7 @@ "tags": [] }, "source": [ - "**Last update:** 2023-07-31 (Created: 2023-07-10)" + "**Last update:** 2023-09-27 (Created: 2023-09-27)" ] }, { @@ -119,10 +119,19 @@ "source": [ "try:\n", " import langchain\n", + "except ModuleNotFoundError:\n", + " !pip install langchain --user\n", + " import langchain\n", + "try:\n", " import PyPDF2\n", + "except ModuleNotFoundError:\n", + " !pip install PyPDF2 --user\n", + " import PyPDF2\n", + "try:\n", " import weaviate\n", "except ModuleNotFoundError:\n", - " !pip install langchain PyPDF2 weaviate-client==3.20.0\n", + " !pip 
install weaviate-client==3.20.0 --user\n", + " import weaviate\n", " \n", "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", "# !pip install sentence_transformers --user\n", @@ -131,8 +140,6 @@ "import naas\n", "import io\n", "import requests\n", - "import PyPDF2\n", - "import weaviate\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.embeddings import HuggingFaceEmbeddings\n", "from langchain.vectorstores import Weaviate" @@ -358,6 +365,14 @@ "response = docs[0].page_content\n", "response" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e2bc7f1-acf9-402b-b0aa-93de14764f8b", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {