From 1a621565cb7f773f90f10063f7324ecbbbb87651 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:03:37 +0530
Subject: [PATCH 1/9] add: langchain vector search pdf template
---
.../Langchain_Vector_Search_on_PDF.ipynb | 344 ++++++++++++++++++
1 file changed, 344 insertions(+)
create mode 100644 LangChain/Langchain_Vector_Search_on_PDF.ipynb
diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
new file mode 100644
index 0000000000..ebe4599064
--- /dev/null
+++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
@@ -0,0 +1,344 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "latin-packing",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-02-23T14:22:16.610471Z",
+ "iopub.status.busy": "2021-02-23T14:22:16.610129Z",
+ "iopub.status.idle": "2021-02-23T14:22:16.627784Z",
+ "shell.execute_reply": "2021-02-23T14:22:16.626866Z",
+ "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
+ },
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "compressed-wilson",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ "# Tool - Action of the notebook\n",
+ "\n",
+ "\n",
+ "
Template request | Bug report"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "religious-programmer",
+ "metadata": {},
+ "source": [
+ "**Tags:** #langchain #pdf #weaviate #huggingface"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
+ "metadata": {},
+ "source": [
+ "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+ "metadata": {},
+ "source": [
+ "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n",
+ "\n",
+ "It uses:\n",
+ "- PyPDF2 - Get text from PDF\n",
+ "- LangChain - Text splitter, document creation\n",
+ "- HuggingFace - Embeddings\n",
+ "- Weaviate - Vector Database"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
+ "metadata": {},
+ "source": [
+ "### References\n",
+ "\n",
+ "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
+ "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n",
+ "- [Huggingface docs](https://huggingface.co/docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "distinguished-truth",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ "## Input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "numeric-mediterranean",
+ "metadata": {},
+ "source": [
+ "### Import libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "potential-surfing",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " import langchain\n",
+ " import PyPDF2\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install langchain PyPDF2\n",
+ "\n",
+ "!pip install sentence_transformers --user\n",
+ "import naas\n",
+ "import PyPDF2\n",
+ "from langchain.text_splitter import CharacterTextSplitter\n",
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
+ "from langchain.vectorstores import Weaviate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "aggressive-trustee",
+ "metadata": {},
+ "source": [
+ "### Setup Variables\n",
+ "\n",
+ "- `pdf_file`: Path to which the PDF file exists.\n",
+ "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
+ "- `query`: The question that you need to ask the pdf\n",
+ "- `response`: The reply for the query from search "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "continuous-melbourne",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "#inputs\n",
+ "pdf_file = \"./SWE NCG JD.pdf\"\n",
+ "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "query = \"How much is the base pay?\"\n",
+ "\n",
+ "#outputs\n",
+ "response = \"\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "registered-showcase",
+ "metadata": {},
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "tested-astrology",
+ "metadata": {},
+ "source": [
+ "### Extract text from PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "crude-louisville",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def extract_text_from_pdf(pdf_path):\n",
+ " with open(pdf_path, \"rb\") as file:\n",
+ " pdf = PyPDF2.PdfReader(file)\n",
+ " text = []\n",
+ " for page in pdf.pages:\n",
+ " text.append(page.extract_text())\n",
+ " return \" \".join(text)\n",
+ "\n",
+ "text = extract_text_from_pdf(pdf_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
+ "metadata": {},
+ "source": [
+ "### Split the text into chunks scraped from the PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "text_splitter = CharacterTextSplitter(\n",
+ " separator = \"\\n\",\n",
+ " chunk_size = 1000,\n",
+ " chunk_overlap = 200,\n",
+ " length_function = len,\n",
+ ")\n",
+ "\n",
+ "texts = text_splitter.create_documents([text])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef1720bf-a28a-4757-b189-7df97947c158",
+ "metadata": {},
+ "source": [
+ "### Create embeddings of the text to make it compatible with storage in the database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "embeddings = HuggingFaceEmbeddings()\n",
+ "\n",
+ "for i in range(len(texts)):\n",
+ " query_result = embeddings.embed_query(texts[i].page_content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
+ "metadata": {},
+ "source": [
+ "### Store the embeddings into the weaviate database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
+ "metadata": {},
+ "source": [
+ "### Get the closest response to the user query on the PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "docs = db.similarity_search(query)\n",
+ "response = docs[0].page_content"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "lonely-pacific",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-07-02T23:32:10.789097Z",
+ "iopub.status.busy": "2021-07-02T23:32:10.788829Z",
+ "iopub.status.idle": "2021-07-02T23:32:10.796900Z",
+ "shell.execute_reply": "2021-07-02T23:32:10.796358Z",
+ "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
+ }
+ },
+ "source": [
+ "## Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
+ "metadata": {},
+ "source": [
+ "### Show the response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "environment_variables": {},
+ "parameters": {},
+ "version": "2.3.3"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {},
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From 932275ea2197072dec69cfe944215b57b32441ae Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Wed, 12 Jul 2023 22:07:20 +0530
Subject: [PATCH 2/9] fix: .env exposure issue
---
LangChain/Langchain_Vector_Search_on_PDF.ipynb | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
index ebe4599064..6f023f30e9 100644
--- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/Langchain_Vector_Search_on_PDF.ipynb
@@ -250,7 +250,7 @@
"metadata": {},
"outputs": [],
"source": [
- "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"https://naas-langchain-test-t6yybnsw.weaviate.network\", by_text=False)"
+ "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
]
},
{
From dea4487b08d641e6b16fd7fb8dad29941385750a Mon Sep 17 00:00:00 2001
From: Florent Ravenel
Date: Wed, 19 Jul 2023 10:29:51 +0200
Subject: [PATCH 3/9] feat: rename notebook, update title, references typo
---
....ipynb => LangChain_Vector_Search_on_PDF.ipynb} | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
rename LangChain/{Langchain_Vector_Search_on_PDF.ipynb => LangChain_Vector_Search_on_PDF.ipynb} (97%)
diff --git a/LangChain/Langchain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
similarity index 97%
rename from LangChain/Langchain_Vector_Search_on_PDF.ipynb
rename to LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 6f023f30e9..9bbb385792 100644
--- a/LangChain/Langchain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -26,7 +26,7 @@
"tags": []
},
"source": [
- "# Tool - Action of the notebook\n",
+ "# LangChain - Vector Search on PDF\n",
"\n",
"\n",
"
Template request | Bug report"
@@ -67,8 +67,7 @@
"id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
"metadata": {},
"source": [
- "### References\n",
- "\n",
+ "**References:**\n",
"- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
"- [Weaviate docs](https://weaviate.io/developers/weaviate)\n",
"- [Huggingface docs](https://huggingface.co/docs)"
@@ -98,6 +97,10 @@
"execution_count": null,
"id": "potential-surfing",
"metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-19T08:29:06.271389Z",
+ "iopub.status.busy": "2023-07-19T08:29:06.271116Z"
+ },
"tags": []
},
"outputs": [],
@@ -107,7 +110,6 @@
" import PyPDF2\n",
"except ModuleNotFoundError:\n",
" !pip install langchain PyPDF2\n",
- "\n",
"!pip install sentence_transformers --user\n",
"import naas\n",
"import PyPDF2\n",
@@ -138,12 +140,12 @@
},
"outputs": [],
"source": [
- "#inputs\n",
+ "# Inputs\n",
"pdf_file = \"./SWE NCG JD.pdf\"\n",
"weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
"query = \"How much is the base pay?\"\n",
"\n",
- "#outputs\n",
+ "# Outputs\n",
"response = \"\""
]
},
From 5399ff33aded3d9d23e4db4ceae84280b976c51f Mon Sep 17 00:00:00 2001
From: Florent Ravenel
Date: Wed, 19 Jul 2023 10:49:55 +0200
Subject: [PATCH 4/9] feat: test
---
.../LangChain_Vector_Search_on_PDF.ipynb | 121 ++++++++++++++++--
1 file changed, 110 insertions(+), 11 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 9bbb385792..93054e98da 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -94,16 +94,52 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 1,
"id": "potential-surfing",
"metadata": {
"execution": {
- "iopub.execute_input": "2023-07-19T08:29:06.271389Z",
- "iopub.status.busy": "2023-07-19T08:29:06.271116Z"
+ "iopub.execute_input": "2023-07-19T08:41:33.179611Z",
+ "iopub.status.busy": "2023-07-19T08:41:33.179183Z",
+ "iopub.status.idle": "2023-07-19T08:42:16.496262Z",
+ "shell.execute_reply": "2023-07-19T08:42:16.491276Z",
+ "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z"
},
"tags": []
},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Collecting sentence_transformers\n",
+ " Using cached sentence_transformers-2.2.2-py3-none-any.whl\n",
+ "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n",
+ "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n",
+ "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n",
+ "Collecting torchvision (from sentence_transformers)\n",
+ " Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n",
+ "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n",
+ "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n",
+ "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n",
+ "Requirement already satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n",
+ "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n",
+ "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n",
+ "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n",
+ "Requirement already satisfied: fsspec in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n",
+ "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n",
+ "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n",
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n",
+ "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n",
+ "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n",
+ "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n",
+ "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n",
+ "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n",
+ "Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n",
+ "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n",
+ "Collecting torch>=1.6.0 (from sentence_transformers)\n"
+ ]
+ }
+ ],
"source": [
"try:\n",
" import langchain\n",
@@ -133,16 +169,23 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 2,
"id": "continuous-melbourne",
"metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-19T08:42:16.501431Z",
+ "iopub.status.busy": "2023-07-19T08:42:16.500987Z",
+ "iopub.status.idle": "2023-07-19T08:42:16.675988Z",
+ "shell.execute_reply": "2023-07-19T08:42:16.667851Z",
+ "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z"
+ },
"tags": []
},
"outputs": [],
"source": [
"# Inputs\n",
"pdf_file = \"./SWE NCG JD.pdf\"\n",
- "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
"query = \"How much is the base pay?\"\n",
"\n",
"# Outputs\n",
@@ -167,13 +210,33 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 3,
"id": "crude-louisville",
"metadata": {
+ "execution": {
+ "iopub.execute_input": "2023-07-19T08:42:16.681433Z",
+ "iopub.status.busy": "2023-07-19T08:42:16.679730Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.072060Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.070305Z",
+ "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z"
+ },
"papermill": {},
"tags": []
},
- "outputs": [],
+ "outputs": [
+ {
+ "ename": "FileNotFoundError",
+ "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'"
+ ]
+ }
+ ],
"source": [
"def extract_text_from_pdf(pdf_path):\n",
" with open(pdf_path, \"rb\") as file:\n",
@@ -199,6 +262,11 @@
"execution_count": null,
"id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"metadata": {
+ "execution": {
+ "iopub.status.busy": "2023-07-19T08:42:20.072988Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.073335Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.073164Z"
+ },
"papermill": {},
"tags": []
},
@@ -227,6 +295,11 @@
"execution_count": null,
"id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"metadata": {
+ "execution": {
+ "iopub.status.busy": "2023-07-19T08:42:20.074098Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.074423Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.074257Z"
+ },
"tags": []
},
"outputs": [],
@@ -249,7 +322,13 @@
"cell_type": "code",
"execution_count": null,
"id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2023-07-19T08:42:20.075300Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.075641Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.075473Z"
+ }
+ },
"outputs": [],
"source": [
"db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
@@ -267,7 +346,13 @@
"cell_type": "code",
"execution_count": null,
"id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2023-07-19T08:42:20.076441Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.076769Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.076600Z"
+ }
+ },
"outputs": [],
"source": [
"docs = db.similarity_search(query)\n",
@@ -302,11 +387,25 @@
"cell_type": "code",
"execution_count": null,
"id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
- "metadata": {},
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2023-07-19T08:42:20.077561Z",
+ "iopub.status.idle": "2023-07-19T08:42:20.077926Z",
+ "shell.execute_reply": "2023-07-19T08:42:20.077754Z"
+ }
+ },
"outputs": [],
"source": [
"response"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {
From e737ecb5ab9ccf2b7a2ed3f05d9066fd7e6ecc3a Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Fri, 21 Jul 2023 21:13:38 +0530
Subject: [PATCH 5/9] fix: pdf file_path issue
---
.../LangChain_Vector_Search_on_PDF.ipynb | 157 ++++--------------
1 file changed, 34 insertions(+), 123 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 93054e98da..17b3c7fc6f 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -37,7 +37,7 @@
"id": "religious-programmer",
"metadata": {},
"source": [
- "**Tags:** #langchain #pdf #weaviate #huggingface"
+ "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
]
},
{
@@ -94,99 +94,46 @@
},
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": null,
"id": "potential-surfing",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-07-19T08:41:33.179611Z",
- "iopub.status.busy": "2023-07-19T08:41:33.179183Z",
- "iopub.status.idle": "2023-07-19T08:42:16.496262Z",
- "shell.execute_reply": "2023-07-19T08:42:16.491276Z",
- "shell.execute_reply.started": "2023-07-19T08:41:33.179539Z"
- },
"tags": []
},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Collecting sentence_transformers\n",
- " Using cached sentence_transformers-2.2.2-py3-none-any.whl\n",
- "Requirement already satisfied: transformers<5.0.0,>=4.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.12.5)\n",
- "Requirement already satisfied: tqdm in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (4.62.0)\n",
- "Requirement already satisfied: torch>=1.6.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.8.1)\n",
- "Collecting torchvision (from sentence_transformers)\n",
- " Using cached torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl (6.0 MB)\n",
- "Requirement already satisfied: numpy in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (1.22.4)\n",
- "Requirement already satisfied: scikit-learn in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.24.2)\n",
- "Requirement already satisfied: scipy in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (1.6.3)\n",
- "Requirement already satisfied: nltk in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (3.8.1)\n",
- "Requirement already satisfied: sentencepiece in /home/ftp/.local/lib/python3.9/site-packages (from sentence_transformers) (0.1.99)\n",
- "Requirement already satisfied: huggingface-hub>=0.4.0 in /opt/conda/lib/python3.9/site-packages (from sentence_transformers) (0.16.2)\n",
- "Requirement already satisfied: filelock in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (3.12.2)\n",
- "Requirement already satisfied: fsspec in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2023.6.0)\n",
- "Requirement already satisfied: requests in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (2.26.0)\n",
- "Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (6.0)\n",
- "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/ftp/.local/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (4.5.0)\n",
- "Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.9/site-packages (from huggingface-hub>=0.4.0->sentence_transformers) (21.0)\n",
- "Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (2023.6.3)\n",
- "Requirement already satisfied: sacremoses in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.0.53)\n",
- "Requirement already satisfied: tokenizers<0.11,>=0.10.1 in /opt/conda/lib/python3.9/site-packages (from transformers<5.0.0,>=4.6.0->sentence_transformers) (0.10.3)\n",
- "Requirement already satisfied: click in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (8.1.3)\n",
- "Requirement already satisfied: joblib in /opt/conda/lib/python3.9/site-packages (from nltk->sentence_transformers) (1.0.1)\n",
- "Requirement already satisfied: threadpoolctl>=2.0.0 in /opt/conda/lib/python3.9/site-packages (from scikit-learn->sentence_transformers) (3.1.0)\n",
- "Collecting torch>=1.6.0 (from sentence_transformers)\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"try:\n",
" import langchain\n",
" import PyPDF2\n",
+ " import weaviate\n",
"except ModuleNotFoundError:\n",
- " !pip install langchain PyPDF2\n",
- "!pip install sentence_transformers --user\n",
+ " !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+ " \n",
+ "# Note: This installation may take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+ "# !pip install sentence_transformers --user\n",
+ "\n",
"import naas\n",
+ "import io\n",
+ "import requests\n",
"import PyPDF2\n",
+ "import weaviate\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.vectorstores import Weaviate"
]
},
- {
- "cell_type": "markdown",
- "id": "aggressive-trustee",
- "metadata": {},
- "source": [
- "### Setup Variables\n",
- "\n",
- "- `pdf_file`: Path to which the PDF file exists.\n",
- "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
- "- `query`: The question that you need to ask the pdf\n",
- "- `response`: The reply for the query from search "
- ]
- },
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": null,
"id": "continuous-melbourne",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-07-19T08:42:16.501431Z",
- "iopub.status.busy": "2023-07-19T08:42:16.500987Z",
- "iopub.status.idle": "2023-07-19T08:42:16.675988Z",
- "shell.execute_reply": "2023-07-19T08:42:16.667851Z",
- "shell.execute_reply.started": "2023-07-19T08:42:16.501399Z"
- },
"tags": []
},
"outputs": [],
"source": [
"# Inputs\n",
- "pdf_file = \"./SWE NCG JD.pdf\"\n",
- "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
- "query = \"How much is the base pay?\"\n",
+ "pdf_file = \"\"\n",
+ "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "query = \"Enter your own\" or \"Summarize the PDF...\"\n",
"\n",
"# Outputs\n",
"response = \"\""
@@ -210,41 +157,27 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": null,
"id": "crude-louisville",
"metadata": {
- "execution": {
- "iopub.execute_input": "2023-07-19T08:42:16.681433Z",
- "iopub.status.busy": "2023-07-19T08:42:16.679730Z",
- "iopub.status.idle": "2023-07-19T08:42:20.072060Z",
- "shell.execute_reply": "2023-07-19T08:42:20.070305Z",
- "shell.execute_reply.started": "2023-07-19T08:42:16.681396Z"
- },
"papermill": {},
"tags": []
},
- "outputs": [
- {
- "ename": "FileNotFoundError",
- "evalue": "[Errno 2] No such file or directory: './SWE NCG JD.pdf'",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0;34m\" \"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mjoin\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 9\u001b[0;31m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m\u001b[0m in \u001b[0;36mextract_text_from_pdf\u001b[0;34m(pdf_path)\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mextract_text_from_pdf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mpdf_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mfile\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mpdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mPyPDF2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPdfReader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mpdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpages\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: './SWE NCG JD.pdf'"
- ]
- }
- ],
+ "outputs": [],
"source": [
"def extract_text_from_pdf(pdf_path):\n",
- " with open(pdf_path, \"rb\") as file:\n",
- " pdf = PyPDF2.PdfReader(file)\n",
- " text = []\n",
- " for page in pdf.pages:\n",
- " text.append(page.extract_text())\n",
- " return \" \".join(text)\n",
+ " r = requests.get(pdf_path)\n",
+ " f = io.BytesIO(r.content)\n",
+ "\n",
+ " reader = PyPDF2.PdfReader(f)\n",
+ " contents = []\n",
+ " for page in reader.pages:\n",
+ " content = page.extract_text()\n",
+ " contents.append(content)\n",
+ " \n",
+ " contents = ' '.join(contents)\n",
+ " return contents\n",
+ " \n",
"\n",
"text = extract_text_from_pdf(pdf_file)"
]
@@ -262,11 +195,6 @@
"execution_count": null,
"id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"metadata": {
- "execution": {
- "iopub.status.busy": "2023-07-19T08:42:20.072988Z",
- "iopub.status.idle": "2023-07-19T08:42:20.073335Z",
- "shell.execute_reply": "2023-07-19T08:42:20.073164Z"
- },
"papermill": {},
"tags": []
},
@@ -295,11 +223,6 @@
"execution_count": null,
"id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"metadata": {
- "execution": {
- "iopub.status.busy": "2023-07-19T08:42:20.074098Z",
- "iopub.status.idle": "2023-07-19T08:42:20.074423Z",
- "shell.execute_reply": "2023-07-19T08:42:20.074257Z"
- },
"tags": []
},
"outputs": [],
@@ -323,15 +246,11 @@
"execution_count": null,
"id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
"metadata": {
- "execution": {
- "iopub.status.busy": "2023-07-19T08:42:20.075300Z",
- "iopub.status.idle": "2023-07-19T08:42:20.075641Z",
- "shell.execute_reply": "2023-07-19T08:42:20.075473Z"
- }
+ "tags": []
},
"outputs": [],
"source": [
- "db = Weaviate.from_documents(texts, embeddings, weaviate_url=\"\", by_text=False)"
+ "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
]
},
{
@@ -347,11 +266,7 @@
"execution_count": null,
"id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
"metadata": {
- "execution": {
- "iopub.status.busy": "2023-07-19T08:42:20.076441Z",
- "iopub.status.idle": "2023-07-19T08:42:20.076769Z",
- "shell.execute_reply": "2023-07-19T08:42:20.076600Z"
- }
+ "tags": []
},
"outputs": [],
"source": [
@@ -388,11 +303,7 @@
"execution_count": null,
"id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
"metadata": {
- "execution": {
- "iopub.status.busy": "2023-07-19T08:42:20.077561Z",
- "iopub.status.idle": "2023-07-19T08:42:20.077926Z",
- "shell.execute_reply": "2023-07-19T08:42:20.077754Z"
- }
+ "tags": []
},
"outputs": [],
"source": [
From aa4b8a4a03c5d74402f8fed4f75db14a3e0adea8 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Thu, 27 Jul 2023 18:09:49 +0530
Subject: [PATCH 6/9] update: pdf url
---
LangChain/LangChain_Vector_Search_on_PDF.ipynb | 11 +++++++----
1 file changed, 7 insertions(+), 4 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 17b3c7fc6f..79a0ee81ee 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -29,7 +29,10 @@
"# LangChain - Vector Search on PDF\n",
"\n",
"\n",
- "
Template request | Bug report"
+ "\n",
+ " \n",
+ "\n",
+ "
Template request | Bug report"
]
},
{
@@ -131,9 +134,9 @@
"outputs": [],
"source": [
"# Inputs\n",
- "pdf_file = \"\"\n",
- "weaviate_cluster_url = \"paste your cluster url\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
- "query = \"Enter your own\" or \"Summarize the PDF...\"\n",
+ "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n",
+ "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "query = \"\" or \"Summarize the PDF...\"\n",
"\n",
"# Outputs\n",
"response = \"\""
From 1baf7586a396a03ed1b2a1a3789b4a5d0acf5e82 Mon Sep 17 00:00:00 2001
From: Sriniketh J <81156510+srini047@users.noreply.github.com>
Date: Fri, 28 Jul 2023 17:49:08 +0530
Subject: [PATCH 7/9] refactor code
---
.../LangChain_Vector_Search_on_PDF.ipynb | 144 +++++++++++++-----
1 file changed, 105 insertions(+), 39 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 79a0ee81ee..9cf376788e 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -11,6 +11,7 @@
"shell.execute_reply": "2021-02-23T14:22:16.626866Z",
"shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
},
+ "id": "latin-packing",
"papermill": {},
"tags": []
},
@@ -22,6 +23,7 @@
"cell_type": "markdown",
"id": "compressed-wilson",
"metadata": {
+ "id": "compressed-wilson",
"papermill": {},
"tags": []
},
@@ -38,7 +40,9 @@
{
"cell_type": "markdown",
"id": "religious-programmer",
- "metadata": {},
+ "metadata": {
+ "id": "religious-programmer"
+ },
"source": [
"**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
]
@@ -46,7 +50,9 @@
{
"cell_type": "markdown",
"id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
- "metadata": {},
+ "metadata": {
+ "id": "1fe9f56e-561c-4f52-aef8-b861c9462107"
+ },
"source": [
"**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
]
@@ -54,7 +60,9 @@
{
"cell_type": "markdown",
"id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
- "metadata": {},
+ "metadata": {
+ "id": "31ea7cdb-e10d-43fc-b026-f69249a59736"
+ },
"source": [
"**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n",
"\n",
@@ -68,7 +76,9 @@
{
"cell_type": "markdown",
"id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
- "metadata": {},
+ "metadata": {
+ "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0"
+ },
"source": [
"**References:**\n",
"- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
@@ -80,6 +90,7 @@
"cell_type": "markdown",
"id": "distinguished-truth",
"metadata": {
+ "id": "distinguished-truth",
"papermill": {},
"tags": []
},
@@ -90,7 +101,9 @@
{
"cell_type": "markdown",
"id": "numeric-mediterranean",
- "metadata": {},
+ "metadata": {
+ "id": "numeric-mediterranean"
+ },
"source": [
"### Import libraries"
]
@@ -100,6 +113,7 @@
"execution_count": null,
"id": "potential-surfing",
"metadata": {
+ "id": "potential-surfing",
"tags": []
},
"outputs": [],
@@ -108,20 +122,35 @@
" import langchain\n",
" import PyPDF2\n",
" import weaviate\n",
+ " import openai\n",
"except ModuleNotFoundError:\n",
- " !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
- " \n",
- "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
- "# !pip install sentence_transformers --user\n",
+ " !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n",
"\n",
"import naas\n",
"import io\n",
"import requests\n",
"import PyPDF2\n",
- "import weaviate\n",
+ "import openai\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
- "from langchain.vectorstores import Weaviate"
+ "from langchain.embeddings.openai import OpenAIEmbeddings\n",
+ "from langchain.vectorstores import Weaviate\n",
+ "from langchain.llms import OpenAI\n",
+ "from langchain.chains import RetrievalQA\n",
+ "from langchain.document_loaders import TextLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "kDfGd3KRPDP8",
+ "metadata": {
+ "id": "kDfGd3KRPDP8"
+ },
+ "outputs": [],
+ "source": [
+ "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+ "# !pip install -U sentence-transformers --user"
]
},
{
@@ -129,14 +158,16 @@
"execution_count": null,
"id": "continuous-melbourne",
"metadata": {
+ "id": "continuous-melbourne",
"tags": []
},
"outputs": [],
"source": [
"# Inputs\n",
- "pdf_file = \"\" or \"https://arxiv.org/pdf/2005.14165.pdf\"\n",
+ "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n",
"weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
- "query = \"\" or \"Summarize the PDF...\"\n",
+ "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n",
+ "query = \"\" or \"Summarize the PDF\"\n",
"\n",
"# Outputs\n",
"response = \"\""
@@ -145,7 +176,9 @@
{
"cell_type": "markdown",
"id": "registered-showcase",
- "metadata": {},
+ "metadata": {
+ "id": "registered-showcase"
+ },
"source": [
"## Model"
]
@@ -153,7 +186,9 @@
{
"cell_type": "markdown",
"id": "tested-astrology",
- "metadata": {},
+ "metadata": {
+ "id": "tested-astrology"
+ },
"source": [
"### Extract text from PDF"
]
@@ -163,6 +198,7 @@
"execution_count": null,
"id": "crude-louisville",
"metadata": {
+ "id": "crude-louisville",
"papermill": {},
"tags": []
},
@@ -177,10 +213,10 @@
" for page in reader.pages:\n",
" content = page.extract_text()\n",
" contents.append(content)\n",
- " \n",
+ "\n",
" contents = ' '.join(contents)\n",
" return contents\n",
- " \n",
+ "\n",
"\n",
"text = extract_text_from_pdf(pdf_file)"
]
@@ -188,7 +224,9 @@
{
"cell_type": "markdown",
"id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
- "metadata": {},
+ "metadata": {
+ "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84"
+ },
"source": [
"### Split the text into chunks scraped from the PDF"
]
@@ -198,17 +236,13 @@
"execution_count": null,
"id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"metadata": {
+ "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"papermill": {},
"tags": []
},
"outputs": [],
"source": [
- "text_splitter = CharacterTextSplitter(\n",
- " separator = \"\\n\",\n",
- " chunk_size = 1000,\n",
- " chunk_overlap = 200,\n",
- " length_function = len,\n",
- ")\n",
+ "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"\n",
"texts = text_splitter.create_documents([text])"
]
@@ -216,7 +250,9 @@
{
"cell_type": "markdown",
"id": "ef1720bf-a28a-4757-b189-7df97947c158",
- "metadata": {},
+ "metadata": {
+ "id": "ef1720bf-a28a-4757-b189-7df97947c158"
+ },
"source": [
"### Create embeddings of the text make it compatible to store it in the database"
]
@@ -226,6 +262,7 @@
"execution_count": null,
"id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"metadata": {
+ "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"tags": []
},
"outputs": [],
@@ -239,7 +276,9 @@
{
"cell_type": "markdown",
"id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
- "metadata": {},
+ "metadata": {
+ "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89"
+ },
"source": [
"### Store the embeddings into the weaviate database"
]
@@ -249,17 +288,34 @@
"execution_count": null,
"id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
"metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+ "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68",
"tags": []
},
"outputs": [],
"source": [
+ "# Delete existing schema if any present\n",
+ "client = weaviate.Client(url=weaviate_cluster_url )\n",
+ "\n",
+ "try:\n",
+ " client.schema.delete_all()\n",
+ " print(\"Schema deleted successfully...\")\n",
+ "except:\n",
+ " print(\"Schema not deleted...\")\n",
+ "\n",
+ "# Store in the weaviate vector database\n",
"db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
]
},
{
"cell_type": "markdown",
"id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
- "metadata": {},
+ "metadata": {
+ "id": "981fac74-2e1e-4b62-8b91-09d51d344bba"
+ },
"source": [
"### Get the closest response to the user query on the PDF"
]
@@ -269,12 +325,13 @@
"execution_count": null,
"id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
"metadata": {
+ "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
"tags": []
},
"outputs": [],
"source": [
- "docs = db.similarity_search(query)\n",
- "response = docs[0].page_content"
+ "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n",
+ "response = qa.run(query)"
]
},
{
@@ -287,7 +344,8 @@
"iopub.status.idle": "2021-07-02T23:32:10.796900Z",
"shell.execute_reply": "2021-07-02T23:32:10.796358Z",
"shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
- }
+ },
+ "id": "lonely-pacific"
},
"source": [
"## Output"
@@ -296,7 +354,9 @@
{
"cell_type": "markdown",
"id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
- "metadata": {},
+ "metadata": {
+ "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd"
+ },
"source": [
"### Show the response"
]
@@ -306,6 +366,12 @@
"execution_count": null,
"id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
"metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 69
+ },
+ "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
+ "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30",
"tags": []
},
"outputs": [],
@@ -317,12 +383,19 @@
"cell_type": "code",
"execution_count": null,
"id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
- "metadata": {},
+ "metadata": {
+ "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f"
+ },
"outputs": [],
"source": []
}
],
"metadata": {
+ "accelerator": "GPU",
+ "colab": {
+ "gpuType": "T4",
+ "provenance": []
+ },
"kernelspec": {
"display_name": "Python 3",
"language": "python",
@@ -345,13 +418,6 @@
"environment_variables": {},
"parameters": {},
"version": "2.3.3"
- },
- "widgets": {
- "application/vnd.jupyter.widget-state+json": {
- "state": {},
- "version_major": 2,
- "version_minor": 0
- }
}
},
"nbformat": 4,
From dd2a8ba14d1b66802a24f52dbba6fac7ff9e1130 Mon Sep 17 00:00:00 2001
From: Florent Ravenel
Date: Tue, 1 Aug 2023 09:06:01 +0200
Subject: [PATCH 8/9] feat: update PDF and test
---
.../LangChain_Vector_Search_on_PDF.ipynb | 206 ++++++++----------
1 file changed, 89 insertions(+), 117 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index 9cf376788e..a192b7a495 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -11,7 +11,6 @@
"shell.execute_reply": "2021-02-23T14:22:16.626866Z",
"shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
},
- "id": "latin-packing",
"papermill": {},
"tags": []
},
@@ -23,7 +22,6 @@
"cell_type": "markdown",
"id": "compressed-wilson",
"metadata": {
- "id": "compressed-wilson",
"papermill": {},
"tags": []
},
@@ -31,8 +29,6 @@
"# LangChain - Vector Search on PDF\n",
"\n",
"\n",
- "\n",
- " \n",
"\n",
"
Template request | Bug report"
]
@@ -40,9 +36,7 @@
{
"cell_type": "markdown",
"id": "religious-programmer",
- "metadata": {
- "id": "religious-programmer"
- },
+ "metadata": {},
"source": [
"**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
]
@@ -50,19 +44,26 @@
{
"cell_type": "markdown",
"id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
- "metadata": {
- "id": "1fe9f56e-561c-4f52-aef8-b861c9462107"
- },
+ "metadata": {},
"source": [
"**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
]
},
{
"cell_type": "markdown",
- "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+ "id": "68c33d85-f522-44bb-9b2c-dec47a414f54",
"metadata": {
- "id": "31ea7cdb-e10d-43fc-b026-f69249a59736"
+ "papermill": {},
+ "tags": []
},
+ "source": [
+ "**Last update:** 2023-07-31 (Created: 2023-07-10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+ "metadata": {},
"source": [
"**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n",
"\n",
@@ -70,15 +71,17 @@
"- PyPDF2 - Get text from PDF\n",
"- LangChain - Text splitter, document creation\n",
"- HuggingFace - Embeddings\n",
- "- Weaviate - Vector Database"
+ "- Weaviate - Vector Database\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
]
},
{
"cell_type": "markdown",
"id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
- "metadata": {
- "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0"
- },
+ "metadata": {},
"source": [
"**References:**\n",
"- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
@@ -90,7 +93,6 @@
"cell_type": "markdown",
"id": "distinguished-truth",
"metadata": {
- "id": "distinguished-truth",
"papermill": {},
"tags": []
},
@@ -101,9 +103,7 @@
{
"cell_type": "markdown",
"id": "numeric-mediterranean",
- "metadata": {
- "id": "numeric-mediterranean"
- },
+ "metadata": {},
"source": [
"### Import libraries"
]
@@ -113,7 +113,6 @@
"execution_count": null,
"id": "potential-surfing",
"metadata": {
- "id": "potential-surfing",
"tags": []
},
"outputs": [],
@@ -122,35 +121,33 @@
" import langchain\n",
" import PyPDF2\n",
" import weaviate\n",
- " import openai\n",
"except ModuleNotFoundError:\n",
- " !pip install langchain PyPDF2 openai weaviate-client==3.20.0\n",
+ " !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+ " \n",
+ "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
+ "# !pip install sentence_transformers --user\n",
"\n",
+ "import os\n",
"import naas\n",
"import io\n",
"import requests\n",
"import PyPDF2\n",
- "import openai\n",
+ "import weaviate\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
- "from langchain.embeddings.openai import OpenAIEmbeddings\n",
- "from langchain.vectorstores import Weaviate\n",
- "from langchain.llms import OpenAI\n",
- "from langchain.chains import RetrievalQA\n",
- "from langchain.document_loaders import TextLoader"
+ "from langchain.vectorstores import Weaviate"
]
},
{
- "cell_type": "code",
- "execution_count": null,
- "id": "kDfGd3KRPDP8",
- "metadata": {
- "id": "kDfGd3KRPDP8"
- },
- "outputs": [],
+ "cell_type": "markdown",
+ "id": "64db5ac5-046f-4203-8503-990002927075",
+ "metadata": {},
"source": [
- "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
- "# !pip install -U sentence-transformers --user"
+ "### Setup variables\n",
+ "- `pdf_file`: URL of the PDF file to search.\n",
+ "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n",
+ "- `weaviate_api_key`: Get your API key from your weaviate dashboard [here](https://console.weaviate.cloud/dashboard#)\n",
+ "- `query`: The question that you need to ask the pdf"
]
},
{
@@ -158,37 +155,48 @@
"execution_count": null,
"id": "continuous-melbourne",
"metadata": {
- "id": "continuous-melbourne",
"tags": []
},
"outputs": [],
"source": [
- "# Inputs\n",
- "pdf_file = \"\" or \"https://bcf.princeton.edu/wp-content/uploads/2023/05/A_User_s_Guide_to_GPT_and_LLMs_for_Economic_Research.pdf\"\n",
- "weaviate_cluster_url = \"\" or naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
- "openai_api_key = \"\" or naas.secret.get(\"OPENAI_API_KEY\")\n",
- "query = \"\" or \"Summarize the PDF\"\n",
- "\n",
- "# Outputs\n",
- "response = \"\""
+ "pdf_file = \"https://tesla-cdn.thron.com/static/SVCPTV_2022_Q4_Quarterly_Update_6UDS97.pdf?xseo=&response-content-disposition=inline%3Bfilename%3D%22b7871185-dd6a-4d79-9c3b-19b497227f2a.pdf%22\"\n",
+ "weaviate_api_key = naas.secret.get(\"WEAVIATE_API_KEY\")\n",
+ "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "query = \"What's the total revenue on Q4 2022?\""
]
},
{
"cell_type": "markdown",
"id": "registered-showcase",
- "metadata": {
- "id": "registered-showcase"
- },
+ "metadata": {},
"source": [
"## Model"
]
},
{
"cell_type": "markdown",
- "id": "tested-astrology",
+ "id": "8ae9725c-161a-47f6-a115-7d74cee3bd2f",
+ "metadata": {},
+ "source": [
+ "### Setup environ"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd067008-9cf1-45b1-a6d1-c37627dc4976",
"metadata": {
- "id": "tested-astrology"
+ "tags": []
},
+ "outputs": [],
+ "source": [
+ "os.environ[\"WEAVIATE_API_KEY\"] = weaviate_api_key"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "tested-astrology",
+ "metadata": {},
"source": [
"### Extract text from PDF"
]
@@ -198,7 +206,6 @@
"execution_count": null,
"id": "crude-louisville",
"metadata": {
- "id": "crude-louisville",
"papermill": {},
"tags": []
},
@@ -213,20 +220,17 @@
" for page in reader.pages:\n",
" content = page.extract_text()\n",
" contents.append(content)\n",
- "\n",
+ " \n",
" contents = ' '.join(contents)\n",
" return contents\n",
- "\n",
- "\n",
+ " \n",
"text = extract_text_from_pdf(pdf_file)"
]
},
{
"cell_type": "markdown",
"id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
- "metadata": {
- "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84"
- },
+ "metadata": {},
"source": [
"### Split the text into chunks scraped from the PDF"
]
@@ -236,23 +240,27 @@
"execution_count": null,
"id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"metadata": {
- "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
"papermill": {},
"tags": []
},
"outputs": [],
"source": [
- "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
+ "text_splitter = CharacterTextSplitter(\n",
+ " separator = \"\\n\",\n",
+ " chunk_size = 1000,\n",
+ " chunk_overlap = 200,\n",
+ " length_function = len,\n",
+ ")\n",
"\n",
- "texts = text_splitter.create_documents([text])"
+ "texts = text_splitter.create_documents([text])\n",
+ "print(len(texts))\n",
+ "texts[0]"
]
},
{
"cell_type": "markdown",
"id": "ef1720bf-a28a-4757-b189-7df97947c158",
- "metadata": {
- "id": "ef1720bf-a28a-4757-b189-7df97947c158"
- },
+ "metadata": {},
"source": [
"### Create embeddings of the text make it compatible to store it in the database"
]
@@ -262,7 +270,6 @@
"execution_count": null,
"id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"metadata": {
- "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
"tags": []
},
"outputs": [],
@@ -270,15 +277,13 @@
"embeddings = HuggingFaceEmbeddings()\n",
"\n",
"for i in range(len(texts)):\n",
- " query_result = embeddings.embed_query(texts[i].page_content)"
+ " query_result = embeddings.embed_query(texts[i].page_content)"
]
},
{
"cell_type": "markdown",
"id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
- "metadata": {
- "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89"
- },
+ "metadata": {},
"source": [
"### Store the embeddings into the weaviate database"
]
@@ -286,26 +291,12 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
+ "id": "6922b1d4-e394-493a-8549-07ba3c947e7d",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "93ae7d2d-86b0-4b88-ba96-ef0d094d9da3",
- "outputId": "66af9945-1e8c-470c-dfe0-f161d3dc1c68",
"tags": []
},
"outputs": [],
"source": [
- "# Delete existing schema if any present\n",
- "client = weaviate.Client(url=weaviate_cluster_url )\n",
- "\n",
- "try:\n",
- " client.schema.delete_all()\n",
- " print(\"Schema deleted successfully...\")\n",
- "except:\n",
- " print(\"Schema not deleted...\")\n",
- "\n",
"# Store in the weaviate vector database\n",
"db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
]
@@ -313,9 +304,7 @@
{
"cell_type": "markdown",
"id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
- "metadata": {
- "id": "981fac74-2e1e-4b62-8b91-09d51d344bba"
- },
+ "metadata": {},
"source": [
"### Get the closest response to the user query on the PDF"
]
@@ -325,13 +314,12 @@
"execution_count": null,
"id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
"metadata": {
- "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
"tags": []
},
"outputs": [],
"source": [
- "qa = RetrievalQA.from_chain_type(llm=OpenAI(openai_api_key=openai_api_key, temperature=0), chain_type=\"stuff\", retriever=db.as_retriever())\n",
- "response = qa.run(query)"
+ "docs = db.similarity_search(query)\n",
+ "docs"
]
},
{
@@ -344,8 +332,7 @@
"iopub.status.idle": "2021-07-02T23:32:10.796900Z",
"shell.execute_reply": "2021-07-02T23:32:10.796358Z",
"shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
- },
- "id": "lonely-pacific"
+ }
},
"source": [
"## Output"
@@ -354,9 +341,7 @@
{
"cell_type": "markdown",
"id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
- "metadata": {
- "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd"
- },
+ "metadata": {},
"source": [
"### Show the response"
]
@@ -366,36 +351,16 @@
"execution_count": null,
"id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
"metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 69
- },
- "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
- "outputId": "70a8eb18-ce96-4902-9ebe-c3179494fb30",
"tags": []
},
"outputs": [],
"source": [
+ "response = docs[0].page_content\n",
"response"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f",
- "metadata": {
- "id": "9d0d7bd2-7e89-49e6-807f-9ab542085a7f"
- },
- "outputs": [],
- "source": []
}
],
"metadata": {
- "accelerator": "GPU",
- "colab": {
- "gpuType": "T4",
- "provenance": []
- },
"kernelspec": {
"display_name": "Python 3",
"language": "python",
@@ -418,6 +383,13 @@
"environment_variables": {},
"parameters": {},
"version": "2.3.3"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {},
+ "version_major": 2,
+ "version_minor": 0
+ }
}
},
"nbformat": 4,
From 4840802b5c50e6e1b1dd48c7a63c365fc32b02b2 Mon Sep 17 00:00:00 2001
From: Florent Ravenel
Date: Wed, 27 Sep 2023 14:31:22 +0200
Subject: [PATCH 9/9] fix: install lib
---
.../LangChain_Vector_Search_on_PDF.ipynb | 23 +++++++++++++++----
1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
index a192b7a495..f6e25ed743 100644
--- a/LangChain/LangChain_Vector_Search_on_PDF.ipynb
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -57,7 +57,7 @@
"tags": []
},
"source": [
- "**Last update:** 2023-07-31 (Created: 2023-07-10)"
+ "**Last update:** 2023-09-27 (Created: 2023-07-10)"
]
},
{
@@ -119,10 +119,19 @@
"source": [
"try:\n",
" import langchain\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install langchain --user\n",
+ " import langchain\n",
+ "try:\n",
" import PyPDF2\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install PyPDF2 --user\n",
+ " import PyPDF2\n",
+ "try:\n",
" import weaviate\n",
"except ModuleNotFoundError:\n",
- " !pip install langchain PyPDF2 weaviate-client==3.20.0\n",
+ " !pip install weaviate-client==3.20.0 --user\n",
+ " import weaviate\n",
" \n",
"# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n",
"# !pip install sentence_transformers --user\n",
@@ -131,8 +140,6 @@
"import naas\n",
"import io\n",
"import requests\n",
- "import PyPDF2\n",
- "import weaviate\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.embeddings import HuggingFaceEmbeddings\n",
"from langchain.vectorstores import Weaviate"
@@ -358,6 +365,14 @@
"response = docs[0].page_content\n",
"response"
]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e2bc7f1-acf9-402b-b0aa-93de14764f8b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
}
],
"metadata": {