diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb new file mode 100644 index 0000000000..f6e25ed743 --- /dev/null +++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb @@ -0,0 +1,412 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "latin-packing", + "metadata": { + "execution": { + "iopub.execute_input": "2021-02-23T14:22:16.610471Z", + "iopub.status.busy": "2021-02-23T14:22:16.610129Z", + "iopub.status.idle": "2021-02-23T14:22:16.627784Z", + "shell.execute_reply": "2021-02-23T14:22:16.626866Z", + "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z" + }, + "papermill": {}, + "tags": [] + }, + "source": [ + "\"Naas\"" + ] + }, + { + "cell_type": "markdown", + "id": "compressed-wilson", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "# LangChain - Vector Search on PDF\n", + "\n", + "\n", + "\n", + "

Template request | Bug report" + ] + }, + { + "cell_type": "markdown", + "id": "religious-programmer", + "metadata": {}, + "source": [ + "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings" + ] + }, + { + "cell_type": "markdown", + "id": "1fe9f56e-561c-4f52-aef8-b861c9462107", + "metadata": {}, + "source": [ + "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)" + ] + }, + { + "cell_type": "markdown", + "id": "68c33d85-f522-44bb-9b2c-dec47a414f54", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "**Last update:** 2023-09-27 (Created: 2023-09-27)" + ] + }, + { + "cell_type": "markdown", + "id": "31ea7cdb-e10d-43fc-b026-f69249a59736", + "metadata": {}, + "source": [ + "**Description:** This notebook is used to perform vector search on your PDF and it will answer basic questions that are closely related based on the prompt provided.\n", + "\n", + "It uses:\n", + "- PyPDF2 - Get text from PDF\n", + "- LangChain - Text splitter, document creation\n", + "- HuggingFace - Embeddings\n", + "- Weaviate - Vector Database\n", + "\n", + "\n", + " \"Open\n", + "" + ] + }, + { + "cell_type": "markdown", + "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0", + "metadata": {}, + "source": [ + "**References:**\n", + "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n", + "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n", + "- [Huggingface docs](https://huggingface.co/docs)" + ] + }, + { + "cell_type": "markdown", + "id": "distinguished-truth", + "metadata": { + "papermill": {}, + "tags": [] + }, + "source": [ + "## Input" + ] + }, + { + "cell_type": "markdown", + "id": "numeric-mediterranean", + "metadata": {}, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "potential-surfing", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "try:\n", + " import langchain\n", + "except ModuleNotFoundError:\n", + " !pip install langchain --user\n", + " import langchain\n", + "try:\n", + " import PyPDF2\n", + "except ModuleNotFoundError:\n", + " !pip install PyPDF2 --user\n", + " import PyPDF2\n", + "try:\n", + " import weaviate\n", + "except ModuleNotFoundError:\n", + " !pip install weaviate-client==3.20.0 --user\n", + " import weaviate\n", + " \n", + "# Note: This installation make take more time than usual due to more dependencies {uncomment if there is some error in the embeddings routine}\n", + "# !pip install sentence_transformers --user\n", + "\n", + "import os\n", + "import naas\n", + "import io\n", + "import requests\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.embeddings import HuggingFaceEmbeddings\n", + "from langchain.vectorstores import Weaviate" + ] + }, + { + "cell_type": "markdown", + "id": "64db5ac5-046f-4203-8503-990002927075", + "metadata": {}, + "source": [ + "### Setup variables\n", + "- `pdf_file`: Path to which the PDF file exists.\",\n", + "- `weaviate_cluster_url`: You can create a new weaviate cluster [here](https://console.weaviate.cloud) and paste the url or import from naas secrets\n", + "- `weaviate_api_key`: Get your API key from your weaviate dashboard [here](https://console.weaviate.cloud/dashboard#)\n", + "- `query`: The question that you need to ask the pdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "continuous-melbourne", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "pdf_file = \"https://tesla-cdn.thron.com/static/SVCPTV_2022_Q4_Quarterly_Update_6UDS97.pdf?xseo=&response-content-disposition=inline%3Bfilename%3D%22b7871185-dd6a-4d79-9c3b-19b497227f2a.pdf%22\"\n", + "weaviate_api_key = naas.secret.get(\"WEAVIATE_API_KEY\")\n", + "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n", + "query = \"What's the total revenue on Q4 2022?\"" + ] + }, + { + "cell_type": "markdown", + "id": "registered-showcase", + "metadata": {}, + "source": [ + "## Model" + ] + }, + { + "cell_type": "markdown", + "id": "8ae9725c-161a-47f6-a115-7d74cee3bd2f", + "metadata": {}, + "source": [ + "### Setup environ" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd067008-9cf1-45b1-a6d1-c37627dc4976", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "os.environ[\"WEAVIATE_API_KEY\"] = weaviate_api_key" + ] + }, + { + "cell_type": "markdown", + "id": "tested-astrology", + "metadata": {}, + "source": [ + "### Extract text from PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "crude-louisville", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "def extract_text_from_pdf(pdf_path):\n", + " r = requests.get(pdf_path)\n", + " f = io.BytesIO(r.content)\n", + "\n", + " reader = PyPDF2.PdfReader(f)\n", + " contents = []\n", + " for page in reader.pages:\n", + " content = page.extract_text()\n", + " contents.append(content)\n", + " \n", + " contents = ' '.join(contents)\n", + " return contents\n", + " \n", + "text = extract_text_from_pdf(pdf_file)" + ] + }, + { + "cell_type": "markdown", + "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84", + "metadata": {}, + "source": [ + "### Split the text into chunks scraped from the PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9e8e197-e965-441c-9512-9b28ed079ee6", + "metadata": { + "papermill": {}, + "tags": [] + }, + "outputs": [], + "source": [ + "text_splitter = CharacterTextSplitter(\n", + " separator = \"\\n\",\n", + " chunk_size = 1000,\n", + " chunk_overlap = 200,\n", + " length_function = len,\n", + ")\n", + "\n", + "texts = text_splitter.create_documents([text])\n", + "print(len(texts))\n", + "texts[0]" + ] + }, + { + "cell_type": "markdown", + "id": "ef1720bf-a28a-4757-b189-7df97947c158", + "metadata": {}, + "source": [ + "### Create embeddings of the text make it compatible to store it in the database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "embeddings = HuggingFaceEmbeddings()\n", + "\n", + "for i in range(len(texts)):\n", + " query_result = embeddings.embed_query(texts[i].page_content)" + ] + }, + { + "cell_type": "markdown", + "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89", + "metadata": {}, + "source": [ + "### Store the embeddings into the weaviate database" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6922b1d4-e394-493a-8549-07ba3c947e7d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# Store in the weaviate vector database\n", + "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)" + ] + }, + { + "cell_type": "markdown", + "id": "981fac74-2e1e-4b62-8b91-09d51d344bba", + "metadata": {}, + "source": [ + "### Get the closest response to the user query on the PDF" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = db.similarity_search(query)\n", + "docs" + ] + }, + { + "cell_type": "markdown", + "id": "lonely-pacific", + "metadata": { + "execution": { + "iopub.execute_input": "2021-07-02T23:32:10.789097Z", + "iopub.status.busy": "2021-07-02T23:32:10.788829Z", + "iopub.status.idle": "2021-07-02T23:32:10.796900Z", + "shell.execute_reply": "2021-07-02T23:32:10.796358Z", + "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z" + } + }, + "source": [ + "## Output" + ] + }, + { + "cell_type": "markdown", + "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd", + "metadata": {}, + "source": [ + "### Show the response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "response = docs[0].page_content\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e2bc7f1-acf9-402b-b0aa-93de14764f8b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.6" + }, + "papermill": { + "default_parameters": {}, + "environment_variables": {}, + "parameters": {}, + "version": "2.3.3" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": {}, + "version_major": 2, + "version_minor": 0 + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}