diff --git a/LangChain/LangChain_Vector_Search_on_PDF.ipynb b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
new file mode 100644
index 0000000000..f6e25ed743
--- /dev/null
+++ b/LangChain/LangChain_Vector_Search_on_PDF.ipynb
@@ -0,0 +1,412 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "latin-packing",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-02-23T14:22:16.610471Z",
+ "iopub.status.busy": "2021-02-23T14:22:16.610129Z",
+ "iopub.status.idle": "2021-02-23T14:22:16.627784Z",
+ "shell.execute_reply": "2021-02-23T14:22:16.626866Z",
+ "shell.execute_reply.started": "2021-02-23T14:22:16.610384Z"
+ },
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "compressed-wilson",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ "# LangChain - Vector Search on PDF\n",
+ "\n",
+ "\n",
+ "\n",
+ "Template request | Bug report"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "religious-programmer",
+ "metadata": {},
+ "source": [
+ "**Tags:** #langchain #pdf #weaviate #huggingface #llm #database #embeddings"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1fe9f56e-561c-4f52-aef8-b861c9462107",
+ "metadata": {},
+ "source": [
+ "**Author:** [Sriniketh Jayasendil](https://www.linkedin.com/in/sriniketh-jayasendil)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "68c33d85-f522-44bb-9b2c-dec47a414f54",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ "**Last update:** 2023-09-27 (Created: 2023-09-27)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "31ea7cdb-e10d-43fc-b026-f69249a59736",
+ "metadata": {},
+ "source": [
+ "**Description:** This notebook performs a vector search on your PDF and returns the text passages most closely related to the question you provide, which is enough to answer basic questions about the document.\n",
+ "\n",
+ "It uses:\n",
+ "- PyPDF2 - Get text from PDF\n",
+ "- LangChain - Text splitter, document creation\n",
+ "- HuggingFace - Embeddings\n",
+ "- Weaviate - Vector Database\n",
+ "\n",
+ "\n",
+ " \n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1a14806c-9da2-446e-b8fd-b55f8d7ac3f0",
+ "metadata": {},
+ "source": [
+ "**References:**\n",
+ "- [Langchain docs](https://python.langchain.com/docs/get_started/introduction.html)\n",
+ "- [Weaviate docs](https://weaviate.io/developers/weaviate)\n",
+ "- [Huggingface docs](https://huggingface.co/docs)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "distinguished-truth",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "source": [
+ "## Input"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "numeric-mediterranean",
+ "metadata": {},
+ "source": [
+ "### Import libraries"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "potential-surfing",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "try:\n",
+ " import langchain\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install langchain --user\n",
+ " import langchain\n",
+ "try:\n",
+ " import PyPDF2\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install PyPDF2 --user\n",
+ " import PyPDF2\n",
+ "try:\n",
+ " import weaviate\n",
+ "except ModuleNotFoundError:\n",
+ " !pip install weaviate-client==3.20.0 --user\n",
+ " import weaviate\n",
+ " \n",
+ "# Note: This installation may take more time than usual because it pulls in many dependencies (uncomment it if the embeddings step raises an error)\n",
+ "# !pip install sentence_transformers --user\n",
+ "\n",
+ "import os\n",
+ "import naas\n",
+ "import io\n",
+ "import requests\n",
+ "from langchain.text_splitter import CharacterTextSplitter\n",
+ "from langchain.embeddings import HuggingFaceEmbeddings\n",
+ "from langchain.vectorstores import Weaviate"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "64db5ac5-046f-4203-8503-990002927075",
+ "metadata": {},
+ "source": [
+ "### Setup variables\n",
+ "- `pdf_file`: URL of the PDF file to search.\n",
+ "- `weaviate_cluster_url`: URL of your Weaviate cluster. You can create a new cluster [here](https://console.weaviate.cloud) and paste its URL, or load it from naas secrets.\n",
+ "- `weaviate_api_key`: Get your API key from your weaviate dashboard [here](https://console.weaviate.cloud/dashboard#)\n",
+ "- `query`: The question you want to ask about the PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "continuous-melbourne",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "pdf_file = \"https://tesla-cdn.thron.com/static/SVCPTV_2022_Q4_Quarterly_Update_6UDS97.pdf?xseo=&response-content-disposition=inline%3Bfilename%3D%22b7871185-dd6a-4d79-9c3b-19b497227f2a.pdf%22\"\n",
+ "weaviate_api_key = naas.secret.get(\"WEAVIATE_API_KEY\")\n",
+ "weaviate_cluster_url = naas.secret.get(\"WEAVIATE_CLUSTER_URL\")\n",
+ "query = \"What's the total revenue on Q4 2022?\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "registered-showcase",
+ "metadata": {},
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8ae9725c-161a-47f6-a115-7d74cee3bd2f",
+ "metadata": {},
+ "source": [
+ "### Setup environ"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bd067008-9cf1-45b1-a6d1-c37627dc4976",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "os.environ[\"WEAVIATE_API_KEY\"] = weaviate_api_key"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "tested-astrology",
+ "metadata": {},
+ "source": [
+ "### Extract text from PDF"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "crude-louisville",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "def extract_text_from_pdf(pdf_path):\n",
+ " r = requests.get(pdf_path)\n",
+ " f = io.BytesIO(r.content)\n",
+ "\n",
+ " reader = PyPDF2.PdfReader(f)\n",
+ " contents = []\n",
+ " for page in reader.pages:\n",
+ " content = page.extract_text()\n",
+ " contents.append(content)\n",
+ " \n",
+ " contents = ' '.join(contents)\n",
+ " return contents\n",
+ " \n",
+ "text = extract_text_from_pdf(pdf_file)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8daa42c1-3a2b-4f96-a7dd-fb1deb395a84",
+ "metadata": {},
+ "source": [
+ "### Split the text scraped from the PDF into chunks"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f9e8e197-e965-441c-9512-9b28ed079ee6",
+ "metadata": {
+ "papermill": {},
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "text_splitter = CharacterTextSplitter(\n",
+ " separator = \"\\n\",\n",
+ " chunk_size = 1000,\n",
+ " chunk_overlap = 200,\n",
+ " length_function = len,\n",
+ ")\n",
+ "\n",
+ "texts = text_splitter.create_documents([text])\n",
+ "print(len(texts))\n",
+ "texts[0]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ef1720bf-a28a-4757-b189-7df97947c158",
+ "metadata": {},
+ "source": [
+ "### Create embeddings of the text to make it compatible with storage in the database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e4a376ac-a10e-4d6a-ba01-e5445efdf091",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "embeddings = HuggingFaceEmbeddings()\n",
+ "\n",
+ "for i in range(len(texts)):\n",
+ " query_result = embeddings.embed_query(texts[i].page_content)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4169feb2-05ac-4914-bbb2-501dae7dcd89",
+ "metadata": {},
+ "source": [
+ "### Store the embeddings into the weaviate database"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6922b1d4-e394-493a-8549-07ba3c947e7d",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# Store in the weaviate vector database\n",
+ "db = Weaviate.from_documents(texts, embeddings, weaviate_url=weaviate_cluster_url, by_text=False)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "981fac74-2e1e-4b62-8b91-09d51d344bba",
+ "metadata": {},
+ "source": [
+ "### Get the PDF chunks closest to the user query"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "bdf9e7a9-7de9-4c50-b677-97cb2a1d5d3b",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "docs = db.similarity_search(query)\n",
+ "docs"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "lonely-pacific",
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2021-07-02T23:32:10.789097Z",
+ "iopub.status.busy": "2021-07-02T23:32:10.788829Z",
+ "iopub.status.idle": "2021-07-02T23:32:10.796900Z",
+ "shell.execute_reply": "2021-07-02T23:32:10.796358Z",
+ "shell.execute_reply.started": "2021-07-02T23:32:10.789033Z"
+ }
+ },
+ "source": [
+ "## Output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "890f7c86-b7bb-4f5d-9a1b-e492dd9580fd",
+ "metadata": {},
+ "source": [
+ "### Show the response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9c4e3b7b-6440-4844-8054-265f1aec65eb",
+ "metadata": {
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "response = docs[0].page_content\n",
+ "response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5e2bc7f1-acf9-402b-b0aa-93de14764f8b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.9.6"
+ },
+ "papermill": {
+ "default_parameters": {},
+ "environment_variables": {},
+ "parameters": {},
+ "version": "2.3.3"
+ },
+ "widgets": {
+ "application/vnd.jupyter.widget-state+json": {
+ "state": {},
+ "version_major": 2,
+ "version_minor": 0
+ }
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}