run-llama · logan-markewich · Mar 14, 2024 · Feb 15, 2024 · Feb 15, 2024 · Feb 16, 2024
diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
@@ -0,0 +1,289 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "2f685925-940a-418f-9b00-5500f8878fc3",
+     "showTitle": false,
+     "title": ""
+    }
+   },
+   "source": [
+    "# Databricks Vector Search\n",
+    "\n",
+    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search Python client."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "8289764f-1001-4eb7-b162-92490746ebe8",
+     "showTitle": true,
+     "title": "Install llama-index and databricks-vectorsearch client"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# The Databricks vector store is a separate LlamaIndex integration package,\n",
+    "# so it has to be installed alongside the core llama-index distribution.\n",
+    "%pip install llama-index llama-index-vector-stores-databricks\n",
+    "%pip install databricks-vectorsearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import Databricks dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "75dd1adb-1937-49d2-aef1-393886271d46",
+     "showTitle": true,
+     "title": "Import Databricks dependencies"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from databricks.vector_search.client import VectorSearchIndex, VectorSearchClient"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import LlamaIndex dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "b4ca851b-b0ee-4ea6-a31c-755c07e16d51",
+     "showTitle": true,
+     "title": "Import LlamaIndex dependencies"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from llama_index.core import (\n",
+    "    VectorStoreIndex,\n",
+    "    SimpleDirectoryReader,\n",
+    "    ServiceContext,\n",
+    "    StorageContext,\n",
+    ")\n",
+    "\n",
+    "# The integration package exposes the store under llama_index.vector_stores.databricks\n",
+    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load example data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "dd06759d-0070-48a8-aa74-3d46b12457f8",
+     "showTitle": true,
+     "title": "Load example data"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Fetch the Paul Graham essay used as sample data into a local directory\n",
+    "!mkdir -p 'data/paul_graham/'\n",
+    "!wget -O 'data/paul_graham/paul_graham_essay.txt' 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "7a82b624-bffb-453b-b5c6-f8414566dc2f",
+     "showTitle": true,
+     "title": "Read the data"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Load every file in the sample directory as a LlamaIndex document\n",
+    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
+    "print(f\"Total documents: {len(documents)}\")\n",
+    "\n",
+    "# Show a short summary of the first document as a sanity check\n",
+    "first_document = documents[0]\n",
+    "print(f\"First document, id: {first_document.doc_id}\")\n",
+    "print(f\"First document, hash: {first_document.hash}\")\n",
+    "separator = \"=\" * 20\n",
+    "preview = first_document.text[:360]\n",
+    "print(\n",
+    "    f\"First document, text ({len(first_document.text)} characters):\\n\"\n",
+    "    f\"{separator}\\n{preview} ...\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Databricks Vector Search endpoint which will serve the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "99c56854-c182-4dfe-bc08-cee8263461ee",
+     "showTitle": true,
+     "title": "Create the Databricks Vector Search endpoint"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create the vector search endpoint that will serve the index\n",
+    "client = VectorSearchClient()\n",
+    "client.create_endpoint(\n",
+    "    name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
+    "    endpoint_type=\"STANDARD\",\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the Databricks Vector Search index, and build it from the documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {
+      "byteLimit": 2048000,
+      "rowLimit": 10000
+     },
+     "inputWidgets": {},
+     "nuid": "6abe427b-79ca-4c0c-8e58-ba5f670294ae",
+     "showTitle": true,
+     "title": "Build the index from the documents"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Create a vector search index.\n",
+    "# It must be placed inside a Unity Catalog-enabled schema.\n",
+    "#\n",
+    "# We'll use self-managed embeddings (i.e. managed by LlamaIndex)\n",
+    "# rather than a Databricks-managed index.\n",
+    "databricks_index = client.create_direct_access_index(\n",
+    "    endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
+    "    index_name=\"my_catalog.my_schema.my_test_table\",\n",
+    "    primary_key=\"my_primary_key_name\",\n",
+    "    # match the dimension of the embeddings model you're going to use\n",
+    "    embedding_dimension=1536,\n",
+    "    # you can name this anything you want - it'll be picked up by the LlamaIndex class\n",
+    "    embedding_vector_column=\"my_embedding_vector_column_name\",\n",
+    "    schema={\n",
+    "        \"my_primary_key_name\": \"string\",\n",
+    "        \"my_embedding_vector_column_name\": \"array<double>\",\n",
+    "        # one column must match the text_column of the DatabricksVectorSearch\n",
+    "        # instance created below; this will hold the raw node text\n",
+    "        \"text\": \"string\",\n",
+    "        # add any other metadata you may have in your nodes\n",
+    "        # (Databricks Vector Search supports metadata filtering)\n",
+    "    },\n",
+    ")\n",
+    "\n",
+    "# text_column is required for self-managed embeddings\n",
+    "databricks_vector_store = DatabricksVectorSearch(\n",
+    "    index=databricks_index,\n",
+    "    text_column=\"text\",\n",
+    ")\n",
+    "storage_context = StorageContext.from_defaults(\n",
+    "    vector_store=databricks_vector_store\n",
+    ")\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    documents, storage_context=storage_context\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Query the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "application/vnd.databricks.v1+cell": {
+     "cellMetadata": {},
+     "inputWidgets": {},
+     "nuid": "3e8c18f7-db8c-45c1-bb82-b75ad2307824",
+     "showTitle": true,
+     "title": "Query using the index"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# Build a query engine on top of the index and ask it a question\n",
+    "query_engine = index.as_query_engine()\n",
+    "question = \"Why did the author choose to work on AI?\"\n",
+    "response = query_engine.query(question)\n",
+    "\n",
+    "print(response.response)"
+   ]
+  }
+ ],
+ "metadata": {
+  "application/vnd.databricks.v1+notebook": {
+   "dashboards": [],
+   "language": "python",
+   "notebookMetadata": {
+    "pythonIndentUnit": 4
+   },
+   "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)",
+   "widgets": {}
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md
@@ -23,6 +23,7 @@ We are actively adding more integrations and improving feature coverage for each
 | ChatGPT Retrieval Plugin | aggregator              |                    |               | ✓      | ✓               |       |
 | Chroma                   | self-hosted             | ✓                  |               | ✓      | ✓               |       |
 | DashVector               | cloud                   | ✓                  | ✓             | ✓      | ✓               |       |
+| Databricks               | cloud                   | ✓                  |               | ✓      | ✓               |       |
 | Deeplake                 | self-hosted / cloud     | ✓                  |               | ✓      | ✓               |       |
 | DocArray                 | aggregator              | ✓                  |               | ✓      | ✓               |       |
 | DuckDB                   | in-memory / self-hosted | ✓                  |               | ✓      | ✓               |       |
@@ -70,6 +71,7 @@ maxdepth: 1
 /examples/vector_stores/ChromaIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo-Hybrid.ipynb
+/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
 /examples/vector_stores/DeepLakeIndexDemo.ipynb
 /examples/vector_stores/DocArrayHnswIndexDemo.ipynb
 /examples/vector_stores/DocArrayInMemoryIndexDemo.ipynb