Commit
Feature/databricks vector search (#10754)
Showing 13 changed files with 884 additions and 0 deletions.
docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb (213 additions, 0 deletions)
@@ -0,0 +1,213 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Databricks Vector Search\n",
    "\n",
    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search Python client."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index llama-index-vector-stores-databricks\n",
    "%pip install databricks-vectorsearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import databricks dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from databricks.vector_search.client import (\n",
    "    VectorSearchIndex,\n",
    "    VectorSearchClient,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import LlamaIndex dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    ServiceContext,\n",
    "    StorageContext,\n",
    ")\n",
    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p 'data/paul_graham/'\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load documents\n",
    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
    "print(f\"Total documents: {len(documents)}\")\n",
    "print(f\"First document, id: {documents[0].doc_id}\")\n",
    "print(f\"First document, hash: {documents[0].hash}\")\n",
    "print(\n",
    "    \"First document, text\"\n",
    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a Databricks Vector Search endpoint which will serve the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search endpoint\n",
    "client = VectorSearchClient()\n",
    "client.create_endpoint(\n",
    "    name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the Databricks Vector Search index, and build it from the documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search index\n",
    "# it must be placed inside a Unity Catalog-enabled schema\n",
    "\n",
    "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n",
    "databricks_index = client.create_direct_access_index(\n",
    "    endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
    "    index_name=\"my_catalog.my_schema.my_test_table\",\n",
    "    primary_key=\"my_primary_key_name\",\n",
    "    embedding_dimension=1536,  # match the dimension of the embedding model you're going to use\n",
    "    embedding_vector_column=\"my_embedding_vector_column_name\",  # you can name this anything you want; it will be picked up by the LlamaIndex integration\n",
    "    schema={\n",
    "        \"my_primary_key_name\": \"string\",\n",
    "        \"my_embedding_vector_column_name\": \"array<double>\",\n",
    "        \"text\": \"string\",  # one column must match the text_column in the DatabricksVectorSearch instance created below; it will hold the raw node text\n",
    "        \"doc_id\": \"string\",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
    "        # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
    "        # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
    "    },\n",
    ")\n",
    "\n",
    "databricks_vector_store = DatabricksVectorSearch(\n",
    "    index=databricks_index,\n",
    "    text_column=\"text\",\n",
    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
    ")  # text_column is required for self-managed embeddings\n",
    "storage_context = StorageContext.from_defaults(\n",
    "    vector_store=databricks_vector_store\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Query the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"Why did the author choose to work on AI?\")\n",
    "\n",
    "print(response.response)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)",
   "widgets": {}
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
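The schema comments in the notebook stress that metadata fields are filterable only if they are declared explicitly, both in the index schema and in the vector store's columns argument. A minimal sketch of what that looks like at query time (not part of this commit; the "category" field is hypothetical and would need to be added to the schema and to columns):

# Sketch only: assumes the index schema also declared "category": "string"
# and the store was created with columns=["category"] ("category" is hypothetical).
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

filters = MetadataFilters(
    filters=[ExactMatchFilter(key="category", value="essay")]
)
query_engine = index.as_query_engine(filters=filters)  # filters are forwarded to the retriever
print(query_engine.query("What did the author work on?").response)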
llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore (153 additions, 0 deletions)
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
poetry_requirements(
    name="poetry",
    module_mapping={"databricks-vectorsearch": ["databricks"]},
)
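The module_mapping entry is needed because the distribution name (databricks-vectorsearch) differs from the import package it provides (databricks); it lets the build tool's dependency inference (Pants, which uses these BUILD files) link imports like the one below to the right requirement. A quick illustration, mirroring the notebook's import:

# Dependency inference resolves this import to databricks-vectorsearch via the BUILD mapping above
from databricks.vector_search.client import VectorSearchClient

client = VectorSearchClient()  # inside a Databricks runtime, credentials are picked up automatically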