Commit
Feature/databricks vector search (#10754)
Showing 13 changed files with 884 additions and 0 deletions.
docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb (213 additions, 0 deletions)
@@ -0,0 +1,213 @@
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Databricks Vector Search\n",
    "\n",
    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search Python client."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%pip install llama-index llama-index-vector-stores-databricks\n",
    "%pip install databricks-vectorsearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import databricks dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from databricks.vector_search.client import (\n",
    "    VectorSearchIndex,\n",
    "    VectorSearchClient,\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import LlamaIndex dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from llama_index.core import (\n",
    "    VectorStoreIndex,\n",
    "    SimpleDirectoryReader,\n",
    "    ServiceContext,\n",
    "    StorageContext,\n",
    ")\n",
    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load example data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!mkdir -p 'data/paul_graham/'\n",
    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Read the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load documents\n",
    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
    "print(f\"Total documents: {len(documents)}\")\n",
    "print(f\"First document, id: {documents[0].doc_id}\")\n",
    "print(f\"First document, hash: {documents[0].hash}\")\n",
    "print(\n",
    "    \"First document, text\"\n",
    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create a Databricks Vector Search endpoint which will serve the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search endpoint\n",
    "client = VectorSearchClient()\n",
    "client.create_endpoint(\n",
    "    name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Create the Databricks Vector Search index, and build it from the documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a vector search index\n",
    "# it must be placed inside a Unity Catalog-enabled schema\n",
    "\n",
    "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n",
    "databricks_index = client.create_direct_access_index(\n",
    "    endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n",
    "    index_name=\"my_catalog.my_schema.my_test_table\",\n",
    "    primary_key=\"my_primary_key_name\",\n",
    "    embedding_dimension=1536,  # match the dimension of the embedding model you're going to use\n",
    "    embedding_vector_column=\"my_embedding_vector_column_name\",  # you can name this anything you want; it will be picked up by the LlamaIndex integration\n",
    "    schema={\n",
    "        \"my_primary_key_name\": \"string\",\n",
    "        \"my_embedding_vector_column_name\": \"array<double>\",\n",
    "        \"text\": \"string\",  # one column must match the text_column in the DatabricksVectorSearch instance created below; it will hold the raw node text\n",
    "        \"doc_id\": \"string\",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
    "        # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
    "        # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
    "    },\n",
    ")\n",
    "\n",
    "databricks_vector_store = DatabricksVectorSearch(\n",
    "    index=databricks_index,\n",
    "    text_column=\"text\",\n",
    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
    ")  # text_column is required for self-managed embeddings\n",
    "storage_context = StorageContext.from_defaults(\n",
    "    vector_store=databricks_vector_store\n",
    ")\n",
    "index = VectorStoreIndex.from_documents(\n",
    "    documents, storage_context=storage_context\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Query the index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "query_engine = index.as_query_engine()\n",
    "response = query_engine.query(\"Why did the author choose to work on AI?\")\n",
    "\n",
    "print(response.response)"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)",
   "widgets": {}
  },
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
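The schema comments in the notebook stress that metadata fields are filterable only if they are declared explicitly, both in the index schema and in the vector store's columns argument. A minimal sketch of what that looks like at query time (not part of this commit; the "category" field is hypothetical and would need to be added to the schema and to columns):

# Sketch only: assumes the index schema also declared "category": "string"
# and the store was created with columns=["category"] ("category" is hypothetical).
from llama_index.core.vector_stores import ExactMatchFilter, MetadataFilters

filters = MetadataFilters(
    filters=[ExactMatchFilter(key="category", value="essay")]
)
query_engine = index.as_query_engine(filters=filters)  # filters are forwarded to the retriever
print(query_engine.query("What did the author work on?").response)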
llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore (153 additions, 0 deletions)
@@ -0,0 +1,153 @@
llama_index/_static
.DS_Store
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
bin/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
etc/
include/
lib/
lib64/
parts/
sdist/
share/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
.ruff_cache

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints
notebooks/

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
pyvenv.cfg

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# Jetbrains
.idea
modules/
*.swp

# VsCode
.vscode

# pipenv
Pipfile
Pipfile.lock

# pyright
pyrightconfig.json
llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD (4 additions, 0 deletions)
@@ -0,0 +1,4 @@
poetry_requirements(
    name="poetry",
    module_mapping={"databricks-vectorsearch": ["databricks"]},
)
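The module_mapping entry is needed because the distribution name (databricks-vectorsearch) differs from the import package it provides (databricks); it lets the build tool's dependency inference (Pants, which uses these BUILD files) link imports like the one below to the right requirement. A quick illustration, mirroring the notebook's import:

# Dependency inference resolves this import to databricks-vectorsearch via the BUILD mapping above
from databricks.vector_search.client import VectorSearchClient

client = VectorSearchClient()  # inside a Databricks runtime, credentials are picked up automatically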