diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
new file mode 100644
index 00000000000000..e48cfd9267a492
--- /dev/null
+++ b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
@@ -0,0 +1,213 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Databricks Vector Search\n",
+    "\n",
+    "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search Python client."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install llama-index llama-index-vector-stores-databricks\n",
+    "%pip install databricks-vectorsearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import Databricks dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from databricks.vector_search.client import (\n",
+    "    VectorSearchIndex,\n",
+    "    VectorSearchClient,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import LlamaIndex dependencies"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core import (\n",
+    "    VectorStoreIndex,\n",
+    "    SimpleDirectoryReader,\n",
+    "    StorageContext,\n",
+    ")\n",
+    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load example data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!mkdir -p 'data/paul_graham/'\n",
+    "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Read the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# load documents\n",
+    "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n",
+    "print(f\"Total documents: {len(documents)}\")\n",
+    "print(f\"First document, id: {documents[0].doc_id}\")\n",
+    "print(f\"First document, hash: {documents[0].hash}\")\n",
+    "print(\n",
+    "    \"First document, text\"\n",
+    "    f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create a Databricks Vector Search endpoint that will serve the index"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create a vector search endpoint\n",
+    "client = VectorSearchClient()\n",
+    "client.create_endpoint(\n",
+    "    name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Create the Databricks Vector Search index and build it from the documents"
+   ]
+  },
+  {
+   "cell_type": "code",
+ "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a vector search index\n", + "# it must be placed inside a Unity Catalog-enabled schema\n", + "\n", + "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n", + "databricks_index = client.create_direct_access_index(\n", + " endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n", + " index_name=\"my_catalog.my_schema.my_test_table\",\n", + " primary_key=\"my_primary_key_name\",\n", + " embedding_dimension=1536, # match the embeddings model dimension you're going to use\n", + " embedding_vector_column=\"my_embedding_vector_column_name\", # you name this anything you want - it'll be picked up by the LlamaIndex class\n", + " schema={\n", + " \"my_primary_key_name\": \"string\",\n", + " \"my_embedding_vector_column_name\": \"array\",\n", + " \"text\": \"string\", # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text,\n", + " \"doc_id\": \"string\", # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n", + " # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n", + " # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n", + " },\n", + ")\n", + "\n", + "databricks_vector_store = DatabricksVectorSearch(\n", + " index=databricks_index,\n", + " text_column=\"text\",\n", + " columns=None, # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n", + ") # text_column is required for self-managed embeddings\n", + "storage_context = StorageContext.from_defaults(\n", + " vector_store=databricks_vector_store\n", + ")\n", + "index = VectorStoreIndex.from_documents(\n", + " documents, storage_context=storage_context\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Query the index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Why did the author choose to work on AI?\")\n", + "\n", + "print(response.response)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md index 2d721c8361ad79..8a6a39e1a1384f 100644 --- a/docs/module_guides/storing/vector_stores.md +++ b/docs/module_guides/storing/vector_stores.md @@ -23,6 +23,7 @@ We are actively adding more integrations and improving feature coverage for each | ChatGPT Retrieval Plugin | aggregator | | | ✓ | ✓ | | | Chroma | self-hosted | ✓ | | ✓ | ✓ | | | DashVector | cloud | ✓ | ✓ | ✓ | ✓ | | +| Databricks | cloud | ✓ | | ✓ | ✓ | | | Deeplake | self-hosted / cloud | ✓ | | ✓ | ✓ | | | DocArray | aggregator | ✓ | | ✓ | ✓ | | | DuckDB | in-memory / self-hosted | ✓ | | ✓ | ✓ | | @@ -70,6 +71,7 @@ maxdepth: 1 /examples/vector_stores/ChromaIndexDemo.ipynb /examples/vector_stores/DashvectorIndexDemo.ipynb /examples/vector_stores/DashvectorIndexDemo-Hybrid.ipynb +/examples/vector_stores/DatabricksVectorSearchDemo.ipynb 
 /examples/vector_stores/DeepLakeIndexDemo.ipynb
 /examples/vector_stores/DocArrayHnswIndexDemo.ipynb
 /examples/vector_stores/DocArrayInMemoryIndexDemo.ipynb
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
new file mode 100644
index 00000000000000..990c18de229088
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
@@ -0,0 +1,153 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
new file mode 100644
index 00000000000000..05444d69d26e8c
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
@@ -0,0 +1,4 @@
+poetry_requirements(
+    name="poetry",
+    module_mapping={"databricks-vectorsearch": ["databricks"]}
+)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
new file mode 100644
index 00000000000000..b9eab05aa37062
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help: ## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format: ## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test: ## Run tests via pytest.
+	pytest tests
+
+watch-docs: ## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
new file mode 100644
index 00000000000000..837b6aaec7fbed
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
@@ -0,0 +1 @@
+# LlamaIndex Vector_Stores Integration: Databricks Vector Search
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
new file mode 100644
index 00000000000000..db46e8d6c978c6
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
new file mode 100644
index 00000000000000..3d63d6acf2fa4d
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
@@ -0,0 +1,5 @@
+from llama_index.vector_stores.databricks.base import (
+    DatabricksVectorSearch,
+)
+
+__all__ = ["DatabricksVectorSearch"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
new file mode 100644
index 00000000000000..bd7f5ae3282187
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
@@ -0,0 +1,418 @@
+"""
+Databricks Vector Search index.
+
+Supports Delta Sync indexes and Direct Access indexes in Databricks Vector Search.
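+
+A minimal usage sketch (hypothetical names; assumes a Direct Access index with
+self-managed embeddings already exists):
+
+    from databricks.vector_search.client import VectorSearchClient
+    from llama_index.vector_stores.databricks import DatabricksVectorSearch
+
+    client = VectorSearchClient()
+    index = client.get_index(
+        endpoint_name="my_endpoint",
+        index_name="my_catalog.my_schema.my_table",
+    )
+    vector_store = DatabricksVectorSearch(index=index, text_column="text")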
+""" + +import json +import logging +from typing import ( + Any, + List, + Dict, + Optional, + cast, +) +from enum import Enum + +from databricks.vector_search.client import VectorSearchIndex + +from llama_index.core.bridge.pydantic import BaseModel, Field, PrivateAttr +from llama_index.core.vector_stores.types import ( + BasePydanticVectorStore, + MetadataFilters, + FilterCondition, + FilterOperator, + VectorStoreQuery, + VectorStoreQueryResult, + VectorStoreQueryMode, +) +from llama_index.core.vector_stores.utils import node_to_metadata_dict +from llama_index.core.schema import TextNode, BaseNode +from llama_index.core.bridge.pydantic import PrivateAttr + + +class _DatabricksIndexType(str, Enum): + DIRECT_ACCESS = "DIRECT_ACCESS" + DELTA_SYNC = "DELTA_SYNC" + + +class _DatabricksIndexDescription(BaseModel): + primary_key: str + index_type: _DatabricksIndexType + delta_sync_index_spec: Dict = Field(default_factory=dict) + direct_access_index_spec: Dict = Field(default_factory=dict) + + +_logger = logging.getLogger(__name__) + + +_filter_translation = { + FilterOperator.EQ: "", + FilterOperator.GT: ">", + FilterOperator.LT: "<", + FilterOperator.NE: "NOT", + FilterOperator.GTE: ">=", + FilterOperator.LTE: "<=", + FilterOperator.IN: "", + FilterOperator.NIN: "NOT", +} + + +def _transform_databricks_filter_operator(operator: FilterOperator) -> str: + try: + return _filter_translation[operator] + + except KeyError as e: + raise ValueError(f"filter operator {operator} is not supported") + + +def _to_databricks_filter(standard_filters: MetadataFilters) -> dict: + """Convert from standard dataclass to databricks filter dict.""" + filters = {} + + condition = standard_filters.condition or FilterOperator.AND + + for filter in standard_filters.filters: + value = filter.value if isinstance(filter.value, list) else [filter.value] + + transformed_operator = _transform_databricks_filter_operator(filter.operator) + + if transformed_operator == "": + key = filter.key + + else: + key = f"{filter.key} {transformed_operator}" + + if key in filters: + raise ValueError(f"filter condition already exists for {key}") + + filters[key] = value + + if condition == FilterCondition.AND: + return filters + + elif condition == FilterCondition.OR: + keys, values = zip(*filters.items()) + return {" OR ".join(keys): values} + + raise ValueError(f"condition {condition} is not supported") + + +class DatabricksVectorSearch(BasePydanticVectorStore): + """ + Vector store for Databricks Vector Search. 
+
+    """
+
+    stores_text: bool = True
+    text_column: Optional[str]
+    columns: Optional[List[str]]
+
+    _index: VectorSearchIndex = PrivateAttr()
+    _primary_key: str = PrivateAttr()
+    _index_type: str = PrivateAttr()
+    _delta_sync_index_spec: dict = PrivateAttr()
+    _direct_access_index_spec: dict = PrivateAttr()
+    _doc_id_to_pk: dict = PrivateAttr()
+
+    def __init__(
+        self,
+        index: VectorSearchIndex,
+        text_column: Optional[str] = None,
+        columns: Optional[List[str]] = None,
+    ) -> None:
+        try:
+            from databricks.vector_search.client import VectorSearchIndex
+        except ImportError:
+            raise ImportError(
+                "`databricks-vectorsearch` package not found: "
+                "please run `pip install databricks-vectorsearch`"
+            )
+        if not isinstance(index, VectorSearchIndex):
+            raise TypeError(
+                f"index must be of type `VectorSearchIndex`, not {type(index)}"
+            )
+
+        self._index = index
+
+        # unpack the index spec
+        index_description = _DatabricksIndexDescription.parse_obj(
+            self._index.describe()
+        )
+
+        self._primary_key = index_description.primary_key
+        self._index_type = index_description.index_type
+        self._delta_sync_index_spec = index_description.delta_sync_index_spec
+        self._direct_access_index_spec = index_description.direct_access_index_spec
+        self._doc_id_to_pk = {}
+
+        if columns is None:
+            columns = []
+        if "doc_id" not in columns:
+            # cap user-supplied columns at 19 to leave room for the required doc_id column
+            columns = columns[:19] + ["doc_id"]
+        super().__init__(
+            text_column=text_column,
+            columns=columns,
+        )
+
+        # initialize the column name for the text column in the delta table
+        if self._is_databricks_managed_embeddings():
+            index_source_column = self._embedding_source_column_name()
+
+            # check if input text column matches the source column of the index
+            if text_column is not None and text_column != index_source_column:
+                raise ValueError(
+                    f"text_column '{text_column}' does not match the "
+                    f"source column of the index: '{index_source_column}'."
+                )
+
+            self.text_column = index_source_column
+        else:
+            if text_column is None:
+                raise ValueError("text_column is required for self-managed embeddings.")
+            self.text_column = text_column
+
+        # Fold primary key and text column into columns if they're not empty.
+        columns_to_add = set(columns or [])
+        columns_to_add.add(self._primary_key)
+        columns_to_add.add(self.text_column)
+        columns_to_add -= {"", None}
+
+        self.columns = list(columns_to_add)
+
+        # If the index schema is known, validate that all specified columns are in it.
+        index_schema = self._index_schema()
+
+        if self._is_direct_access_index() and index_schema:
+            missing_columns = columns_to_add - set(index_schema.keys())
+
+            if missing_columns:
+                raise ValueError(
+                    f"columns missing from schema: {', '.join(missing_columns)}"
+                )
+
+    def add(
+        self,
+        nodes: List[BaseNode],
+        **add_kwargs: Any,
+    ) -> List[str]:
+        """Add nodes to the index.
+
+        Args:
+            nodes: List[BaseNode]: list of nodes with embeddings
+
+        """
+        if self._is_databricks_managed_embeddings():
+            raise ValueError(
+                "Adding nodes is not supported for Databricks-managed embeddings."
+            )
+
+        # construct the entries to upsert
+        entries = []
+        ids = []
+        for node in nodes:
+            node_id = node.node_id
+            metadata = node_to_metadata_dict(node, remove_text=True, flat_metadata=True)
+
+            # copy so the append below does not mutate self.columns
+            metadata_columns = list(self.columns or [])
+
+            # explicitly record doc_id as metadata (for delete)
+            metadata_columns.append("doc_id")
+
+            entry = {
+                self._primary_key: node_id,
+                self.text_column: node.get_content(),
+                self._embedding_vector_column_name(): node.get_embedding(),
+                **{
+                    col: metadata.get(col)
+                    for col in filter(
+                        lambda column: column
+                        not in (self._primary_key, self.text_column),
+                        metadata_columns,
+                    )
+                },
+            }
+            doc_id = metadata.get("doc_id")
+            self._doc_id_to_pk[doc_id] = list(
+                set(self._doc_id_to_pk.get(doc_id, []) + [node_id])  # noqa: RUF005
+            )  # associate this node_id with this doc_id
+
+            entries.append(entry)
+            ids.append(node_id)
+
+        # attempt the upsert
+        upsert_resp = self._index.upsert(
+            entries,
+        )
+
+        # return the successful IDs
+        response_status = upsert_resp.get("status")
+
+        failed_ids = (
+            set(upsert_resp["result"]["failed_primary_keys"] or [])
+            if "result" in upsert_resp
+            and "failed_primary_keys" in upsert_resp["result"]
+            else set()
+        )
+
+        if response_status not in ("PARTIAL_SUCCESS", "FAILURE") or not failed_ids:
+            return ids
+
+        elif response_status == "PARTIAL_SUCCESS":
+            _logger.warning(
+                "failed to add %d out of %d texts to the index",
+                len(failed_ids),
+                len(ids),
+            )
+
+        elif response_status == "FAILURE":
+            _logger.error("failed to add all %d texts to the index", len(ids))
+
+        return list(filter(lambda id_: id_ not in failed_ids, ids))
+
+    def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
+        """
+        Delete nodes with ref_doc_id.
+
+        Args:
+            ref_doc_id (str): The doc_id of the document to delete.
+
+        """
+        # get the node_ids associated with this doc_id
+        primary_keys = self._doc_id_to_pk.get(ref_doc_id, None)
+        if primary_keys is not None:
+            self._index.delete(
+                primary_keys=primary_keys,
+            )
+            # remove this doc_id from the doc_id-to-node_id map
+            self._doc_id_to_pk.pop(ref_doc_id)
+
+    def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult:
+        """Query index for top k most similar nodes."""
+        if self._is_databricks_managed_embeddings():
+            query_text = query.query_str
+            query_vector = None
+        else:
+            query_text = None
+            query_vector = cast(List[float], query.query_embedding)
+
+        if query.mode not in (
+            VectorStoreQueryMode.DEFAULT,
+            VectorStoreQueryMode.HYBRID,
+        ):
+            raise ValueError(
+                "Only DEFAULT and HYBRID modes are supported for Databricks Vector Search."
+            )
+
+        if query.filters is not None:
+            filters = _to_databricks_filter(query.filters)
+        else:
+            filters = None
+
+        search_resp = self._index.similarity_search(
+            columns=self.columns,
+            query_text=query_text,
+            query_vector=query_vector,
+            filters=filters,
+            num_results=query.similarity_top_k,
+        )
+
+        columns = [
+            col["name"] for col in search_resp.get("manifest", {}).get("columns", [])
+        ]
+        top_k_nodes = []
+        top_k_ids = []
+        top_k_scores = []
+        for result in search_resp.get("result", {}).get("data_array", []):
+            doc_id = result[columns.index(self._primary_key)]
+            text_content = result[columns.index(self.text_column)]
+            # the last column returned by similarity_search is the relevance score
+            metadata = {
+                col: value
+                for col, value in zip(columns[:-1], result[:-1])
+                if col not in [self._primary_key, self.text_column]
+            }
+            metadata[self._primary_key] = doc_id
+            score = result[-1]
+            node = TextNode(
+                text=text_content, id_=doc_id, metadata=metadata
+            )  # TODO: start_char, end_char, relationships? See the Pinecone store for reference:
+            # https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/vector_stores/llama-index-vector-stores-pinecone/llama_index/vector_stores/pinecone/base.py
+
+            top_k_ids.append(doc_id)
+            top_k_nodes.append(node)
+            top_k_scores.append(score)
+
+        return VectorStoreQueryResult(
+            nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids
+        )
+
+    @property
+    def client(self) -> Any:
+        """Return the underlying Databricks VectorSearchIndex."""
+        return self._index
+
+    # The remaining utilities (and snippets of the above) are taken from
+    # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/databricks_vector_search.py
+    def _index_schema(self) -> Optional[dict]:
+        """Return the index schema as a dictionary.
+        Return None if no schema is found.
+        """
+        if self._is_direct_access_index():
+            schema_json = self._direct_access_index_spec.get("schema_json")
+            if schema_json is not None:
+                return json.loads(schema_json)
+        return None
+
+    def _embedding_vector_column_name(self) -> Optional[str]:
+        """Return the name of the embedding vector column.
+        None if the index is not a self-managed embedding index.
+        """
+        return self._embedding_vector_column().get("name")
+
+    def _embedding_vector_column(self) -> dict:
+        """Return the embedding vector column configs as a dictionary.
+        Empty if the index is not a self-managed embedding index.
+        """
+        index_spec = (
+            self._delta_sync_index_spec
+            if self._is_delta_sync_index()
+            else self._direct_access_index_spec
+        )
+        return next(iter(index_spec.get("embedding_vector_columns") or []), {})
+
+    def _embedding_source_column_name(self) -> Optional[str]:
+        """Return the name of the embedding source column.
+        None if the index is not a Databricks-managed embedding index.
+        """
+        return self._embedding_source_column().get("name")
+
+    def _embedding_source_column(self) -> dict:
+        """Return the embedding source column configs as a dictionary.
+        Empty if the index is not a Databricks-managed embedding index.
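+
+        A typical return value looks like the following (illustrative; the
+        actual keys come from the index spec returned by ``index.describe()``):
+
+            {"name": "text", "embedding_model_endpoint_name": "databricks-bge-large-en"}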
+ """ + return next( + iter(self._delta_sync_index_spec.get("embedding_source_columns") or []), + {}, + ) + + def _is_delta_sync_index(self) -> bool: + """Return True if the index is a delta-sync index.""" + return self._index_type == _DatabricksIndexType.DELTA_SYNC + + def _is_direct_access_index(self) -> bool: + """Return True if the index is a direct-access index.""" + return self._index_type == _DatabricksIndexType.DIRECT_ACCESS + + def _is_databricks_managed_embeddings(self) -> bool: + """Return True if the embeddings are managed by Databricks Vector Search.""" + return ( + self._is_delta_sync_index() + and self._embedding_source_column_name() is not None + ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml new file mode 100644 index 00000000000000..0b45006cde64b7 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml @@ -0,0 +1,62 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.vector_stores.databricks" + +[tool.llamahub.class_authors] +DatabricksVectorSearch = "NickhilN" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Alberto Da Costa ", "Nickhil Nabar