From 3177eb98260186e68bc7a04849067c1a75ff0538 Mon Sep 17 00:00:00 2001 From: nnabar Date: Thu, 15 Feb 2024 16:36:19 +0000 Subject: [PATCH 1/9] Databricks Vector Search integration from BAM Elevate --- .../.gitignore | 153 +++++++ .../BUILD | 3 + .../Makefile | 17 + .../README.md | 1 + .../databricks-vector-search/BUILD | 1 + .../databricks-vector-search/__init__.py | 5 + .../databricks-vector-search/base.py | 388 ++++++++++++++++++ .../databricks-vector-search/utils.py | 15 + .../pyproject.toml | 60 +++ .../tests/BUILD | 1 + .../tests/__init__.py | 0 ..._vector_stores_databricks_vector_search.py | 7 + 12 files changed, 651 insertions(+) create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/BUILD create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/__init__.py create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/test_vector_stores_databricks_vector_search.py diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+ sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md new file mode 100644 index 0000000000000..837b6aaec7fbe --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md @@ -0,0 +1 @@ +# LlamaIndex Vector_Stores Integration: Databricks Vector Search diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py new file mode 100644 index 0000000000000..041494b47fb05 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py @@ -0,0 +1,5 @@ +from llama_index.vector_stores.databricks_vector_search.base import ( + DatabricksVectorSearch, +) + +__all__ = ["DatabricksVectorSearch"] diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py new file mode 100644 index 0000000000000..9ef9d0298d3d8 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py @@ -0,0 +1,388 @@ +""" +Databricks Vector Search index. + +Supports Delta Sync indexes and Direct Access indexes in Databricks Vector Search. 
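+
+A Direct Access index is written to directly by the caller (the
+self-managed-embeddings path used by this store's ``add`` method), while a
+Delta Sync index is kept in sync with a source Delta table, optionally with
+Databricks-managed embeddings. An illustrative sketch of creating a Direct
+Access index (mirroring the demo notebook added later in this series; all
+names are placeholders):
+
+    from databricks.vector_search.client import VectorSearchClient
+
+    client = VectorSearchClient()
+    index = client.create_direct_access_index(
+        endpoint_name="my_endpoint",
+        index_name="my_catalog.my_schema.my_table",
+        primary_key="id",
+        embedding_dimension=1536,
+        embedding_vector_column="vector",
+        schema={"id": "string", "vector": "array<float>", "text": "string"},
+    )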
+""" + +import json +import logging +from typing import ( + Any, + List, + Dict, + Optional, + TYPE_CHECKING, + cast, +) +from enum import Enum + +from pydantic import BaseModel, Field + +from llama_index.core.vector_stores.types import ( + BasePydanticVectorStore, + MetadataFilters, + FilterCondition, + FilterOperator, + VectorStoreQuery, + VectorStoreQueryResult, + VectorStoreQueryMode, +) +from llama_index.core.vector_stores.utils import node_to_metadata_dict +from llama_index.core.schema import TextNode, BaseNode +from llama_index.core.bridge.pydantic import PrivateAttr + +from llama_index.vector_stores.databricks_vector_search.utils import _import_databricks + + +if TYPE_CHECKING: + from databricks.vector_search.client import VectorSearchIndex + + +class _DatabricksIndexType(str, Enum): + DIRECT_ACCESS = "DIRECT_ACCESS" + DELTA_SYNC = "DELTA_SYNC" + + +class _DatabricksIndexDescription(BaseModel): + primary_key: str + index_type: _DatabricksIndexType + delta_sync_index_spec: Dict = Field(default_factory=dict) + direct_access_index_spec: Dict = Field(default_factory=dict) + + +_logger = logging.getLogger(__name__) + + +_filter_translation = { + FilterOperator.EQ: "", + FilterOperator.GT: ">", + FilterOperator.LT: "<", + FilterOperator.NE: "NOT", + FilterOperator.GTE: ">=", + FilterOperator.LTE: "<=", + FilterOperator.IN: "", + FilterOperator.NIN: "NOT", +} + + +def _transform_databricks_filter_operator(operator: FilterOperator) -> str: + try: + return _filter_translation[operator] + + except KeyError as e: + raise ValueError(f"filter operator {operator} is not supported") + + +def _to_databricks_filter(standard_filters: MetadataFilters) -> dict: + """Convert from standard dataclass to databricks filter dict.""" + filters = {} + + condition = standard_filters.condition or FilterOperator.AND + + for filter in standard_filters.filters: + value = filter.value if isinstance(filter.value, list) else [filter.value] + + transformed_operator = _transform_databricks_filter_operator(filter.operator) + + if transformed_operator == "": + key = filter.key + + else: + key = f"{filter.key} {transformed_operator}" + + if key in filters: + raise ValueError(f"filter condition already exists for {key}") + + filters[key] = value + + if condition == FilterCondition.AND: + return filters + + elif condition == FilterCondition.OR: + keys, values = zip(*filters.items()) + return {" OR ".join(keys): values} + + raise ValueError(f"condition {condition} is not supported") + + +class DatabricksVectorSearch(BasePydanticVectorStore): + """ + Vector store for Databricks Vector Search. 
+ + Install ``databricks-vectorsearch`` package using the following in a Databricks notebook: + %pip install databricks-vectorsearch + dbutils.library.restartPython() + """ + + stores_text: bool = True + text_column: Optional[str] + columns: Optional[List[str]] + + _index: VectorSearchIndex = PrivateAttr() + _primary_key: str = PrivateAttr() + _index_type: str = PrivateAttr() + _delta_sync_index_spec: dict = PrivateAttr() + _direct_access_index_spec: dict = PrivateAttr() + + def __init__( + self, + index: VectorSearchIndex, + text_column: Optional[str] = None, + columns: Optional[List[str]] = None, + ) -> None: + _import_databricks() + + if not isinstance(index, VectorSearchIndex): + raise TypeError( + f"index must be of type `VectorSearchIndex`, not {type(index)}" + ) + + self._index = index + + # unpack the index spec + index_description = _DatabricksIndexDescription.parse_obj( + self._index.describe() + ) + + self._primary_key = index_description.primary_key + self._index_type = index_description.index_type + self._delta_sync_index_spec = index_description.delta_sync_index_spec + self._direct_access_index_spec = index_description.direct_access_index_spec + + super().__init__( + text_column=text_column, columns=columns, + ) + + # initialize the column name for the text column in the delta table + if self._is_databricks_managed_embeddings(): + index_source_column = self._embedding_source_column_name() + + # check if input text column matches the source column of the index + if text_column is not None and text_column != index_source_column: + raise ValueError( + f"text_column '{text_column}' does not match with the " + f"source column of the index: '{index_source_column}'." + ) + + self.text_column = index_source_column + else: + if text_column is None: + raise ValueError("text_column is required for self-managed embeddings.") + self.text_column = text_column + + # Fold primary key and text column into columns if they're not empty. + columns_to_add = set(columns or []) + columns_to_add.add(self._primary_key) + columns_to_add.add(self.text_column) + columns_to_add -= {"", None} + + self.columns = list(columns_to_add) + + # If the index schema is known, all our columns should be in that index. + # Validate specified columns are in the index + index_schema = self._index_schema() + + if self._is_direct_access_index() and index_schema: + missing_columns = columns_to_add - set(index_schema.keys()) + + if missing_columns: + raise ValueError( + f"columns missing from schema: {', '.join(missing_columns)}" + ) + + def add(self, nodes: List[BaseNode], **add_kwargs: Any,) -> List[str]: + """Add nodes to index. + + Args: + nodes: List[BaseNode]: list of nodes with embeddings + + """ + if self._is_databricks_managed_embeddings(): + raise ValueError( + "Adding nodes is not supported for Databricks-managed embeddings." 
+ ) + + # construct the entries to upsert + entries = [] + ids = [] + for node in nodes: + node_id = node.node_id + metadata = node_to_metadata_dict(node, remove_text=True, flat_metadata=True) + entry = { + self._primary_key: node_id, + self.text_column: node.get_content(), + self._embedding_vector_column_name(): node.get_embedding(), + **{ + col: metadata.get(col) + for col in filter( + lambda column: column + not in (self._primary_key, self.text_column), + self.columns or [], + ) + }, + } + + entries.append(entry) + ids.append(node_id) + + # attempt the upsert + upsert_resp = self._index.upsert(entries,) + + # return the successful IDs + response_status = upsert_resp.get("status") + + failed_ids = ( + set(upsert_resp["result"]["failed_primary_keys"] or []) + if "result" in upsert_resp + and "failed_primary_keys" in upsert_resp["result"] + else set() + ) + + if response_status not in ("PARTIAL_SUCCESS", "FAILURE") or not failed_ids: + return ids + + elif response_status == "PARTIAL_SUCCESS": + _logger.warning( + "failed to add %d out of %d texts to the index", + len(failed_ids), + len(ids), + ) + + elif response_status == "FAILURE": + _logger.error("failed to add all %d texts to the index", len(ids)) + + return list(filter(lambda id_: id_ not in failed_ids, ids)) + + def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: + """ + Delete nodes with ref_doc_id. + + Args: + ref_doc_id (str): The doc_id of the document to delete. + + """ + self._index.delete(primary_keys=[ref_doc_id],) + + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + """Query index for top k most similar nodes. + """ + if self._is_databricks_managed_embeddings(): + query_text = query.query_str + query_vector = None + else: + query_text = None + query_vector = cast(List[float], query.query_embedding) + + if query.mode not in ( + VectorStoreQueryMode.DEFAULT, + VectorStoreQueryMode.HYBRID, + ): + raise ValueError( + "Only DEFAULT and HYBRID modes are supported for Databricks Vector Search." + ) + + if query.filters is not None: + filters = _to_databricks_filter(query.filters) + else: + filters = None + + search_resp = self._index.similarity_search( + columns=self.columns, + query_text=query_text, + query_vector=query_vector, + filters=filters, + num_results=query.similarity_top_k, + ) + + columns = [ + col["name"] + for col in search_resp.get("manifest", dict()).get("columns", []) + ] + top_k_nodes = [] + top_k_ids = [] + top_k_scores = [] + for result in search_resp.get("result", dict()).get("data_array", []): + doc_id = result[columns.index(self._primary_key)] + text_content = result[columns.index(self.text_column)] + metadata = { + col: value + for col, value in zip(columns[:-1], result[:-1]) + if col not in [self._primary_key, self.text_column] + } + metadata[self._primary_key] = doc_id + score = result[-1] + node = TextNode( + text=text_content, id_=doc_id, metadata=metadata + ) # TODO star_char, end_char, relationships? 
https://github.com/run-llama/llama_index/blob/main/llama-index-integrations/vector_stores/llama-index-vector-stores-pinecone/llama_index/vector_stores/pinecone/base.py + + top_k_ids.append(doc_id) + top_k_nodes.append(node) + top_k_scores.append(score) + + return VectorStoreQueryResult( + nodes=top_k_nodes, similarities=top_k_scores, ids=top_k_ids + ) + + @property + def client(self) -> Any: + """Return VectorStoreIndex""" + return self._index + + # The remaining utilities (and snippets of the above) are taken from + # https://github.com/langchain-ai/langchain/blob/master/libs/community/langchain_community/vectorstores/databricks_vector_search.py + def _index_schema(self) -> Optional[dict]: + """Return the index schema as a dictionary. + Return None if no schema found. + """ + if self._is_direct_access_index(): + schema_json = self._direct_access_index_spec.get("schema_json") + if schema_json is not None: + return json.loads(schema_json) + return None + + def _embedding_vector_column_name(self) -> Optional[str]: + """Return the name of the embedding vector column. + None if the index is not a self-managed embedding index. + """ + return self._embedding_vector_column().get("name") + + def _embedding_vector_column(self) -> dict: + """Return the embedding vector column configs as a dictionary. + Empty if the index is not a self-managed embedding index. + """ + index_spec = ( + self._delta_sync_index_spec + if self._is_delta_sync_index() + else self._direct_access_index_spec + ) + return next(iter(index_spec.get("embedding_vector_columns") or list()), dict()) + + def _embedding_source_column_name(self) -> Optional[str]: + """Return the name of the embedding source column. + None if the index is not a Databricks-managed embedding index. + """ + return self._embedding_source_column().get("name") + + def _embedding_source_column(self) -> dict: + """Return the embedding source column configs as a dictionary. + Empty if the index is not a Databricks-managed embedding index. + """ + return next( + iter(self._delta_sync_index_spec.get("embedding_source_columns") or list()), + dict(), + ) + + def _is_delta_sync_index(self) -> bool: + """Return True if the index is a delta-sync index.""" + return self._index_type == _DatabricksIndexType.DELTA_SYNC + + def _is_direct_access_index(self) -> bool: + """Return True if the index is a direct-access index.""" + return self._index_type == _DatabricksIndexType.DIRECT_ACCESS + + def _is_databricks_managed_embeddings(self) -> bool: + """Return True if the embeddings are managed by Databricks Vector Search.""" + return ( + self._is_delta_sync_index() + and self._embedding_source_column_name() is not None + ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py new file mode 100644 index 0000000000000..e2fef63a28558 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py @@ -0,0 +1,15 @@ +from typing import Any + + +def _import_databricks() -> Any: + """ + Try to import databricks.vector_search.client.VectorSearchIndex. If databricks module it's not already installed, instruct user how to install. 
+ """ + + try: + from databricks.vector_search.client import VectorSearchIndex + except ImportError: + raise ImportError( + "`databricks-vectorsearch` package not found: " + "please run `pip install databricks-vectorsearch`" + ) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml new file mode 100644 index 0000000000000..d2edc205d0252 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml @@ -0,0 +1,60 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +classes = ["DatabricksVectorSearch"] +contains_example = false +import_path = "llama_index.vector_stores.databricks_vector_search" + +[tool.mypy] +disallow_untyped_defs = true +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.8" + +[tool.poetry] +authors = ["Nickhil Nabar "] +description = "llama-index vector_stores databricks vector search integration" +license = "MIT" +name = "llama-index-vector-stores-databricks-vector-search" +readme = "README.md" +version = "0.1.1" + +[tool.poetry.dependencies] +python = ">=3.8.1,<3.12" +llama-index-core = "^0.10.1" +databricks-vectorsearch + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = "^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6" + +[[tool.poetry.packages]] +include = "llama_index/" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/BUILD new file mode 100644 index 0000000000000..dabf212d7e716 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/BUILD @@ -0,0 +1 @@ +python_tests() diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/test_vector_stores_databricks_vector_search.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/test_vector_stores_databricks_vector_search.py new file mode 100644 index 0000000000000..d8b07700ce3e6 --- /dev/null +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/tests/test_vector_stores_databricks_vector_search.py @@ -0,0 +1,7 @@ +from llama_index.core.vector_stores.types import BasePydanticVectorStore +from 
llama_index.vector_stores.databricks_vector_search import DatabricksVectorSearch + + +def test_class(): + names_of_base_classes = [b.__name__ for b in DatabricksVectorSearch.__mro__] + assert BasePydanticVectorStore.__name__ in names_of_base_classes From 18d700c1ae32b98646caac2361b3dbf0f6a2dddf Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Fri, 16 Feb 2024 00:29:22 -0500 Subject: [PATCH 2/9] cr --- .../pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml index d2edc205d0252..5296cebaae2a0 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml @@ -19,7 +19,7 @@ ignore_missing_imports = true python_version = "3.8" [tool.poetry] -authors = ["Nickhil Nabar "] +authors = ["Alberto Da Costa ", "Nickhil Nabar Date: Fri, 16 Feb 2024 10:11:10 -0500 Subject: [PATCH 3/9] cr --- .../databricks-vector-search/base.py | 34 ++++++++++++------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py index 9ef9d0298d3d8..1089f770b5dbb 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py @@ -112,6 +112,7 @@ class DatabricksVectorSearch(BasePydanticVectorStore): Install ``databricks-vectorsearch`` package using the following in a Databricks notebook: %pip install databricks-vectorsearch dbutils.library.restartPython() + """ stores_text: bool = True @@ -150,7 +151,8 @@ def __init__( self._direct_access_index_spec = index_description.direct_access_index_spec super().__init__( - text_column=text_column, columns=columns, + text_column=text_column, + columns=columns, ) # initialize the column name for the text column in the delta table @@ -190,7 +192,11 @@ def __init__( f"columns missing from schema: {', '.join(missing_columns)}" ) - def add(self, nodes: List[BaseNode], **add_kwargs: Any,) -> List[str]: + def add( + self, + nodes: List[BaseNode], + **add_kwargs: Any, + ) -> List[str]: """Add nodes to index. Args: @@ -226,7 +232,9 @@ def add(self, nodes: List[BaseNode], **add_kwargs: Any,) -> List[str]: ids.append(node_id) # attempt the upsert - upsert_resp = self._index.upsert(entries,) + upsert_resp = self._index.upsert( + entries, + ) # return the successful IDs response_status = upsert_resp.get("status") @@ -261,11 +269,12 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: ref_doc_id (str): The doc_id of the document to delete. """ - self._index.delete(primary_keys=[ref_doc_id],) + self._index.delete( + primary_keys=[ref_doc_id], + ) def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: - """Query index for top k most similar nodes. 
- """ + """Query index for top k most similar nodes.""" if self._is_databricks_managed_embeddings(): query_text = query.query_str query_vector = None @@ -295,13 +304,12 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul ) columns = [ - col["name"] - for col in search_resp.get("manifest", dict()).get("columns", []) + col["name"] for col in search_resp.get("manifest", {}).get("columns", []) ] top_k_nodes = [] top_k_ids = [] top_k_scores = [] - for result in search_resp.get("result", dict()).get("data_array", []): + for result in search_resp.get("result", {}).get("data_array", []): doc_id = result[columns.index(self._primary_key)] text_content = result[columns.index(self.text_column)] metadata = { @@ -325,7 +333,7 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul @property def client(self) -> Any: - """Return VectorStoreIndex""" + """Return VectorStoreIndex.""" return self._index # The remaining utilities (and snippets of the above) are taken from @@ -355,7 +363,7 @@ def _embedding_vector_column(self) -> dict: if self._is_delta_sync_index() else self._direct_access_index_spec ) - return next(iter(index_spec.get("embedding_vector_columns") or list()), dict()) + return next(iter(index_spec.get("embedding_vector_columns") or []), {}) def _embedding_source_column_name(self) -> Optional[str]: """Return the name of the embedding source column. @@ -368,8 +376,8 @@ def _embedding_source_column(self) -> dict: Empty if the index is not a Databricks-managed embedding index. """ return next( - iter(self._delta_sync_index_spec.get("embedding_source_columns") or list()), - dict(), + iter(self._delta_sync_index_spec.get("embedding_source_columns") or []), + {}, ) def _is_delta_sync_index(self) -> bool: From d45cc3dcb9b1216fd2933056c0d9c861c95e38f2 Mon Sep 17 00:00:00 2001 From: Haotian Zhang Date: Fri, 16 Feb 2024 23:19:02 -0500 Subject: [PATCH 4/9] cr --- .../databricks-vector-search/base.py | 11 +++++++---- .../databricks-vector-search/utils.py | 15 --------------- 2 files changed, 7 insertions(+), 19 deletions(-) delete mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py index 1089f770b5dbb..3f3ec169ee20e 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py @@ -31,8 +31,6 @@ from llama_index.core.schema import TextNode, BaseNode from llama_index.core.bridge.pydantic import PrivateAttr -from llama_index.vector_stores.databricks_vector_search.utils import _import_databricks - if TYPE_CHECKING: from databricks.vector_search.client import VectorSearchIndex @@ -131,8 +129,13 @@ def __init__( text_column: Optional[str] = None, columns: Optional[List[str]] = None, ) -> None: - _import_databricks() - + try: + from databricks.vector_search.client import VectorSearchIndex + except ImportError: + raise ImportError( + "`databricks-vectorsearch` package not found: " + "please 
run `pip install databricks-vectorsearch`" + ) if not isinstance(index, VectorSearchIndex): raise TypeError( f"index must be of type `VectorSearchIndex`, not {type(index)}" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py deleted file mode 100644 index e2fef63a28558..0000000000000 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/utils.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Any - - -def _import_databricks() -> Any: - """ - Try to import databricks.vector_search.client.VectorSearchIndex. If databricks module it's not already installed, instruct user how to install. - """ - - try: - from databricks.vector_search.client import VectorSearchIndex - except ImportError: - raise ImportError( - "`databricks-vectorsearch` package not found: " - "please run `pip install databricks-vectorsearch`" - ) From 17e9639dadf768bc1b20634abeeb44a0cec6a3dd Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 20 Feb 2024 20:44:13 -0600 Subject: [PATCH 5/9] pyproject.toml --- .../pyproject.toml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml index 5296cebaae2a0..8151bb9da3521 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml @@ -8,10 +8,12 @@ check-hidden = true skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" [tool.llamahub] -classes = ["DatabricksVectorSearch"] contains_example = false import_path = "llama_index.vector_stores.databricks_vector_search" +[tool.llamahub.class_authors] +DatabricksVectorSearch = "NickhilN" + [tool.mypy] disallow_untyped_defs = true exclude = ["_static", "build", "examples", "notebooks", "venv"] From 258d65a6dc003fae31086ac1954a69496f8b6b56 Mon Sep 17 00:00:00 2001 From: Nickhil Nabar Date: Tue, 12 Mar 2024 17:45:59 +0000 Subject: [PATCH 6/9] added linting and documentation/examples to the DatabricksVectorSearch module --- .../DatabricksVectorSearchDemo.ipynb | 289 ++++++++++++++++++ docs/module_guides/storing/vector_stores.md | 2 + 2 files changed, 291 insertions(+) create mode 100644 docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb new file mode 100644 index 0000000000000..f5b043fc80d2d --- /dev/null +++ b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb @@ -0,0 +1,289 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "2f685925-940a-418f-9b00-5500f8878fc3", + "showTitle": false, + "title": "" + } + }, + "source": [ + "# Databricks Vector Search\n", + "\n", + "Databricks Vector Search is a vector database that is built into the Databricks Intelligence Platform and integrated with its governance and productivity tools. 
Full docs here: https://docs.databricks.com/en/generative-ai/vector-search.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install llama-index and databricks-vectorsearch. You must be inside a Databricks runtime to use the Vector Search python client." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8289764f-1001-4eb7-b162-92490746ebe8", + "showTitle": true, + "title": "Install llama-index and databricks-vectorsearch client" + } + }, + "outputs": [], + "source": [ + "%pip install llama-index\n", + "%pip install databricks-vectorsearch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import databricks dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "75dd1adb-1937-49d2-aef1-393886271d46", + "showTitle": true, + "title": "Import Databricks dependencies" + } + }, + "outputs": [], + "source": [ + "from databricks.vector_search.client import VectorSearchIndex, VectorSearchClient" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import LlamaIndex dependencies" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b4ca851b-b0ee-4ea6-a31c-755c07e16d51", + "showTitle": true, + "title": "Import LlamaIndex dependencies" + } + }, + "outputs": [], + "source": [ + "from llama_index.core import (\n", + " VectorStoreIndex,\n", + " SimpleDirectoryReader,\n", + " ServiceContext,\n", + " StorageContext,\n", + ")\n", + "from llama_index.vector_stores.databricks_vector_search import DatabricksVectorSearch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load example data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "dd06759d-0070-48a8-aa74-3d46b12457f8", + "showTitle": true, + "title": "Load example data" + } + }, + "outputs": [], + "source": [ + "!mkdir -p 'data/paul_graham/'\n", + "!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Read the data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "7a82b624-bffb-453b-b5c6-f8414566dc2f", + "showTitle": true, + "title": "Read the data" + } + }, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader(\"./data/paul_graham/\").load_data()\n", + "print(f\"Total documents: {len(documents)}\")\n", + "print(f\"First document, id: {documents[0].doc_id}\")\n", + "print(f\"First document, hash: {documents[0].hash}\")\n", + "print(\n", + " \"First document, text\"\n", + " f\" ({len(documents[0].text)} characters):\\n{'='*20}\\n{documents[0].text[:360]} ...\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a Databricks Vector 
Search endpoint which will serve the index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "99c56854-c182-4dfe-bc08-cee8263461ee", + "showTitle": true, + "title": "Create the Databricks Vector Search endpoint" + } + }, + "outputs": [], + "source": [ + "# Create a vector search endpoint\n", + "client = VectorSearchClient()\n", + "client.create_endpoint(\n", + " name=\"llamaindex_dbx_vector_store_test_endpoint\", endpoint_type=\"STANDARD\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create the Databricks Vector Search index, and build it from the documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6abe427b-79ca-4c0c-8e58-ba5f670294ae", + "showTitle": true, + "title": "Build the index from the documents" + } + }, + "outputs": [], + "source": [ + "# Create a vector search index\n", + "# it must be placed inside a Unity Catalog-enabled schema\n", + "\n", + "# We'll use self-managed embeddings (i.e. managed by LlamaIndex) rather than a Databricks-managed index\n", + "databricks_index = client.create_direct_access_index(\n", + " endpoint_name=\"llamaindex_dbx_vector_store_test_endpoint\",\n", + " index_name=\"my_catalog.my_schema.my_test_table\",\n", + " primary_key=\"my_primary_key_name\",\n", + " embedding_dimension=1536, # match the embeddings model dimension you're going to use\n", + " embedding_vector_column=\"my_embedding_vector_column_name\", # you name this anything you want - it'll be picked up by the LlamaIndex class\n", + " schema={\n", + " \"my_primary_key_name\": \"string\",\n", + " \"my_embedding_vector_column_name\": \"array\",\n", + " \"text\": \"string\", # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text.\n", + " # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n", + " },\n", + ")\n", + "\n", + "databricks_vector_store = DatabricksVectorSearch(\n", + " index=databricks_index, text_column=\"text\"\n", + ") # text_column is required for self-managed embeddings\n", + "storage_context = StorageContext.from_defaults(vector_store=databricks_vector_store)\n", + "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Query the index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "3e8c18f7-db8c-45c1-bb82-b75ad2307824", + "showTitle": true, + "title": "Query using the index" + } + }, + "outputs": [], + "source": [ + "query_engine = index.as_query_engine()\n", + "response = query_engine.query(\"Why did the author choose to work on AI?\")\n", + "\n", + "print(response.response)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "dashboards": [], + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 4 + }, + "notebookName": "Databricks Vector Search Demo (LlamaIndex Integration)", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff 
--git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md
index e09e63318e9db..97a3d98a04a59 100644
--- a/docs/module_guides/storing/vector_stores.md
+++ b/docs/module_guides/storing/vector_stores.md
@@ -22,6 +22,7 @@ We are actively adding more integrations and improving feature coverage for each
 | ChatGPT Retrieval Plugin | aggregator | | | ✓ | ✓ | |
 | Chroma | self-hosted | ✓ | | ✓ | ✓ | |
 | DashVector | cloud | ✓ | ✓ | ✓ | ✓ | |
+| Databricks | cloud | ✓ | | ✓ | ✓ |
 | Deeplake | self-hosted / cloud | ✓ | | ✓ | ✓ | |
 | DocArray | aggregator | ✓ | | ✓ | ✓ | |
 | DynamoDB | cloud | | | ✓ | | |
@@ -67,6 +68,7 @@ maxdepth: 1
 /examples/vector_stores/ChromaIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo.ipynb
 /examples/vector_stores/DashvectorIndexDemo-Hybrid.ipynb
+/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
 /examples/vector_stores/DeepLakeIndexDemo.ipynb
 /examples/vector_stores/DocArrayHnswIndexDemo.ipynb
 /examples/vector_stores/DocArrayInMemoryIndexDemo.ipynb

From 0c54272fb2f019467804c822efb035a4d1cda970 Mon Sep 17 00:00:00 2001
From: Nickhil Nabar
Date: Wed, 13 Mar 2024 00:01:12 +0000
Subject: [PATCH 7/9] updated delete functionality to track document ID to
 node ID mappings and delete appropriately. clarified documentation around
 declaring doc_id and other metadata fields in the schema and columns keyword
 argument

---
 .../DatabricksVectorSearchDemo.ipynb | 7 +++++--
 .../databricks-vector-search/base.py | 19 +++++++++++++++----
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
index f5b043fc80d2d..58044d3d54c5c 100644
--- a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
+++ b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb
@@ -230,13 +230,16 @@
 " schema={\n",
 " \"my_primary_key_name\": \"string\",\n",
 " \"my_embedding_vector_column_name\": \"array\",\n",
- " \"text\": \"string\", # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text.\n",
+ " \"text\": \"string\", # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text,\n",
+ " \"doc_id\": \"string\", # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
 " # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
+ " # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
 " },\n",
 ")\n",
 "\n",
 "databricks_vector_store = DatabricksVectorSearch(\n",
- " index=databricks_index, text_column=\"text\"\n",
+ " index=databricks_index, text_column=\"text\",\n",
+ " columns=None, # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
 ") # text_column is required for self-managed embeddings\n",
 "storage_context = StorageContext.from_defaults(vector_store=databricks_vector_store)\n",
 "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)"

diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py
index 3f3ec169ee20e..7178c9a18a77e 100644
---
a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py @@ -122,6 +122,7 @@ class DatabricksVectorSearch(BasePydanticVectorStore): _index_type: str = PrivateAttr() _delta_sync_index_spec: dict = PrivateAttr() _direct_access_index_spec: dict = PrivateAttr() + _doc_id_to_pk: dict = PrivateAttr() def __init__( self, @@ -152,7 +153,12 @@ def __init__( self._index_type = index_description.index_type self._delta_sync_index_spec = index_description.delta_sync_index_spec self._direct_access_index_spec = index_description.direct_access_index_spec + self._doc_id_to_pk = {} + if columns is None: + columns = [] + if "doc_id" not in columns: + columns = columns[:19] + ["doc_id"] super().__init__( text_column=text_column, columns=columns, @@ -226,10 +232,12 @@ def add( for col in filter( lambda column: column not in (self._primary_key, self.text_column), - self.columns or [], + self.columns + ["doc_id"] or ["doc_id"], # explicitly record doc_id as metadata (for delete) ) }, } + doc_id = metadata.get("doc_id") + self._doc_id_to_pk[doc_id] = list(set(self._doc_id_to_pk.get(doc_id, []) + [node_id])) # associate this node_id with this doc_id entries.append(entry) ids.append(node_id) @@ -272,9 +280,12 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None: ref_doc_id (str): The doc_id of the document to delete. """ - self._index.delete( - primary_keys=[ref_doc_id], - ) + primary_keys = self._doc_id_to_pk.get(ref_doc_id, None) # get the node_ids associated with the doc_id + if primary_keys is not None: + self._index.delete( + primary_keys=primary_keys, + ) + self._doc_id_to_pk.pop(ref_doc_id) # remove this doc_id from the doc_id-to-node_id map def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: """Query index for top k most similar nodes.""" From 50b4d1183f6c8126b378808be73cc897b74ec6fb Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Thu, 14 Mar 2024 16:31:53 -0600 Subject: [PATCH 8/9] nits --- .../DatabricksVectorSearchDemo.ipynb | 129 ++++-------------- .../BUILD | 3 - .../databricks-vector-search/__init__.py | 5 - .../.gitignore | 0 .../BUILD | 4 + .../Makefile | 0 .../README.md | 0 .../vector_stores/databricks}/BUILD | 0 .../vector_stores/databricks/__init__.py | 5 + .../vector_stores/databricks}/base.py | 28 ++-- .../pyproject.toml | 4 +- .../tests/BUILD | 0 .../tests/__init__.py | 0 ..._vector_stores_databricks_vector_search.py | 2 +- 14 files changed, 55 insertions(+), 125 deletions(-) delete mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD delete mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/.gitignore (100%) create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/Makefile (100%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => 
llama-index-vector-stores-databricks}/README.md (100%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search => llama-index-vector-stores-databricks/llama_index/vector_stores/databricks}/BUILD (100%) create mode 100644 llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search => llama-index-vector-stores-databricks/llama_index/vector_stores/databricks}/base.py (94%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/pyproject.toml (91%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/tests/BUILD (100%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/tests/__init__.py (100%) rename llama-index-integrations/vector_stores/{llama-index-vector-stores-databricks-vector-search => llama-index-vector-stores-databricks}/tests/test_vector_stores_databricks_vector_search.py (73%) diff --git a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb index 58044d3d54c5c..e48cfd9267a49 100644 --- a/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb +++ b/docs/examples/vector_stores/DatabricksVectorSearchDemo.ipynb @@ -2,15 +2,7 @@ "cells": [ { "cell_type": "markdown", - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": {}, - "inputWidgets": {}, - "nuid": "2f685925-940a-418f-9b00-5500f8878fc3", - "showTitle": false, - "title": "" - } - }, + "metadata": {}, "source": [ "# Databricks Vector Search\n", "\n", @@ -27,21 +19,10 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "8289764f-1001-4eb7-b162-92490746ebe8", - "showTitle": true, - "title": "Install llama-index and databricks-vectorsearch client" - } - }, + "metadata": {}, "outputs": [], "source": [ - "%pip install llama-index\n", + "%pip install llama-index llama-index-vector-stores-databricks\n", "%pip install databricks-vectorsearch" ] }, @@ -55,21 +36,13 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "75dd1adb-1937-49d2-aef1-393886271d46", - "showTitle": true, - "title": "Import Databricks dependencies" - } - }, + "metadata": {}, "outputs": [], "source": [ - "from databricks.vector_search.client import VectorSearchIndex, VectorSearchClient" + "from databricks.vector_search.client import (\n", + " VectorSearchIndex,\n", + " VectorSearchClient,\n", + ")" ] }, { @@ -82,18 +55,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "application/vnd.databricks.v1+cell": { - "cellMetadata": { - "byteLimit": 2048000, - "rowLimit": 10000 - }, - "inputWidgets": {}, - "nuid": "b4ca851b-b0ee-4ea6-a31c-755c07e16d51", - "showTitle": true, - "title": "Import LlamaIndex dependencies" - } - }, + "metadata": {}, "outputs": [], "source": [ "from llama_index.core 
import (\n",
@@ -102,7 +64,7 @@
     "    ServiceContext,\n",
     "    StorageContext,\n",
     ")\n",
-    "from llama_index.vector_stores.databricks_vector_search import DatabricksVectorSearch"
+    "from llama_index.vector_stores.databricks import DatabricksVectorSearch"
    ]
   },
   {
@@ -115,15 +77,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "dd06759d-0070-48a8-aa74-3d46b12457f8",
-     "showTitle": true,
-     "title": "Load example data"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "!mkdir -p 'data/paul_graham/'\n",
@@ -140,15 +94,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "7a82b624-bffb-453b-b5c6-f8414566dc2f",
-     "showTitle": true,
-     "title": "Read the data"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# load documents\n",
@@ -172,18 +118,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "99c56854-c182-4dfe-bc08-cee8263461ee",
-     "showTitle": true,
-     "title": "Create the Databricks Vector Search endpoint"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Create a vector search endpoint\n",
@@ -203,18 +138,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {
-      "byteLimit": 2048000,
-      "rowLimit": 10000
-     },
-     "inputWidgets": {},
-     "nuid": "6abe427b-79ca-4c0c-8e58-ba5f670294ae",
-     "showTitle": true,
-     "title": "Build the index from the documents"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "# Create a vector search index\n",
@@ -231,18 +155,23 @@
     "    \"my_primary_key_name\": \"string\",\n",
     "    \"my_embedding_vector_column_name\": \"array<double>\",\n",
     "    \"text\": \"string\",  # one column must match the text_column in the DatabricksVectorSearch instance created below; this will hold the raw node text\n",
-    "    \"doc_id\": \"string\", # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
+    "    \"doc_id\": \"string\",  # one column must contain the reference document ID (this will be populated by LlamaIndex automatically)\n",
     "    # add any other metadata you may have in your nodes (Databricks Vector Search supports metadata filtering)\n",
     "    # NOTE THAT THESE FIELDS MUST BE ADDED EXPLICITLY TO BE USED FOR METADATA FILTERING\n",
     "    },\n",
     ")\n",
     "\n",
     "databricks_vector_store = DatabricksVectorSearch(\n",
-    "    index=databricks_index, text_column=\"text\",\n",
-    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
+    "    index=databricks_index,\n",
+    "    text_column=\"text\",\n",
+    "    columns=None,  # YOU MUST ALSO RECORD YOUR METADATA FIELD NAMES HERE\n",
     ")  # text_column is required for self-managed embeddings\n",
-    "storage_context = StorageContext.from_defaults(vector_store=databricks_vector_store)\n",
-    "index = VectorStoreIndex.from_documents(documents, storage_context=storage_context)"
+    "storage_context = StorageContext.from_defaults(\n",
+    "    vector_store=databricks_vector_store\n",
+    ")\n",
+    "index = VectorStoreIndex.from_documents(\n",
+    "    documents, storage_context=storage_context\n",
+    ")"
    ]
   },
   {
@@ -255,15 +184,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "metadata": {
-    "application/vnd.databricks.v1+cell": {
-     "cellMetadata": {},
-     "inputWidgets": {},
-     "nuid": "3e8c18f7-db8c-45c1-bb82-b75ad2307824",
-     "showTitle": true,
-     "title": "Query using the index"
-    }
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "query_engine = index.as_query_engine()\n",
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD
deleted file mode 100644
index 0896ca890d8bf..0000000000000
--- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/BUILD
+++ /dev/null
@@ -1,3 +0,0 @@
-poetry_requirements(
-    name="poetry",
-)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py
deleted file mode 100644
index 041494b47fb05..0000000000000
--- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from llama_index.vector_stores.databricks_vector_search.base import (
-    DatabricksVectorSearch,
-)
-
-__all__ = ["DatabricksVectorSearch"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
similarity index 100%
rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/.gitignore
rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/.gitignore
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
new file mode 100644
index 0000000000000..05444d69d26e8
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/BUILD
@@ -0,0 +1,4 @@
+poetry_requirements(
+    name="poetry",
+    module_mapping={"databricks-vectorsearch": ["databricks"]}
+)
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
similarity index 100%
rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/Makefile
rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/Makefile
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
similarity index 100%
rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/README.md
rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/README.md
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
similarity index 100%
rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/BUILD
rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/BUILD
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
new file mode 100644
index 0000000000000..3d63d6acf2fa4
--- /dev/null
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/__init__.py
@@ -0,0 +1,5 @@
+from llama_index.vector_stores.databricks.base import (
+    DatabricksVectorSearch,
+)
+
+__all__ = ["DatabricksVectorSearch"]
diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
similarity index 94%
rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py
rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
index 7178c9a18a77e..bd7f5ae328218 100644
--- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/llama_index/vector_stores/databricks-vector-search/base.py
+++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/llama_index/vector_stores/databricks/base.py
@@ -11,13 +11,13 @@
     List,
     Dict,
     Optional,
-    TYPE_CHECKING,
     cast,
 )
 from enum import Enum
 
-from pydantic import BaseModel, Field
+from databricks.vector_search.client import VectorSearchIndex
+from llama_index.core.bridge.pydantic import BaseModel, Field, PrivateAttr
 from llama_index.core.vector_stores.types import (
     BasePydanticVectorStore,
     MetadataFilters,
@@ -32,10 +32,6 @@
 
 from llama_index.core.bridge.pydantic import PrivateAttr
 
-if TYPE_CHECKING:
-    from databricks.vector_search.client import VectorSearchIndex
-
-
 class _DatabricksIndexType(str, Enum):
     DIRECT_ACCESS = "DIRECT_ACCESS"
     DELTA_SYNC = "DELTA_SYNC"
@@ -223,6 +219,12 @@ def add(
         for node in nodes:
             node_id = node.node_id
             metadata = node_to_metadata_dict(node, remove_text=True, flat_metadata=True)
+
+            metadata_columns = list(self.columns or [])  # copy so repeated adds do not mutate self.columns
+
+            # explicitly record doc_id as metadata (for delete)
+            metadata_columns.append("doc_id")
+
             entry = {
                 self._primary_key: node_id,
                 self.text_column: node.get_content(),
@@ -232,12 +234,14 @@
                     for col in filter(
                         lambda column: column
                         not in (self._primary_key, self.text_column),
-                        self.columns + ["doc_id"] or ["doc_id"],  # explicitly record doc_id as metadata (for delete)
+                        metadata_columns,
                     )
                 },
             }
             doc_id = metadata.get("doc_id")
-            self._doc_id_to_pk[doc_id] = list(set(self._doc_id_to_pk.get(doc_id, []) + [node_id]))  # associate this node_id with this doc_id
+            self._doc_id_to_pk[doc_id] = list(
+                set(self._doc_id_to_pk.get(doc_id, []) + [node_id])  # noqa: RUF005
+            )  # associate this node_id with this doc_id
             entries.append(entry)
             ids.append(node_id)
 
@@ -280,12 +284,16 @@ def delete(self, ref_doc_id: str, **delete_kwargs: Any) -> None:
         """Delete nodes with ref_doc_id.
 
         Args:
             ref_doc_id (str): The doc_id of the document to delete.
""" - primary_keys = self._doc_id_to_pk.get(ref_doc_id, None) # get the node_ids associated with the doc_id + primary_keys = self._doc_id_to_pk.get( + ref_doc_id, None + ) # get the node_ids associated with the doc_id if primary_keys is not None: self._index.delete( primary_keys=primary_keys, ) - self._doc_id_to_pk.pop(ref_doc_id) # remove this doc_id from the doc_id-to-node_id map + self._doc_id_to_pk.pop( + ref_doc_id + ) # remove this doc_id from the doc_id-to-node_id map def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: """Query index for top k most similar nodes.""" diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml similarity index 91% rename from llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml rename to llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml index 8151bb9da3521..0b45006cde64b 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks-vector-search/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-databricks/pyproject.toml @@ -9,7 +9,7 @@ skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" [tool.llamahub] contains_example = false -import_path = "llama_index.vector_stores.databricks_vector_search" +import_path = "llama_index.vector_stores.databricks" [tool.llamahub.class_authors] DatabricksVectorSearch = "NickhilN" @@ -24,7 +24,7 @@ python_version = "3.8" authors = ["Alberto Da Costa ", "Nickhil Nabar Date: Thu, 14 Mar 2024 16:40:55 -0600 Subject: [PATCH 9/9] linting --- docs/module_guides/storing/vector_stores.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/module_guides/storing/vector_stores.md b/docs/module_guides/storing/vector_stores.md index ba27da0932367..8a6a39e1a1384 100644 --- a/docs/module_guides/storing/vector_stores.md +++ b/docs/module_guides/storing/vector_stores.md @@ -23,7 +23,7 @@ We are actively adding more integrations and improving feature coverage for each | ChatGPT Retrieval Plugin | aggregator | | | ✓ | ✓ | | | Chroma | self-hosted | ✓ | | ✓ | ✓ | | | DashVector | cloud | ✓ | ✓ | ✓ | ✓ | | -| Databricks | cloud | ✓ | | ✓ | ✓ | | +| Databricks | cloud | ✓ | | ✓ | ✓ | | | Deeplake | self-hosted / cloud | ✓ | | ✓ | ✓ | | | DocArray | aggregator | ✓ | | ✓ | ✓ | | | DuckDB | in-memory / self-hosted | ✓ | | ✓ | ✓ | |