From bcb537b35a4219b7e5e77ccd472f64b081bace01 Mon Sep 17 00:00:00 2001 From: mshaw4015101 Date: Tue, 25 Jun 2024 18:25:14 +0100 Subject: [PATCH 1/3] llama_changes --- .../llama_index/vector_stores/kdbai/base.py | 106 ++++++++++++------ .../llama_index/vector_stores/kdbai/utils.py | 60 +++++++++- 2 files changed, 131 insertions(+), 35 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py index 0274c004b4074..108a52fa0214b 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py @@ -16,7 +16,12 @@ VectorStoreQuery, VectorStoreQueryResult, ) -from llama_index.vector_stores.kdbai.utils import default_sparse_encoder +from llama_index.vector_stores.kdbai.utils import ( + default_sparse_encoder_v1, + convert_metadata_col_v1, + default_sparse_encoder_v2, + convert_metadata_col_v2, +) DEFAULT_COLUMN_NAMES = ["document_id", "text", "embedding"] @@ -27,24 +32,6 @@ logger = logging.getLogger(__name__) -# MATCH THE METADATA COLUMN DATA TYPE TO ITS PYTYPE -def convert_metadata_col(column, value): - try: - if column["pytype"] == "str": - return str(value) - elif column["pytype"] == "bytes": - return value.encode("utf-8") - elif column["pytype"] == "datetime64[ns]": - return pd.to_datetime(value) - elif column["pytype"] == "timedelta64[ns]": - return pd.to_timedelta(value) - return value.astype(column["pytype"]) - except Exception as e: - logger.error( - f"Failed to convert column {column['name']} to type {column['pytype']}: {e}" - ) - - class KDBAIVectorStore(BasePydanticVectorStore): """The KDBAI Vector Store. @@ -97,7 +84,10 @@ def __init__( if hybrid_search: if sparse_encoder is None: - self._sparse_encoder = default_sparse_encoder + if kdbai.version("kdbai_client") >= '1.2.0': + self._sparse_encoder = default_sparse_encoder_v2 + else: + self._sparse_encoder = default_sparse_encoder_v1 else: self._sparse_encoder = sparse_encoder @@ -125,12 +115,32 @@ def add( Returns: List[str]: List of document IDs that were added. """ + try: + import kdbai_client as kdbai + + logger.info("KDBAI client version: " + kdbai.__version__) + + except ImportError: + raise ValueError( + "Could not import kdbai_client package." + "Please add it to the dependencies." + ) + df = pd.DataFrame() docs = [] - schema = self._table.schema()["columns"] - if self.hybrid_search: - schema = [item for item in schema if item["name"] != "sparseVectors"] + if kdbai.version("kdbai_client") >= '1.2.0': + schema = self._table.schema['schema']['c'] + types = self._table.schema['schema']['t'].decode('utf-8') + else: + schema = self._table.schema()["columns"] + + if self.hybrid_search: + if kdbai.version("kdbai_client") >= '1.2.0': + schema = [item for item in schema if item != "sparseVectors"] + else: + schema = [item for item in schema if item["name"] != "sparseVectors"] + try: for node in nodes: doc = { @@ -144,15 +154,26 @@ def add( # handle extra columns if len(schema) > len(DEFAULT_COLUMN_NAMES): - for column in schema[len(DEFAULT_COLUMN_NAMES) :]: - try: - doc[column["name"]] = convert_metadata_col( - column, node.metadata[column["name"]] - ) - except Exception as e: - logger.error( - f"Error writing column {column['name']} as type {column['pytype']}: {e}." - ) + if kdbai.version("kdbai_client") >= '1.2.0': + for column_name, column_type in zip(schema[len(DEFAULT_COLUMN_NAMES):], types[len(DEFAULT_COLUMN_NAMES):]): + try: + doc[column_name] = convert_metadata_col_v2( + column_name, column_type, node.metadata[column_name] + ) + except Exception as e: + logger.error( + f"Error writing column {column_name} as qtype {column_type}: {e}." + ) + else: + for column in schema[len(DEFAULT_COLUMN_NAMES) :]: + try: + doc[column["name"]] = convert_metadata_col_v1( + column, node.metadata[column["name"]] + ) + except Exception as e: + logger.error( + f"Error writing column {column['name']} as type {column['pytype']}: {e}." + ) docs.append(doc) @@ -171,8 +192,20 @@ def add( except Exception as e: logger.error(f"Error preparing data for KDB.AI: {e}.") - + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + + try: + import kdbai_client as kdbai + + logger.info("KDBAI client version: " + kdbai.__version__) + + except ImportError: + raise ValueError( + "Could not import kdbai_client package." + "Please add it to the dependencies." + ) + if query.filters is None: filter = [] else: @@ -180,7 +213,12 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul if self.hybrid_search: alpha = query.alpha if query.alpha is not None else 0.5 - sparse_vectors = self._sparse_encoder([query.query_str]) + + if kdbai.version("kdbai_client") >= '1.2.0': + sparse_vectors = [self._sparse_encoder([query.query_str])] + else: + sparse_vectors = self._sparse_encoder([query.query_str]) + results = self._table.hybrid_search( dense_vectors=[query.query_embedding], sparse_vectors=sparse_vectors, diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py index f690dba7861a7..56f479b18080d 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py @@ -1,7 +1,46 @@ from typing import List, Dict +import logging +import pandas as pd +logger = logging.getLogger(__name__) -def default_sparse_encoder(texts: List[str]) -> List[Dict[int, int]]: + +def default_sparse_encoder_v2(texts: List[str]) -> Dict[int, int]: + try: + from transformers import BertTokenizer + from collections import Counter + except ImportError: + raise ImportError( + "Could not import transformers library. " + 'Please install transformers with `pip install "transformers"`' + ) + + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + tokenized_texts = tokenizer(texts, padding=True, truncation=True, max_length=512)["input_ids"] + + flat_tokenized_texts = [token_id for sublist in tokenized_texts for token_id in sublist] + + sparse_encoding = dict(Counter(flat_tokenized_texts)) + return sparse_encoding + +# MATCH THE METADATA COLUMN DATA TYPE TO ITS PYTYPE +def convert_metadata_col_v2(column_name, column_type, column_value): + try: + if column_type == "s": + return str(column_value) + elif column_type == "C": + return column_value.encode("utf-8") + elif column_type == "p": + return pd.to_datetime(column_value) + elif column_type == "n": + return pd.to_timedelta(column_value) + return column_value.astype(column_type) + except Exception as e: + logger.error( + f"Failed to convert column {column_name} to qtype {column_type}: {e}" + ) + +def default_sparse_encoder_v1(texts: List[str]) -> List[Dict[int, int]]: try: from transformers import BertTokenizer from collections import Counter @@ -20,3 +59,22 @@ def default_sparse_encoder(texts: List[str]) -> List[Dict[int, int]]: sparse_encoding = dict(Counter(tokenized_text)) results.append(sparse_encoding) return results + +def convert_metadata_col_v1(column, value): + try: + if column["pytype"] == "str": + return str(value) + elif column["pytype"] == "bytes": + return value.encode("utf-8") + elif column["pytype"] == "datetime64[ns]": + return pd.to_datetime(value) + elif column["pytype"] == "timedelta64[ns]": + return pd.to_timedelta(value) + return value.astype(column["pytype"]) + except Exception as e: + logger.error( + f"Failed to convert column {column['name']} to type {column['pytype']}: {e}" + ) + + + From d316b5bd9c4cd81ff3eb7ebc27c8ee80a041ac23 Mon Sep 17 00:00:00 2001 From: mshaw4015101 Date: Wed, 26 Jun 2024 11:02:17 +0100 Subject: [PATCH 2/3] formatting changes --- .../llama_index/vector_stores/kdbai/base.py | 32 ++++++++++--------- .../llama_index/vector_stores/kdbai/utils.py | 17 ++++++---- .../pyproject.toml | 2 +- 3 files changed, 28 insertions(+), 23 deletions(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py index 108a52fa0214b..f952595f8cc09 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/base.py @@ -17,9 +17,9 @@ VectorStoreQueryResult, ) from llama_index.vector_stores.kdbai.utils import ( - default_sparse_encoder_v1, + default_sparse_encoder_v1, convert_metadata_col_v1, - default_sparse_encoder_v2, + default_sparse_encoder_v2, convert_metadata_col_v2, ) @@ -84,7 +84,7 @@ def __init__( if hybrid_search: if sparse_encoder is None: - if kdbai.version("kdbai_client") >= '1.2.0': + if kdbai.version("kdbai_client") >= "1.2.0": self._sparse_encoder = default_sparse_encoder_v2 else: self._sparse_encoder = default_sparse_encoder_v1 @@ -125,22 +125,22 @@ def add( "Could not import kdbai_client package." "Please add it to the dependencies." ) - + df = pd.DataFrame() docs = [] - if kdbai.version("kdbai_client") >= '1.2.0': - schema = self._table.schema['schema']['c'] - types = self._table.schema['schema']['t'].decode('utf-8') + if kdbai.version("kdbai_client") >= "1.2.0": + schema = self._table.schema["schema"]["c"] + types = self._table.schema["schema"]["t"].decode("utf-8") else: schema = self._table.schema()["columns"] if self.hybrid_search: - if kdbai.version("kdbai_client") >= '1.2.0': + if kdbai.version("kdbai_client") >= "1.2.0": schema = [item for item in schema if item != "sparseVectors"] else: schema = [item for item in schema if item["name"] != "sparseVectors"] - + try: for node in nodes: doc = { @@ -154,8 +154,11 @@ def add( # handle extra columns if len(schema) > len(DEFAULT_COLUMN_NAMES): - if kdbai.version("kdbai_client") >= '1.2.0': - for column_name, column_type in zip(schema[len(DEFAULT_COLUMN_NAMES):], types[len(DEFAULT_COLUMN_NAMES):]): + if kdbai.version("kdbai_client") >= "1.2.0": + for column_name, column_type in zip( + schema[len(DEFAULT_COLUMN_NAMES) :], + types[len(DEFAULT_COLUMN_NAMES) :], + ): try: doc[column_name] = convert_metadata_col_v2( column_name, column_type, node.metadata[column_name] @@ -192,9 +195,8 @@ def add( except Exception as e: logger.error(f"Error preparing data for KDB.AI: {e}.") - - def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: + def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResult: try: import kdbai_client as kdbai @@ -205,7 +207,7 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul "Could not import kdbai_client package." "Please add it to the dependencies." ) - + if query.filters is None: filter = [] else: @@ -214,7 +216,7 @@ def query(self, query: VectorStoreQuery, **kwargs: Any) -> VectorStoreQueryResul if self.hybrid_search: alpha = query.alpha if query.alpha is not None else 0.5 - if kdbai.version("kdbai_client") >= '1.2.0': + if kdbai.version("kdbai_client") >= "1.2.0": sparse_vectors = [self._sparse_encoder([query.query_str])] else: sparse_vectors = self._sparse_encoder([query.query_str]) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py index 56f479b18080d..6211f45badd4d 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/llama_index/vector_stores/kdbai/utils.py @@ -16,12 +16,16 @@ def default_sparse_encoder_v2(texts: List[str]) -> Dict[int, int]: ) tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") - tokenized_texts = tokenizer(texts, padding=True, truncation=True, max_length=512)["input_ids"] + tokenized_texts = tokenizer(texts, padding=True, truncation=True, max_length=512)[ + "input_ids" + ] - flat_tokenized_texts = [token_id for sublist in tokenized_texts for token_id in sublist] + flat_tokenized_texts = [ + token_id for sublist in tokenized_texts for token_id in sublist + ] + + return dict(Counter(flat_tokenized_texts)) - sparse_encoding = dict(Counter(flat_tokenized_texts)) - return sparse_encoding # MATCH THE METADATA COLUMN DATA TYPE TO ITS PYTYPE def convert_metadata_col_v2(column_name, column_type, column_value): @@ -40,6 +44,7 @@ def convert_metadata_col_v2(column_name, column_type, column_value): f"Failed to convert column {column_name} to qtype {column_type}: {e}" ) + def default_sparse_encoder_v1(texts: List[str]) -> List[Dict[int, int]]: try: from transformers import BertTokenizer @@ -60,6 +65,7 @@ def default_sparse_encoder_v1(texts: List[str]) -> List[Dict[int, int]]: results.append(sparse_encoding) return results + def convert_metadata_col_v1(column, value): try: if column["pytype"] == "str": @@ -75,6 +81,3 @@ def convert_metadata_col_v1(column, value): logger.error( f"Failed to convert column {column['name']} to type {column['pytype']}: {e}" ) - - - diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml index 0213d77630e73..ddf708de5a15a 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml @@ -36,7 +36,7 @@ version = "0.1.6" python = ">=3.8.1,<4.0" llama-index-core = "^0.10.0" pykx = "^2.1.1" -kdbai-client = "^0.1.2" +kdbai-client = ">=1.1.0" [tool.poetry.group.dev.dependencies] ipython = "8.10.0" From 2f6a428b130e2ac8f482e5b8f80a293bb5e6aaf7 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Wed, 26 Jun 2024 13:45:45 -0700 Subject: [PATCH 3/3] vbump --- .../llama-index-vector-stores-kdbai/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml index ddf708de5a15a..07196ef3fb1a2 100644 --- a/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml +++ b/llama-index-integrations/vector_stores/llama-index-vector-stores-kdbai/pyproject.toml @@ -30,7 +30,7 @@ exclude = ["**/BUILD"] license = "MIT" name = "llama-index-vector-stores-kdbai" readme = "README.md" -version = "0.1.6" +version = "0.1.7" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"