From 544b43470c36c918c02737f7c9fcb5a0783422cb Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 21 Dec 2023 15:10:33 +0100 Subject: [PATCH] Vector: Add support for CrateDB's `FLOAT_VECTOR` data type: `FloatVector` https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector --- CHANGES.md | 5 + pyproject.toml | 6 + src/sqlalchemy_cratedb/compiler.py | 6 + src/sqlalchemy_cratedb/dialect.py | 5 +- src/sqlalchemy_cratedb/type/__init__.py | 1 + src/sqlalchemy_cratedb/type/vector.py | 159 ++++++++++++++++++++++++ 6 files changed, 180 insertions(+), 2 deletions(-) create mode 100644 src/sqlalchemy_cratedb/type/vector.py diff --git a/CHANGES.md b/CHANGES.md index dfb97015..65ac05b6 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -3,6 +3,11 @@ ## Unreleased +- Added support for CrateDB's [FLOAT_VECTOR] data type. For SQLAlchemy + column definitions, you can use it like `FloatVector(dimensions=1024)`. + +[FLOAT_VECTOR]: https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector + ## 2023/09/29 0.34.0 diff --git a/pyproject.toml b/pyproject.toml index 5bd43255..6335a918 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -91,6 +91,9 @@ dependencies = [ "verlib2==0.2", ] [project.optional-dependencies] +all = [ + "sqlalchemy-cratedb[vector]", +] develop = [ "black<24", "mypy<1.8", @@ -114,6 +117,9 @@ test = [ "pytest-cov<5", "pytest-mock<4", ] +vector = [ + "numpy", +] [project.urls] changelog = "https://github.com/crate-workbench/sqlalchemy-cratedb/blob/main/CHANGES.md" documentation = "https://github.com/crate-workbench/sqlalchemy-cratedb" diff --git a/src/sqlalchemy_cratedb/compiler.py b/src/sqlalchemy_cratedb/compiler.py index 07106b87..d3a66188 100644 --- a/src/sqlalchemy_cratedb/compiler.py +++ b/src/sqlalchemy_cratedb/compiler.py @@ -238,6 +238,12 @@ def visit_ARRAY(self, type_, **kw): def visit_OBJECT(self, type_, **kw): return "OBJECT" + def visit_FLOAT_VECTOR(self, type_, **kw): + dimensions = 
type_.dimensions + if dimensions is None: + raise ValueError("FloatVector must be initialized with dimension size") + return f"FLOAT_VECTOR({dimensions})" + class CrateCompiler(compiler.SQLCompiler): diff --git a/src/sqlalchemy_cratedb/dialect.py b/src/sqlalchemy_cratedb/dialect.py index aebad9c2..73f6c539 100644 --- a/src/sqlalchemy_cratedb/dialect.py +++ b/src/sqlalchemy_cratedb/dialect.py @@ -33,7 +33,7 @@ ) from crate.client.exceptions import TimezoneUnawareException from .sa_version import SA_VERSION, SA_1_4, SA_2_0 -from .type import ObjectArray, ObjectType +from .type import FloatVector, ObjectArray, ObjectType TYPES_MAP = { "boolean": sqltypes.Boolean, @@ -51,7 +51,8 @@ "float": sqltypes.Float, "real": sqltypes.Float, "string": sqltypes.String, - "text": sqltypes.String + "text": sqltypes.String, + "float_vector": FloatVector, } try: # SQLAlchemy >= 1.1 diff --git a/src/sqlalchemy_cratedb/type/__init__.py b/src/sqlalchemy_cratedb/type/__init__.py index 8e78f7da..5bd871dc 100644 --- a/src/sqlalchemy_cratedb/type/__init__.py +++ b/src/sqlalchemy_cratedb/type/__init__.py @@ -1,3 +1,4 @@ from .array import ObjectArray from .geo import Geopoint, Geoshape from .object import ObjectType +from .vector import FloatVector diff --git a/src/sqlalchemy_cratedb/type/vector.py b/src/sqlalchemy_cratedb/type/vector.py new file mode 100644 index 00000000..9cde7ea2 --- /dev/null +++ b/src/sqlalchemy_cratedb/type/vector.py @@ -0,0 +1,159 @@ +""" +## About +SQLAlchemy data type implementation for CrateDB's `FLOAT_VECTOR` type. + +## References +- https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector +- https://crate.io/docs/crate/reference/en/master/general/builtins/scalar-functions.html#scalar-knn-match + +## Details +The implementation is based on SQLAlchemy's `TypeDecorator`, and also +offers compiler support. + +## Notes +CrateDB currently only supports the similarity function `VectorSimilarityFunction.EUCLIDEAN`. 
"""
## About
SQLAlchemy data type implementation for CrateDB's `FLOAT_VECTOR` type.

## References
- https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector
- https://crate.io/docs/crate/reference/en/master/general/builtins/scalar-functions.html#scalar-knn-match

## Details
The implementation is based on SQLAlchemy's `TypeDecorator`, and also
offers compiler support.

## Notes
CrateDB currently only supports the similarity function `VectorSimilarityFunction.EUCLIDEAN`.
-- https://github.com/crate/crate/blob/5.5.1/server/src/main/java/io/crate/types/FloatVectorType.java#L55

On the other hand, pgvector uses a comparator to apply different similarity
functions as operators, see `pgvector.sqlalchemy.Vector.comparator_factory`.

<->: l2/euclidean_distance
<#>: max_inner_product
<=>: cosine_distance

## Backlog
- The type implementation might want to be accompanied by corresponding support
  for the `KNN_MATCH` function, similar to what the dialect already offers for
  fulltext search through its `Match` predicate.

## Origin
This module is based on the corresponding pgvector implementation
by Andrew Kane. Thank you.

The MIT License (MIT)
Copyright (c) 2021-2023 Andrew Kane
https://github.com/pgvector/pgvector-python
"""
import typing as t

if t.TYPE_CHECKING:
    import numpy.typing as npt

import sqlalchemy as sa

__all__ = ["FloatVector"]


def from_db(value: t.Optional[t.Iterable]) -> t.Optional["npt.ArrayLike"]:
    """
    Convert a vector value received from the database into a NumPy array.

    Adapted from `pgvector.utils`.

    :param value: The raw column value, or `None`.
    :return: `None` and `numpy.ndarray` inputs are returned unchanged; any
             other value is converted to a one-shot `float32` NumPy array.
    """
    # NumPy is an optional dependency (the `vector` extra), so it is only
    # imported when a value actually needs to be converted.
    import numpy as np

    # The return annotation above is quoted on purpose: `npt` is only imported
    # under `TYPE_CHECKING`, so an eagerly evaluated annotation would raise
    # `NameError` as soon as this function is defined.

    # Could be ndarray if already cast by lower-level driver.
    if value is None or isinstance(value, np.ndarray):
        return value

    return np.array(value, dtype=np.float32)


def to_db(value: t.Any, dim: t.Optional[int] = None) -> t.Optional[t.List]:
    """
    Validate a vector value and prepare it for use as a bind parameter.

    Adapted from `pgvector.utils`.

    :param value: `None`, a one-dimensional numeric `numpy.ndarray`, or any
                  other sized value (e.g. a list), which is passed through.
    :param dim: Expected number of vector elements; `None` skips the check.
    :return: `None` for `None`, a plain list for NumPy arrays, otherwise the
             value itself.
    :raises ValueError: If a NumPy array is not one-dimensional or not of an
                        integer/floating dtype, or if the number of elements
                        does not match `dim`.
    """
    # NumPy is an optional dependency (the `vector` extra), see `from_db`.
    import numpy as np

    if value is None:
        return value

    if isinstance(value, np.ndarray):
        if value.ndim != 1:
            raise ValueError("expected ndim to be 1")

        if not np.issubdtype(value.dtype, np.integer) and not np.issubdtype(value.dtype, np.floating):
            raise ValueError("dtype must be numeric")

        # Hand a plain Python list to the driver instead of an ndarray.
        value = value.tolist()

    if dim is not None and len(value) != dim:
        raise ValueError("expected %d dimensions, not %d" % (dim, len(value)))

    return value


# The base class is deliberately not subscripted (`TypeDecorator[...]`) and the
# dialect annotations below use `sa.engine.Dialect`: the generic form and the
# top-level `sa.Dialect` name are SQLAlchemy 2.0 additions, while the dialect
# module of this package also imports `SA_1_4`, i.e. still targets 1.4.
class FloatVector(sa.TypeDecorator):
    """
    SQLAlchemy data type for CrateDB's `FLOAT_VECTOR` column type.

    In column definitions, use it like `FloatVector(dimensions=1024)`.
    Bind values are validated and converted by `to_db`, result values are
    converted by `from_db`.

    ## Background
    An improved implementation of the `FloatVector` data type for CrateDB,
    compared to the previous implementation on behalf of the LangChain adapter.

    The previous implementation, based on SQLAlchemy's `UserDefinedType`, didn't
    respect the `python_type` property on backward/reverse resolution of types.
    This was observed on Meltano's database connector machinery doing a
    type cast, which led to a `NotImplementedError`.

        typing.cast(type, sql_type.python_type) => NotImplementedError

    The `UserDefinedType` approach is easier to implement, because it doesn't
    need compiler support.

    To get full SQLAlchemy type support, including support for forward- and
    backward resolution / type casting, the custom data type should derive
    from SQLAlchemy's `TypeEngine` base class instead (here: by means of
    `TypeDecorator`).

    When deriving from `TypeEngine`, you will need to set the `__visit_name__`
    attribute, and add a corresponding visitor method to the `CrateTypeCompiler`,
    in this case, `visit_FLOAT_VECTOR`.

    Now, rendering a DDL succeeds. However, when reflecting the DDL schema back,
    it doesn't work until you establish a corresponding reverse type mapping.

    By invoking `SELECT DISTINCT(data_type) FROM information_schema.columns;`,
    you will find out that the internal type name is `float_vector`, so you
    announce it to the dialect using `TYPES_MAP["float_vector"] = FloatVector`.

    Still not there: `NotImplementedError: Default TypeEngine.as_generic() heuristic
    method was unsuccessful for target_cratedb.sqlalchemy.vector.FloatVector. A
    custom as_generic() method must be implemented for this type class.`

    So, as it signals that the type implementation also needs an `as_generic`
    method, let's supply one, returning a generic `sqltypes.ARRAY` of floats.

    It looks like, in exchange for those improvements, the `get_col_spec`
    method is not needed any longer.

    TODO: Would it be a good idea to derive from SQLAlchemy's
          `ARRAY` right away, to get a few of the features without
          the need to redefine them?

    Please note the outcome of this analysis and the corresponding implementation
    has been derived from empirical observations, and from the feeling that we also
    lack corresponding support on the other special data types of CrateDB (ARRAY and
    OBJECT) within the SQLAlchemy dialect, i.e. "that something must be wrong or
    incomplete". In this spirit, it is advisable to review and improve their
    implementations correspondingly.
    """

    # Statements using this type are explicitly marked as not safe to cache.
    cache_ok = False

    # Routes DDL rendering to the type compiler's `visit_FLOAT_VECTOR` method.
    __visit_name__ = "FLOAT_VECTOR"

    # NOTE(review): the next two attributes mirror attributes of `sa.ARRAY`;
    # presumably SQLAlchemy internals consult them -- confirm they are needed.
    _is_array = True

    zero_indexes = False

    impl = sa.ARRAY

    def __init__(self, dimensions: t.Optional[int] = None):
        """
        :param dimensions: Number of vector elements. The type compiler refuses
                           to render DDL when this is `None`.
        """
        # The arguments are forwarded to the `impl` type, so the value ends up
        # as `dimensions` on the underlying `sa.ARRAY` instance, from where
        # `self.dimensions` / `type_.dimensions` read it back.
        super().__init__(sa.FLOAT, dimensions=dimensions)

    def as_generic(self, allow_nulltype: bool = False) -> sa.types.TypeEngine:
        """
        Return a generic SQLAlchemy type *instance* standing in for this type.

        `dimensions` is intentionally not forwarded: on `sa.ARRAY` it denotes
        the nesting depth of the array, not the number of vector elements.

        :param allow_nulltype: Accepted for signature compatibility with
                               `TypeEngine.as_generic`; not used.
        """
        return sa.ARRAY(sa.FLOAT)

    def bind_processor(self, dialect: sa.engine.Dialect) -> t.Callable:
        """
        Return the converter applied to bind parameter values, see `to_db`.
        """

        def process(value: t.Iterable) -> t.Optional[t.List]:
            return to_db(value, self.dimensions)

        return process

    def result_processor(self, dialect: sa.engine.Dialect, coltype: t.Any) -> t.Callable:
        """
        Return the converter applied to result column values, see `from_db`.
        """

        # Quoted annotation: `npt` is only available while type checking.
        def process(value: t.Any) -> t.Optional["npt.ArrayLike"]:
            return from_db(value)

        return process