Skip to content

Commit

Permalink
Vector: Add support for CrateDB's FLOAT_VECTOR data type: `FloatVec…
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Dec 21, 2023
1 parent c5a4020 commit 544b434
Show file tree
Hide file tree
Showing 6 changed files with 180 additions and 2 deletions.
5 changes: 5 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@

## Unreleased

- Added support for CrateDB's [FLOAT_VECTOR] data type. For SQLAlchemy
column definitions, you can use it like `FloatVector(dimensions=1024)`.

[FLOAT_VECTOR]: https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector


## 2023/09/29 0.34.0

Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,9 @@ dependencies = [
"verlib2==0.2",
]
[project.optional-dependencies]
all = [
"sqlalchemy-cratedb[vector]",
]
develop = [
"black<24",
"mypy<1.8",
Expand All @@ -114,6 +117,9 @@ test = [
"pytest-cov<5",
"pytest-mock<4",
]
vector = [
"numpy",
]
[project.urls]
changelog = "https://github.com/crate-workbench/sqlalchemy-cratedb/blob/main/CHANGES.md"
documentation = "https://github.com/crate-workbench/sqlalchemy-cratedb"
Expand Down
6 changes: 6 additions & 0 deletions src/sqlalchemy_cratedb/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,12 @@ def visit_ARRAY(self, type_, **kw):
def visit_OBJECT(self, type_, **kw):
    """Return the type name `OBJECT` for the given type; `type_` and `kw` are not inspected."""
    return "OBJECT"

def visit_FLOAT_VECTOR(self, type_, **kw):
    """
    Render the type expression for a `FLOAT_VECTOR` column.

    :param type_: Type object exposing a `dimensions` attribute.
    :param kw: Additional compiler keyword arguments; not used here.
    :return: A string of the form `FLOAT_VECTOR(<dimensions>)`.
    :raises ValueError: When `type_.dimensions` is `None`.
    """
    size = type_.dimensions
    if size is not None:
        return f"FLOAT_VECTOR({size})"
    # The rendered type always carries a dimension size, so refuse to emit one without it.
    raise ValueError("FloatVector must be initialized with dimension size")


class CrateCompiler(compiler.SQLCompiler):

Expand Down
5 changes: 3 additions & 2 deletions src/sqlalchemy_cratedb/dialect.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
)
from crate.client.exceptions import TimezoneUnawareException
from .sa_version import SA_VERSION, SA_1_4, SA_2_0
from .type import ObjectArray, ObjectType
from .type import FloatVector, ObjectArray, ObjectType

TYPES_MAP = {
"boolean": sqltypes.Boolean,
Expand All @@ -51,7 +51,8 @@
"float": sqltypes.Float,
"real": sqltypes.Float,
"string": sqltypes.String,
"text": sqltypes.String
"text": sqltypes.String,
"float_vector": FloatVector,
}
try:
# SQLAlchemy >= 1.1
Expand Down
1 change: 1 addition & 0 deletions src/sqlalchemy_cratedb/type/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .array import ObjectArray
from .geo import Geopoint, Geoshape
from .object import ObjectType
from .vector import FloatVector
159 changes: 159 additions & 0 deletions src/sqlalchemy_cratedb/type/vector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
"""
## About
SQLAlchemy data type implementation for CrateDB's `FLOAT_VECTOR` type.
## References
- https://crate.io/docs/crate/reference/en/master/general/ddl/data-types.html#float-vector
- https://crate.io/docs/crate/reference/en/master/general/builtins/scalar-functions.html#scalar-knn-match
## Details
The implementation is based on SQLAlchemy's `TypeDecorator`, and also
offers compiler support.
## Notes
CrateDB currently only supports the similarity function `VectorSimilarityFunction.EUCLIDEAN`.
-- https://github.com/crate/crate/blob/5.5.1/server/src/main/java/io/crate/types/FloatVectorType.java#L55
On the other hand, pgvector uses a comparator to apply different similarity
functions as operators, see `pgvector.sqlalchemy.Vector.comparator_factory`.
<->: l2/euclidean_distance
<#>: max_inner_product
<=>: cosine_distance
## Backlog
- The type implementation might want to be accompanied by corresponding support
for the `KNN_MATCH` function, similar to what the dialect already offers for
fulltext search through its `Match` predicate.
## Origin
This module is based on the corresponding pgvector implementation
by Andrew Kane. Thank you.
The MIT License (MIT)
Copyright (c) 2021-2023 Andrew Kane
https://github.com/pgvector/pgvector-python
"""
import typing as t

if t.TYPE_CHECKING:
import numpy.typing as npt

import sqlalchemy as sa

__all__ = ["FloatVector"]


def from_db(value: t.Optional[t.Iterable]) -> t.Optional["npt.ArrayLike"]:
    """
    Convert a vector value received from the database into a NumPy array.

    :param value: The raw value; `None`, an existing `numpy.ndarray`,
                  or any iterable accepted by `numpy.array`.
    :return: `None` or the given `ndarray` unchanged, otherwise a new
             `numpy.ndarray` with dtype `float32`.
    """
    # NumPy is imported lazily, at call time rather than at module import time.
    import numpy as np

    # NOTE: The return annotation is a string on purpose. `npt` is only
    # imported under `t.TYPE_CHECKING`, so evaluating it at definition
    # time would raise a `NameError`.

    # from `pgvector.utils`
    # could be ndarray if already cast by lower-level driver
    if value is None or isinstance(value, np.ndarray):
        return value

    return np.array(value, dtype=np.float32)


def to_db(value: t.Any, dim: t.Optional[int] = None) -> t.Optional[t.List]:
    """
    Prepare a vector value for submission to the database.

    :param value: `None`, a one-dimensional numeric `numpy.ndarray`, or any
                  other sized value, which is passed through as-is.
    :param dim: Expected number of elements; no length check when `None`.
    :return: `None` for `None`, a list for `ndarray` input, otherwise the
             given value unchanged.
    :raises ValueError: For an `ndarray` that is not one-dimensional or not of
                        integer/floating dtype, or when the length differs
                        from `dim`.
    """
    import numpy as np

    # from `pgvector.utils`
    if value is None:
        return None

    if isinstance(value, np.ndarray):
        if value.ndim != 1:
            raise ValueError("expected ndim to be 1")

        numeric = any(
            np.issubdtype(value.dtype, kind) for kind in (np.integer, np.floating)
        )
        if not numeric:
            raise ValueError("dtype must be numeric")

        value = value.tolist()

    # Without an expected size there is nothing left to validate.
    if dim is None:
        return value

    if len(value) != dim:
        raise ValueError("expected %d dimensions, not %d" % (dim, len(value)))

    return value


# NOTE(review): subscripting `sa.TypeDecorator` presumably requires the generic
# type support of SQLAlchemy 2.x -- confirm against the supported versions.
class FloatVector(sa.TypeDecorator[t.Sequence[float]]):
    """
    SQLAlchemy column type for CrateDB's `FLOAT_VECTOR` data type.

    Use it in column definitions like `FloatVector(dimensions=1024)`.

    ## Background

    An improved implementation of the `FloatVector` data type for CrateDB,
    compared to the previous implementation on behalf of the LangChain adapter.

    The previous implementation, based on SQLAlchemy's `UserDefinedType`, didn't
    respect the `python_type` property on backward/reverse resolution of types.
    This was observed on Meltano's database connector machinery doing a
    type cast, which led to a `NotImplementedError`.

        typing.cast(type, sql_type.python_type) => NotImplementedError

    The `UserDefinedType` approach is easier to implement, because it doesn't
    need compiler support.

    To get full SQLAlchemy type support, including support for forward- and
    backward resolution / type casting, the custom data type should derive
    from SQLAlchemy's `TypeEngine` base class instead.

    When deriving from `TypeEngine`, you will need to set the `__visit_name__`
    attribute, and add a corresponding visitor method to the `CrateTypeCompiler`,
    in this case, `visit_FLOAT_VECTOR`.

    Now, rendering a DDL succeeds. However, reflecting the DDL schema back
    doesn't work until you establish a corresponding reverse type mapping.

    By invoking `SELECT DISTINCT(data_type) FROM information_schema.columns;`,
    you will find out that the internal type name is `float_vector`, so you
    announce it to the dialect using `TYPES_MAP["float_vector"] = FloatVector`.

    Still not there: `NotImplementedError: Default TypeEngine.as_generic() heuristic
    method was unsuccessful for target_cratedb.sqlalchemy.vector.FloatVector. A
    custom as_generic() method must be implemented for this type class.`

    So, as it signals that the type implementation also needs an `as_generic`
    method, let's supply one, returning `sqltypes.ARRAY`.

    It looks like, in exchange for those improvements, the `get_col_spec`
    method is not needed any longer.

    TODO: Would it be a good idea to derive from SQLAlchemy's
          `ARRAY` right away, to get a few of the features without
          the need to redefine them?

    Please note the outcome of this analysis and the corresponding implementation
    has been derived from empirical observations, and from the feeling that we also
    lack corresponding support on the other special data types of CrateDB (ARRAY and
    OBJECT) within the SQLAlchemy dialect, i.e. "that something must be wrong or
    incomplete". In this spirit, it is advisable to review and improve their
    implementations correspondingly.
    """

    # Opt out of SQLAlchemy's cache-key generation for this type.
    cache_ok = False

    # Selects the `visit_FLOAT_VECTOR` method of the type compiler.
    __visit_name__ = "FLOAT_VECTOR"

    # Presumably mirrors the corresponding attributes of `sa.ARRAY` -- TODO confirm.
    _is_array = True

    zero_indexes = False

    # The underlying implementation type this decorator wraps.
    impl = sa.ARRAY

    def __init__(self, dimensions: t.Optional[int] = None):
        """
        :param dimensions: Number of vector elements. It is forwarded, together
                           with `sa.FLOAT` as item type, to the base constructor.
        """
        super().__init__(sa.FLOAT, dimensions=dimensions)

    def as_generic(self, allow_nulltype: bool = False):
        """
        Return the generic counterpart of this type.

        `allow_nulltype` is accepted for signature compatibility with
        `TypeEngine.as_generic` and is otherwise ignored.
        """
        # NOTE(review): this returns the `sa.ARRAY` class, not an instance.
        # Verify whether callers expect an instantiated type here.
        return sa.ARRAY

    def bind_processor(self, dialect: "sa.Dialect") -> t.Callable:
        """Return a callable converting bound parameter values using `to_db`."""

        def process(value: t.Iterable) -> t.Optional[t.List]:
            # `self.dimensions` is presumably proxied to the wrapped `sa.ARRAY`
            # implementation, which received it in `__init__`.
            return to_db(value, self.dimensions)

        return process

    def result_processor(self, dialect: "sa.Dialect", coltype: t.Any) -> t.Callable:
        """Return a callable converting result values using `from_db`."""

        # The `npt` annotation is quoted because `numpy.typing` is only imported
        # under `t.TYPE_CHECKING`. Unquoted, defining `process` would raise a
        # `NameError` each time this method is invoked.
        def process(value: t.Any) -> t.Optional["npt.ArrayLike"]:
            return from_db(value)

        return process

0 comments on commit 544b434

Please sign in to comment.