Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SQLAlchemy FloatVector type implementation #14

Merged
merged 1 commit into from
Dec 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion target_cratedb/connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,7 +137,7 @@ def pick_individual_type(jsonschema_type: dict):
if "type" in storage_properties and storage_properties["type"] == "vector":
# On CrateDB, use the `FloatVector` type definition, which maps
# to the database's `FLOAT_VECTOR` column type.
return FloatVector(storage_properties["dim"])
return FloatVector(dimensions=storage_properties["dim"])

# Discover/translate inner types.
inner_type = resolve_array_inner_type(jsonschema_type)
Expand Down
41 changes: 12 additions & 29 deletions target_cratedb/sqlalchemy/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
import sqlalchemy as sa
from crate.client.sqlalchemy.compiler import CrateTypeCompiler
from crate.client.sqlalchemy.dialect import TYPES_MAP
from sqlalchemy import TypeDecorator
from sqlalchemy.sql import sqltypes
from sqlalchemy.sql.type_api import TypeEngine

__all__ = ["FloatVector"]

Expand Down Expand Up @@ -41,7 +41,8 @@ def to_db(value: t.Any, dim: t.Optional[int] = None) -> t.Optional[t.List]:
return value


class FloatVector(TypeEngine[t.Sequence[t.Any]]):
class FloatVector(TypeDecorator[t.Sequence[float]]):

"""
An improved implementation of the `FloatVector` data type for CrateDB,
compared to the previous implementation written for the LangChain adapter.
Expand Down Expand Up @@ -96,32 +97,25 @@ class FloatVector(TypeEngine[t.Sequence[t.Any]]):
implementations correspondingly.
"""

cache_ok = True
cache_ok = False

__visit_name__ = "FLOAT_VECTOR"

_is_array = True

zero_indexes = False

def __init__(self, dim: t.Optional[int] = None, as_tuple: bool = False) -> None:
self.dim = dim
self.as_tuple = as_tuple

@property
def hashable(self):
return self.as_tuple
impl = sa.ARRAY

@property
def python_type(self):
return list
def __init__(self, dimensions: int = None):
super().__init__(sa.FLOAT, dimensions=dimensions)

def as_generic(self):
return sqltypes.ARRAY

def bind_processor(self, dialect: sa.Dialect) -> t.Callable:
def process(value: t.Iterable) -> t.Optional[t.List]:
return to_db(value, self.dim)
return to_db(value, self.dimensions)

return process

Expand All @@ -131,27 +125,16 @@ def process(value: t.Any) -> t.Optional[npt.ArrayLike]:

return process

"""
CrateDB currently only supports the similarity function `VectorSimilarityFunction.EUCLIDEAN`.
-- https://github.com/crate/crate/blob/1ca5c6dbb2/server/src/main/java/io/crate/types/FloatVectorType.java#L55
On the other hand, pgvector uses a comparator to apply different similarity functions as operators,
see `pgvector.sqlalchemy.Vector.comparator_factory`.
<->: l2/euclidean_distance
<#>: max_inner_product
<=>: cosine_distance
TODO: Discuss.
""" # noqa: E501


# Accompanies the type definition for reverse type lookups.
TYPES_MAP["float_vector"] = FloatVector


def visit_FLOAT_VECTOR(self, type_, **kw):
return f"FLOAT_VECTOR({type_.dim})"
dimensions = type_.dimensions
if dimensions is None:
raise ValueError("FloatVector must be initialized with dimension size")
return f"FLOAT_VECTOR({dimensions})"


CrateTypeCompiler.visit_FLOAT_VECTOR = visit_FLOAT_VECTOR