crate-workbench · amotl · Dec 21, 2023 · Dec 14, 2023 · amotl · Dec 16, 2023
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -65,7 +65,7 @@ jobs:
         pip install "setuptools>=64" --upgrade
 
         # Install package in editable mode.
-        pip install --use-pep517 --prefer-binary --editable=.[test,develop]
+        pip install --use-pep517 --prefer-binary --editable=.[all,develop,test]
 
     - name: Run linter and software tests
       run: |

diff --git a/CHANGES.md b/CHANGES.md
@@ -1,6 +1,7 @@
 # Changelog for Meltano/Singer Target for CrateDB
 
 ## In progress
+- Add support for container types `ARRAY`, `OBJECT`, and `FLOAT_VECTOR`.
 
 ## 2023-12-08 v0.0.1
 - Make it work. It can run the canonical Meltano GitHub -> DB example.
diff --git a/README.md b/README.md
@@ -123,6 +123,27 @@ LIMIT
 ```
 
 
+## Vector Store Support
+
+In order to support CrateDB's vector store feature, i.e. its `FLOAT_VECTOR`
+data type, you will need to install `numpy`. It has been added to an "extra"
+of the Python package, called `vector`.
+
+When installing the package using pip, this would apply:
+```
+pip install 'meltano-target-cratedb[vector]'
+```
+
+When installing the package through Meltano's project definition, this
+is probably the right way to declare it, but it has not been verified
+yet.
+```yaml
+- name: target-cratedb
+  variant: cratedb
+  pip_url: meltano-target-cratedb[vector]
+```
+
+
 ## Development
 
 In order to work on this adapter dialect on behalf of a real pipeline definition,

diff --git a/pyproject.toml b/pyproject.toml
@@ -94,10 +94,13 @@ dynamic = [
 dependencies = [
   "crate[sqlalchemy]",
   "cratedb-toolkit",
-  'importlib-resources; python_version < "3.9"',
-  "meltanolabs-target-postgres==0.0.9",
+  'importlib-resources; python_version < "3.9"', # "meltanolabs-target-postgres==0.0.9",
+  "meltanolabs-target-postgres@ git+https://github.com/singer-contrib/meltanolabs-target-postgres.git@pgvector",
 ]
 [project.optional-dependencies]
+all = [
+  "meltano-target-cratedb[vector]",
+]
 develop = [
   "black<24",
   "mypy==1.7.1",
@@ -115,6 +118,9 @@ test = [
   "pytest-cov<5",
   "pytest-mock<4",
 ]
+vector = [
+  "numpy",
+]
 [project.urls]
 changelog = "https://github.com/crate-workbench/meltano-target-cratedb/blob/main/CHANGES.md"
 documentation = "https://github.com/crate-workbench/meltano-target-cratedb"

diff --git a/target_cratedb/__init__.py b/target_cratedb/__init__.py
@@ -1,4 +1,4 @@
 """Init CrateDB."""
-from target_cratedb.patch import patch_sqlalchemy
+from target_cratedb.sqlalchemy.patch import patch_sqlalchemy
 
 patch_sqlalchemy()
diff --git a/target_cratedb/connector.py b/target_cratedb/connector.py
@@ -6,14 +6,18 @@
 from datetime import datetime
 
 import sqlalchemy
+import sqlalchemy as sa
 from crate.client.sqlalchemy.types import ObjectType, ObjectTypeImpl, _ObjectArray
 from singer_sdk import typing as th
-from sqlalchemy.dialects.postgresql import ARRAY, BIGINT
+from singer_sdk.helpers._typing import is_array_type, is_boolean_type, is_integer_type, is_number_type, is_object_type
 from sqlalchemy.types import (
+    ARRAY,
+    BIGINT,
     BOOLEAN,
     DATE,
     DATETIME,
     DECIMAL,
+    FLOAT,
     INTEGER,
     TEXT,
     TIME,
@@ -22,7 +26,8 @@
 )
 from target_postgres.connector import NOTYPE, PostgresConnector
 
-from target_cratedb.patch import polyfill_refresh_after_dml_engine
+from target_cratedb.sqlalchemy.patch import polyfill_refresh_after_dml_engine
+from target_cratedb.sqlalchemy.vector import FloatVector
 
 
 class CrateDBConnector(PostgresConnector):
@@ -111,8 +116,52 @@ def pick_individual_type(jsonschema_type: dict):
         if "object" in jsonschema_type["type"]:
             return ObjectType
         if "array" in jsonschema_type["type"]:
-            # TODO: Handle other inner-types as well?
+            # Select between different kinds of `ARRAY` data types.
+            #
+            # This currently leverages an unspecified definition for the Singer SCHEMA,
+            # using the `additionalProperties` attribute to convey additional type
+            # information, agnostic of the target database.
+            #
+            # In this case, it is about telling different kinds of `ARRAY` types apart:
+            # Either it is a vanilla `ARRAY`, to be stored into a `jsonb[]` type, or,
+            # alternatively, it can be a "vector" kind `ARRAY` of floating point
+            # numbers, effectively what pgvector is storing in its `VECTOR` type.
+            #
+            # Still, `type: "vector"` is only a surrogate label here, because other
+            # database systems may use different types for implementing the same thing,
+            # and need to translate accordingly.
+            """
+            Schema override rule in `meltano.yml`:
+
+            type: "array"
+            items:
+              type: "number"
+            additionalProperties:
+              storage:
+                type: "vector"
+                dim: 4
+
+            Produced schema annotation in `catalog.json`:
+
+            {"type": "array",
+             "items": {"type": "number"},
+             "additionalProperties": {"storage": {"type": "vector", "dim": 4}}}
+            """
+            if "additionalProperties" in jsonschema_type and "storage" in jsonschema_type["additionalProperties"]:
+                storage_properties = jsonschema_type["additionalProperties"]["storage"]
+                if "type" in storage_properties and storage_properties["type"] == "vector":
+                    # On CrateDB, use the `FloatVector` type definition from
+                    # this package's SQLAlchemy support module.
+                    return FloatVector(storage_properties["dim"])
+
+            # Discover/translate inner types.
+            inner_type = resolve_array_inner_type(jsonschema_type)
+            if inner_type is not None:
+                return ARRAY(inner_type)
+
+            # When type discovery fails, assume `TEXT`.
             return ARRAY(TEXT())
+
         if jsonschema_type.get("format") == "date-time":
             return TIMESTAMP()
         individual_type = th.to_sql_type(jsonschema_type)
@@ -139,20 +188,18 @@ def pick_best_sql_type(sql_type_array: list):
             DATE,
             TIME,
             DECIMAL,
+            FLOAT,
             BIGINT,
             INTEGER,
             BOOLEAN,
             NOTYPE,
             ARRAY,
-            ObjectType,
+            FloatVector,
+            ObjectTypeImpl,
         ]
 
         for sql_type in precedence_order:
             for obj in sql_type_array:
-                # FIXME: Workaround. Currently, ObjectType can not be resolved back to a type?
-                #        TypeError: isinstance() arg 2 must be a type, a tuple of types, or a union
-                if isinstance(sql_type, ObjectTypeImpl):
-                    return ObjectType
                 if isinstance(obj, sql_type):
                     return obj
         return TEXT()
@@ -188,6 +235,8 @@ def _get_type_sort_key(
 
             if isinstance(sql_type, _ObjectArray):
                 return 0, _len
+            if isinstance(sql_type, FloatVector):
+                return 0, _len
             if isinstance(sql_type, NOTYPE):
                 return 0, _len
 
@@ -245,3 +294,18 @@ def prepare_schema(self, schema_name: str) -> None:
         Don't emit `CREATE SCHEMA` statements to CrateDB.
         """
         pass
+
+
+def resolve_array_inner_type(jsonschema_type: dict) -> t.Union[sa.types.TypeEngine, None]:
+    """
+    Translate the `items` definition of a JSON Schema array into an SQLAlchemy type.
+
+    The argument is a JSON Schema fragment describing an array, for example
+    `{"type": "array", "items": {"type": "number"}}`.
+
+    Returns an SQLAlchemy type instance for the array elements, or `None` when
+    there is no `items` definition, or when its type is not recognized. For
+    nested arrays, the innermost resolvable element type is returned.
+    """
+    if "items" not in jsonschema_type:
+        return None
+    items = jsonschema_type["items"]
+    if is_boolean_type(items):
+        return BOOLEAN()
+    if is_number_type(items):
+        return FLOAT()
+    if is_integer_type(items):
+        return BIGINT()
+    if is_object_type(items):
+        return ObjectType()
+    if is_array_type(items):
+        # Recurse with the nested schema itself. Its `type` value is only a
+        # string or a list of type names, not a schema dict, so passing that
+        # along could never resolve an inner type.
+        return resolve_array_inner_type(items)
+    return None
diff --git a/target_cratedb/sqlalchemy/__init__.py b/target_cratedb/sqlalchemy/__init__.py
diff --git a/target_cratedb/patch.py → target_cratedb/sqlalchemy/patch.py b/target_cratedb/patch.py → target_cratedb/sqlalchemy/patch.py
@@ -1,21 +1,36 @@
 from datetime import datetime
 
 import sqlalchemy as sa
-from crate.client.sqlalchemy.dialect import TYPES_MAP, DateTime
+from _decimal import Decimal
+from crate.client.http import CrateJsonEncoder
+from crate.client.sqlalchemy.dialect import ARRAY, TYPES_MAP, DateTime
 from crate.client.sqlalchemy.types import _ObjectArray
 from sqlalchemy.sql import sqltypes
 
 
 def patch_sqlalchemy():
+    """
+    Apply all runtime patches: register data types, and adjust the JSON encoder.
+    """
+    patch_types()
+    patch_json_encoder()
+
+
+def patch_types():
     """
-    Register missing timestamp data type.
+    Register missing data types, and fix erroneous ones.
 
     TODO: Upstream to crate-python.
     """
-    # TODO: Submit patch to `crate-python`.
+    TYPES_MAP["bigint"] = sqltypes.BIGINT
+    TYPES_MAP["bigint_array"] = ARRAY(sqltypes.BIGINT)
+    TYPES_MAP["long"] = sqltypes.BIGINT
+    TYPES_MAP["long_array"] = ARRAY(sqltypes.BIGINT)
+    TYPES_MAP["real"] = sqltypes.DOUBLE
+    TYPES_MAP["real_array"] = ARRAY(sqltypes.DOUBLE)
     TYPES_MAP["timestamp without time zone"] = sqltypes.TIMESTAMP
     TYPES_MAP["timestamp with time zone"] = sqltypes.TIMESTAMP
 
+    # TODO: Can `ARRAY` be inherited from PostgreSQL's
+    #       `ARRAY`, to make type checking work?
+
     def as_generic(self):
         return sqltypes.ARRAY
 
@@ -36,6 +51,23 @@ def process(value):
     DateTime.bind_processor = bind_processor
 
 
+def patch_json_encoder():
+    """
+    Make `CrateJsonEncoder` serialize `Decimal` values as floating point numbers.
+
+    `Decimal` types have been rendered as strings. All other values are
+    delegated to the original `default` implementation.
+
+    TODO: Upstream to crate-python.
+    """
+
+    json_encoder_default = CrateJsonEncoder.default
+
+    def default(self, o):
+        if isinstance(o, Decimal):
+            return float(o)
+        # The saved original was looked up on the class, so it is a plain
+        # function: the encoder instance must be passed along explicitly.
+        return json_encoder_default(self, o)
+
+    CrateJsonEncoder.default = default
+
+
 def polyfill_refresh_after_dml_engine(engine: sa.Engine):
     def receive_after_execute(
         conn: sa.engine.Connection, clauseelement, multiparams, params, execution_options, result