From c561201550126fe9644d144fee8b8301c0dd6f88 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Thu, 5 Dec 2024 17:09:22 +0000 Subject: [PATCH 01/14] Initial structuring of config files --- sample.datasets.toml | 22 +++++++++++++++++++++- src/matchbox/admin.py | 4 +--- src/matchbox/common/db.py | 11 ++++++++++- src/matchbox/server/postgresql/adapter.py | 1 + 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/sample.datasets.toml b/sample.datasets.toml index e0dda57e..585cce7b 100644 --- a/sample.datasets.toml +++ b/sample.datasets.toml @@ -11,21 +11,41 @@ database = "pg_warehouse" db_schema = "companieshouse" db_table = "companies" db_pk = "id" +index = [ + { name = "crn", type = "text", literal = "crn" }, + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.data_hub_companies] database = "pg_warehouse" db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" +index = [ + { name = "cdms", type = "text", literal = "cdms" }, + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.hmrc_exporters] database = "pg_warehouse" db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" +index = [ + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.export_wins] database = "pg_warehouse" db_schema = "dbt" db_table = "export_wins__wins_dataset" -db_pk = "id" \ No newline at end of file +db_pk = "id" +index = [ + { name = "company_name", type = "text", literal = "company_name" }, + { name = "cdms", type = "text", literal = "cdms" }, + { name = "dh_id", type = "text", literal = "data_hub_company_id" }, + { name = "postcode", type = "text", literal = "postcode" } +] diff --git a/src/matchbox/admin.py b/src/matchbox/admin.py index 1dc6c246..a2067594 100644 --- a/src/matchbox/admin.py +++ b/src/matchbox/admin.py @@ -7,9 +7,7 @@ from matchbox.common.db import SourceWarehouse from matchbox.server import MatchboxDBAdapter, inject_backend -from matchbox.server.base import ( - Source, -) +from matchbox.server.base import Source logger = logging.getLogger("mb_logic") diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 61b56693..4b98b5ec 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -116,6 +116,14 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou return warehouse +class SourceColumn(BaseModel): + """A column in a dataset that can be indexed in the Matchbox database.""" + + name: str + type: str | None = None + literal: str | None = None + + class Source(BaseModel): """A dataset that can be indexed in the Matchbox database.""" @@ -123,10 +131,11 @@ class Source(BaseModel): populate_by_name=True, ) - database: SourceWarehouse | None = None + database: SourceWarehouse db_pk: str db_schema: str db_table: str + index: list[SourceColumn | bytes] def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 54dbe451..3a80da61 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -358,6 +358,7 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, + index=None, 
database=SourceWarehouse.from_engine(engine), ) else: From e22f7c6251136b36ababb97cafc7ea83f2507ad1 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 15:59:54 +0000 Subject: [PATCH 02/14] Working ingestion of Source objects with configured columns --- sample.datasets.toml | 25 ++++---- src/matchbox/common/db.py | 132 ++++++++++++++++++++++++++++++++++++-- uv.lock | 90 ++++++++++++++------------ 3 files changed, 188 insertions(+), 59 deletions(-) diff --git a/sample.datasets.toml b/sample.datasets.toml index 585cce7b..343e5a2e 100644 --- a/sample.datasets.toml +++ b/sample.datasets.toml @@ -12,9 +12,9 @@ db_schema = "companieshouse" db_table = "companies" db_pk = "id" index = [ - { name = "crn", type = "text", literal = "crn" }, - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "crn", type = "text" }, + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" } ] [datasets.data_hub_companies] @@ -23,9 +23,10 @@ db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" index = [ - { name = "cdms", type = "text", literal = "cdms" }, - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "cdms", type = "text" }, + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, + { literal = "*" } ] [datasets.hmrc_exporters] @@ -34,8 +35,8 @@ db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" index = [ - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, ] [datasets.export_wins] @@ -44,8 +45,8 @@ db_schema = "dbt" db_table = "export_wins__wins_dataset" db_pk = "id" index = [ - { name = "company_name", type = "text", literal = "company_name" }, - { name = "cdms", type = "text", literal = "cdms" }, - { name = "dh_id", type = "text", literal = "data_hub_company_id" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, + { literal = "cdms", type = "text" }, + { literal = "data_hub_company_id", alias = "dh_id", type = "text" }, ] diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 4b98b5ec..033b0c6c 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -6,7 +6,12 @@ from matchbox.common.hash import HASH_FUNC from pandas import DataFrame from pyarrow import Table as ArrowTable -from pydantic import BaseModel, ConfigDict, Field +from pydantic import ( + BaseModel, + ConfigDict, + Field, + model_validator, +) from sqlalchemy import ( LABEL_STYLE_TABLENAME_PLUS_COL, ColumnElement, @@ -119,9 +124,64 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou class SourceColumn(BaseModel): """A column in a dataset that can be indexed in the Matchbox database.""" - name: str - type: str | None = None - literal: str | None = None + model_config = ConfigDict(arbitrary_types_allowed=True) + + literal: str = Field(description="The literal name of the column in the database.") + alias: str = Field( + default_factory=lambda data: data["literal"], + description="The alias to use when hashing the dataset in Matchbox.", + ) + type: str | None = Field( + default=None, description="The type to cast the column to before 
hashing data." + ) + indexed: bool = Field(description="Whether the column is indexed in the database.") + + def __eq__(self, other: object) -> bool: + """Compare SourceColumn with another SourceColumn or bytes object. + + Two SourceColumns are equal if: + + * Their literal names match, or + * Their alias names match, or + * The hash of either their literal or alias matches the other object's + corresponding hash + + A SourceColumn is equal to a bytes object if: + + * The hash of either its literal or alias matches the bytes object + + Args: + other: Another SourceColumn or a bytes object to compare against + + Returns: + bool: True if the objects are considered equal, False otherwise + """ + literal_hash = HASH_FUNC(str(self.literal).encode("utf-8")).digest() + alias_hash = HASH_FUNC(str(self.alias).encode("utf-8")).digest() + + if isinstance(other, SourceColumn): + if self.literal == other.literal or self.alias == other.alias: + return True + + other_literal_hash = HASH_FUNC(str(other.literal).encode("utf-8")).digest() + other_alias_hash = HASH_FUNC(str(other.alias).encode("utf-8")).digest() + + self_hashes = {literal_hash, alias_hash} + other_hashes = {other_literal_hash, other_alias_hash} + + return bool(self_hashes & other_hashes) + + if isinstance(other, bytes): + return other in {literal_hash, alias_hash} + + return NotImplemented + + +class SourceIndex(BaseModel): + """The hashes of column names in the Matchbox database.""" + + literal: list[bytes] + alias: list[bytes] class Source(BaseModel): @@ -135,7 +195,7 @@ class Source(BaseModel): db_pk: str db_schema: str db_table: str - index: list[SourceColumn | bytes] + db_columns: list[SourceColumn] def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" @@ -145,6 +205,68 @@ def __hash__(self) -> int: (type(self), self.db_pk, self.db_schema, self.db_table, self.database.alias) ) + @model_validator(mode="before") + @classmethod + def hash_columns(cls, data: dict[str, Any]) -> "Source": + """Shapes indices data from either the backend or TOML. 
+ + Handles: + * From TOML, no columns specified + * From TOML, some or all columns specified + * From the database, indices already present + """ + # Database setup + if isinstance(data["database"], SourceWarehouse): + warehouse = data["database"] + else: + warehouse = SourceWarehouse(**data["database"]) + metadata = MetaData(schema=data["db_schema"]) + table = Table(data["db_table"], metadata, autoload_with=warehouse.engine) + + # Column logic + # Get all locally specified columns, or remotely specified hashes + local_columns: list[SourceColumn] = [] + local_hashes: SourceIndex | None = None + select_all = False + if isinstance(data["index"], dict): + # Came from Matchbox database + local_hashes = SourceIndex(**data["index"]) + else: + # Came from TOML + for column in data["index"]: + if column["literal"] == "*": + select_all = True + continue + local_columns.append(SourceColumn(**column, indexed=True)) + continue + + # Get all remote columns using the user's creds and merge with local spec + remote_columns = [ + SourceColumn(literal=col.name, type=str(col.type), indexed=select_all) + for col in table.columns + if col.name not in data["db_pk"] + ] + db_columns: list[SourceColumn] = [] + for remote_column in remote_columns: + if local_columns: + # Came from TOML, index and alias are configured from TOML + for local_column in local_columns: + if remote_column == local_column: + if local_column.type is None: + local_column.type = remote_column.type + db_columns.append(local_column) + break + else: + db_columns.append(remote_column) + elif local_hashes: + # Came from database, index is true when hashes match + if remote_column in local_hashes.literal + local_hashes.alias: + remote_column.indexed = True + db_columns.append(remote_column) + + data["db_columns"] = db_columns + return data + def to_table(self) -> Table: """Returns the dataset as a SQLAlchemy Table object.""" metadata = MetaData(schema=self.db_schema) diff --git a/uv.lock b/uv.lock index 2fdc60eb..8e9e11a1 100644 --- a/uv.lock +++ b/uv.lock @@ -1394,63 +1394,69 @@ wheels = [ [[package]] name = "pydantic" -version = "2.9.2" +version = "2.10.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a9/b7/d9e3f12af310e1120c21603644a1cd86f59060e040ec5c3a80b8f05fae30/pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f", size = 769917 } +sdist = { url = "https://files.pythonhosted.org/packages/45/0f/27908242621b14e649a84e62b133de45f84c255eecb350ab02979844a788/pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9", size = 786486 } wheels = [ - { url = "https://files.pythonhosted.org/packages/df/e4/ba44652d562cbf0bf320e0f3810206149c8a4e99cdbf66da82e97ab53a15/pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12", size = 434928 }, + { url = "https://files.pythonhosted.org/packages/62/51/72c18c55cf2f46ff4f91ebcc8f75aa30f7305f3d726be3f4ebffb4ae972b/pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d", size = 456997 }, ] [[package]] name = "pydantic-core" -version = "2.23.4" +version = "2.27.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/e2/aa/6b6a9b9f8537b872f552ddd46dd3da230367754b6f707b8e1e963f515ea3/pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863", size = 402156 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/30/890a583cd3f2be27ecf32b479d5d615710bb926d92da03e3f7838ff3e58b/pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8", size = 1865160 }, - { url = "https://files.pythonhosted.org/packages/1d/9a/b634442e1253bc6889c87afe8bb59447f106ee042140bd57680b3b113ec7/pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d", size = 1776777 }, - { url = "https://files.pythonhosted.org/packages/75/9a/7816295124a6b08c24c96f9ce73085032d8bcbaf7e5a781cd41aa910c891/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e", size = 1799244 }, - { url = "https://files.pythonhosted.org/packages/a9/8f/89c1405176903e567c5f99ec53387449e62f1121894aa9fc2c4fdc51a59b/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607", size = 1805307 }, - { url = "https://files.pythonhosted.org/packages/d5/a5/1a194447d0da1ef492e3470680c66048fef56fc1f1a25cafbea4bc1d1c48/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd", size = 2000663 }, - { url = "https://files.pythonhosted.org/packages/13/a5/1df8541651de4455e7d587cf556201b4f7997191e110bca3b589218745a5/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea", size = 2655941 }, - { url = "https://files.pythonhosted.org/packages/44/31/a3899b5ce02c4316865e390107f145089876dff7e1dfc770a231d836aed8/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e", size = 2052105 }, - { url = "https://files.pythonhosted.org/packages/1b/aa/98e190f8745d5ec831f6d5449344c48c0627ac5fed4e5340a44b74878f8e/pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b", size = 1919967 }, - { url = "https://files.pythonhosted.org/packages/ae/35/b6e00b6abb2acfee3e8f85558c02a0822e9a8b2f2d812ea8b9079b118ba0/pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0", size = 1964291 }, - { url = "https://files.pythonhosted.org/packages/13/46/7bee6d32b69191cd649bbbd2361af79c472d72cb29bb2024f0b6e350ba06/pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64", size = 2109666 }, - { url = "https://files.pythonhosted.org/packages/39/ef/7b34f1b122a81b68ed0a7d0e564da9ccdc9a2924c8d6c6b5b11fa3a56970/pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f", size = 1732940 }, - { url = 
"https://files.pythonhosted.org/packages/2f/76/37b7e76c645843ff46c1d73e046207311ef298d3f7b2f7d8f6ac60113071/pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3", size = 1916804 }, - { url = "https://files.pythonhosted.org/packages/74/7b/8e315f80666194b354966ec84b7d567da77ad927ed6323db4006cf915f3f/pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231", size = 1856459 }, - { url = "https://files.pythonhosted.org/packages/14/de/866bdce10ed808323d437612aca1ec9971b981e1c52e5e42ad9b8e17a6f6/pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee", size = 1770007 }, - { url = "https://files.pythonhosted.org/packages/dc/69/8edd5c3cd48bb833a3f7ef9b81d7666ccddd3c9a635225214e044b6e8281/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87", size = 1790245 }, - { url = "https://files.pythonhosted.org/packages/80/33/9c24334e3af796ce80d2274940aae38dd4e5676298b4398eff103a79e02d/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8", size = 1801260 }, - { url = "https://files.pythonhosted.org/packages/a5/6f/e9567fd90104b79b101ca9d120219644d3314962caa7948dd8b965e9f83e/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327", size = 1996872 }, - { url = "https://files.pythonhosted.org/packages/2d/ad/b5f0fe9e6cfee915dd144edbd10b6e9c9c9c9d7a56b69256d124b8ac682e/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2", size = 2661617 }, - { url = "https://files.pythonhosted.org/packages/06/c8/7d4b708f8d05a5cbfda3243aad468052c6e99de7d0937c9146c24d9f12e9/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36", size = 2071831 }, - { url = "https://files.pythonhosted.org/packages/89/4d/3079d00c47f22c9a9a8220db088b309ad6e600a73d7a69473e3a8e5e3ea3/pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126", size = 1917453 }, - { url = "https://files.pythonhosted.org/packages/e9/88/9df5b7ce880a4703fcc2d76c8c2d8eb9f861f79d0c56f4b8f5f2607ccec8/pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e", size = 1968793 }, - { url = "https://files.pythonhosted.org/packages/e3/b9/41f7efe80f6ce2ed3ee3c2dcfe10ab7adc1172f778cc9659509a79518c43/pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24", size = 2116872 }, - { url = "https://files.pythonhosted.org/packages/63/08/b59b7a92e03dd25554b0436554bf23e7c29abae7cce4b1c459cd92746811/pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84", size = 1738535 }, - { url = 
"https://files.pythonhosted.org/packages/88/8d/479293e4d39ab409747926eec4329de5b7129beaedc3786eca070605d07f/pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9", size = 1917992 }, - { url = "https://files.pythonhosted.org/packages/ad/ef/16ee2df472bf0e419b6bc68c05bf0145c49247a1095e85cee1463c6a44a1/pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc", size = 1856143 }, - { url = "https://files.pythonhosted.org/packages/da/fa/bc3dbb83605669a34a93308e297ab22be82dfb9dcf88c6cf4b4f264e0a42/pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd", size = 1770063 }, - { url = "https://files.pythonhosted.org/packages/4e/48/e813f3bbd257a712303ebdf55c8dc46f9589ec74b384c9f652597df3288d/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05", size = 1790013 }, - { url = "https://files.pythonhosted.org/packages/b4/e0/56eda3a37929a1d297fcab1966db8c339023bcca0b64c5a84896db3fcc5c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d", size = 1801077 }, - { url = "https://files.pythonhosted.org/packages/04/be/5e49376769bfbf82486da6c5c1683b891809365c20d7c7e52792ce4c71f3/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510", size = 1996782 }, - { url = "https://files.pythonhosted.org/packages/bc/24/e3ee6c04f1d58cc15f37bcc62f32c7478ff55142b7b3e6d42ea374ea427c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6", size = 2661375 }, - { url = "https://files.pythonhosted.org/packages/c1/f8/11a9006de4e89d016b8de74ebb1db727dc100608bb1e6bbe9d56a3cbbcce/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b", size = 2071635 }, - { url = "https://files.pythonhosted.org/packages/7c/45/bdce5779b59f468bdf262a5bc9eecbae87f271c51aef628d8c073b4b4b4c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327", size = 1916994 }, - { url = "https://files.pythonhosted.org/packages/d8/fa/c648308fe711ee1f88192cad6026ab4f925396d1293e8356de7e55be89b5/pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6", size = 1968877 }, - { url = "https://files.pythonhosted.org/packages/16/16/b805c74b35607d24d37103007f899abc4880923b04929547ae68d478b7f4/pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f", size = 2116814 }, - { url = "https://files.pythonhosted.org/packages/d1/58/5305e723d9fcdf1c5a655e6a4cc2a07128bf644ff4b1d98daf7a9dbf57da/pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769", size = 1738360 }, - { url = 
"https://files.pythonhosted.org/packages/a5/ae/e14b0ff8b3f48e02394d8acd911376b7b66e164535687ef7dc24ea03072f/pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5", size = 1919411 }, +sdist = { url = "https://files.pythonhosted.org/packages/a6/9f/7de1f19b6aea45aeb441838782d68352e71bfa98ee6fa048d5041991b33e/pydantic_core-2.27.1.tar.gz", hash = "sha256:62a763352879b84aa31058fc931884055fd75089cccbd9d58bb6afd01141b235", size = 412785 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/39/46fe47f2ad4746b478ba89c561cafe4428e02b3573df882334bd2964f9cb/pydantic_core-2.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac3b20653bdbe160febbea8aa6c079d3df19310d50ac314911ed8cc4eb7f8cb8", size = 1895553 }, + { url = "https://files.pythonhosted.org/packages/1c/00/0804e84a78b7fdb394fff4c4f429815a10e5e0993e6ae0e0b27dd20379ee/pydantic_core-2.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a5a8e19d7c707c4cadb8c18f5f60c843052ae83c20fa7d44f41594c644a1d330", size = 1807220 }, + { url = "https://files.pythonhosted.org/packages/01/de/df51b3bac9820d38371f5a261020f505025df732ce566c2a2e7970b84c8c/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f7059ca8d64fea7f238994c97d91f75965216bcbe5f695bb44f354893f11d52", size = 1829727 }, + { url = "https://files.pythonhosted.org/packages/5f/d9/c01d19da8f9e9fbdb2bf99f8358d145a312590374d0dc9dd8dbe484a9cde/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed0f8a0eeea9fb72937ba118f9db0cb7e90773462af7962d382445f3005e5a4", size = 1854282 }, + { url = "https://files.pythonhosted.org/packages/5f/84/7db66eb12a0dc88c006abd6f3cbbf4232d26adfd827a28638c540d8f871d/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3cb37038123447cf0f3ea4c74751f6a9d7afef0eb71aa07bf5f652b5e6a132c", size = 2037437 }, + { url = "https://files.pythonhosted.org/packages/34/ac/a2537958db8299fbabed81167d58cc1506049dba4163433524e06a7d9f4c/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84286494f6c5d05243456e04223d5a9417d7f443c3b76065e75001beb26f88de", size = 2780899 }, + { url = "https://files.pythonhosted.org/packages/4a/c1/3e38cd777ef832c4fdce11d204592e135ddeedb6c6f525478a53d1c7d3e5/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acc07b2cfc5b835444b44a9956846b578d27beeacd4b52e45489e93276241025", size = 2135022 }, + { url = "https://files.pythonhosted.org/packages/7a/69/b9952829f80fd555fe04340539d90e000a146f2a003d3fcd1e7077c06c71/pydantic_core-2.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4fefee876e07a6e9aad7a8c8c9f85b0cdbe7df52b8a9552307b09050f7512c7e", size = 1987969 }, + { url = "https://files.pythonhosted.org/packages/05/72/257b5824d7988af43460c4e22b63932ed651fe98804cc2793068de7ec554/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:258c57abf1188926c774a4c94dd29237e77eda19462e5bb901d88adcab6af919", size = 1994625 }, + { url = "https://files.pythonhosted.org/packages/73/c3/78ed6b7f3278a36589bcdd01243189ade7fc9b26852844938b4d7693895b/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:35c14ac45fcfdf7167ca76cc80b2001205a8d5d16d80524e13508371fb8cdd9c", size = 2090089 }, + { url = 
"https://files.pythonhosted.org/packages/8d/c8/b4139b2f78579960353c4cd987e035108c93a78371bb19ba0dc1ac3b3220/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d1b26e1dff225c31897696cab7d4f0a315d4c0d9e8666dbffdb28216f3b17fdc", size = 2142496 }, + { url = "https://files.pythonhosted.org/packages/3e/f8/171a03e97eb36c0b51981efe0f78460554a1d8311773d3d30e20c005164e/pydantic_core-2.27.1-cp311-none-win32.whl", hash = "sha256:2cdf7d86886bc6982354862204ae3b2f7f96f21a3eb0ba5ca0ac42c7b38598b9", size = 1811758 }, + { url = "https://files.pythonhosted.org/packages/6a/fe/4e0e63c418c1c76e33974a05266e5633e879d4061f9533b1706a86f77d5b/pydantic_core-2.27.1-cp311-none-win_amd64.whl", hash = "sha256:3af385b0cee8df3746c3f406f38bcbfdc9041b5c2d5ce3e5fc6637256e60bbc5", size = 1980864 }, + { url = "https://files.pythonhosted.org/packages/50/fc/93f7238a514c155a8ec02fc7ac6376177d449848115e4519b853820436c5/pydantic_core-2.27.1-cp311-none-win_arm64.whl", hash = "sha256:81f2ec23ddc1b476ff96563f2e8d723830b06dceae348ce02914a37cb4e74b89", size = 1864327 }, + { url = "https://files.pythonhosted.org/packages/be/51/2e9b3788feb2aebff2aa9dfbf060ec739b38c05c46847601134cc1fed2ea/pydantic_core-2.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9cbd94fc661d2bab2bc702cddd2d3370bbdcc4cd0f8f57488a81bcce90c7a54f", size = 1895239 }, + { url = "https://files.pythonhosted.org/packages/7b/9e/f8063952e4a7d0127f5d1181addef9377505dcce3be224263b25c4f0bfd9/pydantic_core-2.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f8c4718cd44ec1580e180cb739713ecda2bdee1341084c1467802a417fe0f02", size = 1805070 }, + { url = "https://files.pythonhosted.org/packages/2c/9d/e1d6c4561d262b52e41b17a7ef8301e2ba80b61e32e94520271029feb5d8/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15aae984e46de8d376df515f00450d1522077254ef6b7ce189b38ecee7c9677c", size = 1828096 }, + { url = "https://files.pythonhosted.org/packages/be/65/80ff46de4266560baa4332ae3181fffc4488ea7d37282da1a62d10ab89a4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ba5e3963344ff25fc8c40da90f44b0afca8cfd89d12964feb79ac1411a260ac", size = 1857708 }, + { url = "https://files.pythonhosted.org/packages/d5/ca/3370074ad758b04d9562b12ecdb088597f4d9d13893a48a583fb47682cdf/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:992cea5f4f3b29d6b4f7f1726ed8ee46c8331c6b4eed6db5b40134c6fe1768bb", size = 2037751 }, + { url = "https://files.pythonhosted.org/packages/b1/e2/4ab72d93367194317b99d051947c071aef6e3eb95f7553eaa4208ecf9ba4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0325336f348dbee6550d129b1627cb8f5351a9dc91aad141ffb96d4937bd9529", size = 2733863 }, + { url = "https://files.pythonhosted.org/packages/8a/c6/8ae0831bf77f356bb73127ce5a95fe115b10f820ea480abbd72d3cc7ccf3/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7597c07fbd11515f654d6ece3d0e4e5093edc30a436c63142d9a4b8e22f19c35", size = 2161161 }, + { url = "https://files.pythonhosted.org/packages/f1/f4/b2fe73241da2429400fc27ddeaa43e35562f96cf5b67499b2de52b528cad/pydantic_core-2.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bbd5d8cc692616d5ef6fbbbd50dbec142c7e6ad9beb66b78a96e9c16729b089", size = 1993294 }, + { url = 
"https://files.pythonhosted.org/packages/77/29/4bb008823a7f4cc05828198153f9753b3bd4c104d93b8e0b1bfe4e187540/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:dc61505e73298a84a2f317255fcc72b710b72980f3a1f670447a21efc88f8381", size = 2001468 }, + { url = "https://files.pythonhosted.org/packages/f2/a9/0eaceeba41b9fad851a4107e0cf999a34ae8f0d0d1f829e2574f3d8897b0/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:e1f735dc43da318cad19b4173dd1ffce1d84aafd6c9b782b3abc04a0d5a6f5bb", size = 2091413 }, + { url = "https://files.pythonhosted.org/packages/d8/36/eb8697729725bc610fd73940f0d860d791dc2ad557faaefcbb3edbd2b349/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f4e5658dbffe8843a0f12366a4c2d1c316dbe09bb4dfbdc9d2d9cd6031de8aae", size = 2154735 }, + { url = "https://files.pythonhosted.org/packages/52/e5/4f0fbd5c5995cc70d3afed1b5c754055bb67908f55b5cb8000f7112749bf/pydantic_core-2.27.1-cp312-none-win32.whl", hash = "sha256:672ebbe820bb37988c4d136eca2652ee114992d5d41c7e4858cdd90ea94ffe5c", size = 1833633 }, + { url = "https://files.pythonhosted.org/packages/ee/f2/c61486eee27cae5ac781305658779b4a6b45f9cc9d02c90cb21b940e82cc/pydantic_core-2.27.1-cp312-none-win_amd64.whl", hash = "sha256:66ff044fd0bb1768688aecbe28b6190f6e799349221fb0de0e6f4048eca14c16", size = 1986973 }, + { url = "https://files.pythonhosted.org/packages/df/a6/e3f12ff25f250b02f7c51be89a294689d175ac76e1096c32bf278f29ca1e/pydantic_core-2.27.1-cp312-none-win_arm64.whl", hash = "sha256:9a3b0793b1bbfd4146304e23d90045f2a9b5fd5823aa682665fbdaf2a6c28f3e", size = 1883215 }, + { url = "https://files.pythonhosted.org/packages/0f/d6/91cb99a3c59d7b072bded9959fbeab0a9613d5a4935773c0801f1764c156/pydantic_core-2.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f216dbce0e60e4d03e0c4353c7023b202d95cbaeff12e5fd2e82ea0a66905073", size = 1895033 }, + { url = "https://files.pythonhosted.org/packages/07/42/d35033f81a28b27dedcade9e967e8a40981a765795c9ebae2045bcef05d3/pydantic_core-2.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a2e02889071850bbfd36b56fd6bc98945e23670773bc7a76657e90e6b6603c08", size = 1807542 }, + { url = "https://files.pythonhosted.org/packages/41/c2/491b59e222ec7e72236e512108ecad532c7f4391a14e971c963f624f7569/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42b0e23f119b2b456d07ca91b307ae167cc3f6c846a7b169fca5326e32fdc6cf", size = 1827854 }, + { url = "https://files.pythonhosted.org/packages/e3/f3/363652651779113189cefdbbb619b7b07b7a67ebb6840325117cc8cc3460/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:764be71193f87d460a03f1f7385a82e226639732214b402f9aa61f0d025f0737", size = 1857389 }, + { url = "https://files.pythonhosted.org/packages/5f/97/be804aed6b479af5a945daec7538d8bf358d668bdadde4c7888a2506bdfb/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c00666a3bd2f84920a4e94434f5974d7bbc57e461318d6bb34ce9cdbbc1f6b2", size = 2037934 }, + { url = "https://files.pythonhosted.org/packages/42/01/295f0bd4abf58902917e342ddfe5f76cf66ffabfc57c2e23c7681a1a1197/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ccaa88b24eebc0f849ce0a4d09e8a408ec5a94afff395eb69baf868f5183107", size = 2735176 }, + { url = 
"https://files.pythonhosted.org/packages/9d/a0/cd8e9c940ead89cc37812a1a9f310fef59ba2f0b22b4e417d84ab09fa970/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c65af9088ac534313e1963443d0ec360bb2b9cba6c2909478d22c2e363d98a51", size = 2160720 }, + { url = "https://files.pythonhosted.org/packages/73/ae/9d0980e286627e0aeca4c352a60bd760331622c12d576e5ea4441ac7e15e/pydantic_core-2.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206b5cf6f0c513baffaeae7bd817717140770c74528f3e4c3e1cec7871ddd61a", size = 1992972 }, + { url = "https://files.pythonhosted.org/packages/bf/ba/ae4480bc0292d54b85cfb954e9d6bd226982949f8316338677d56541b85f/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:062f60e512fc7fff8b8a9d680ff0ddaaef0193dba9fa83e679c0c5f5fbd018bc", size = 2001477 }, + { url = "https://files.pythonhosted.org/packages/55/b7/e26adf48c2f943092ce54ae14c3c08d0d221ad34ce80b18a50de8ed2cba8/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:a0697803ed7d4af5e4c1adf1670af078f8fcab7a86350e969f454daf598c4960", size = 2091186 }, + { url = "https://files.pythonhosted.org/packages/ba/cc/8491fff5b608b3862eb36e7d29d36a1af1c945463ca4c5040bf46cc73f40/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:58ca98a950171f3151c603aeea9303ef6c235f692fe555e883591103da709b23", size = 2154429 }, + { url = "https://files.pythonhosted.org/packages/78/d8/c080592d80edd3441ab7f88f865f51dae94a157fc64283c680e9f32cf6da/pydantic_core-2.27.1-cp313-none-win32.whl", hash = "sha256:8065914ff79f7eab1599bd80406681f0ad08f8e47c880f17b416c9f8f7a26d05", size = 1833713 }, + { url = "https://files.pythonhosted.org/packages/83/84/5ab82a9ee2538ac95a66e51f6838d6aba6e0a03a42aa185ad2fe404a4e8f/pydantic_core-2.27.1-cp313-none-win_amd64.whl", hash = "sha256:ba630d5e3db74c79300d9a5bdaaf6200172b107f263c98a0539eeecb857b2337", size = 1987897 }, + { url = "https://files.pythonhosted.org/packages/df/c3/b15fb833926d91d982fde29c0624c9f225da743c7af801dace0d4e187e71/pydantic_core-2.27.1-cp313-none-win_arm64.whl", hash = "sha256:45cf8588c066860b623cd11c4ba687f8d7175d5f7ef65f7129df8a394c502de5", size = 1882983 }, ] [[package]] From 9fde1e929a935d35f1d463924d4eeec9960fcf4e Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 16:22:03 +0000 Subject: [PATCH 03/14] Added hashing method to column names --- src/matchbox/common/db.py | 43 ++++++++++++++--------- src/matchbox/server/postgresql/adapter.py | 1 - src/matchbox/server/postgresql/orm.py | 3 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 033b0c6c..c810b380 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -6,12 +6,7 @@ from matchbox.common.hash import HASH_FUNC from pandas import DataFrame from pyarrow import Table as ArrowTable -from pydantic import ( - BaseModel, - ConfigDict, - Field, - model_validator, -) +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from sqlalchemy import ( LABEL_STYLE_TABLENAME_PLUS_COL, ColumnElement, @@ -121,13 +116,26 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou return warehouse +class SourceColumnName(BaseModel): + """A column name in the Matchbox database.""" + + name: str + + @property + def hash(self) -> bytes: + """Generate a unique hash based on the column name.""" + return HASH_FUNC(self.name.encode("utf-8")).digest() + + class SourceColumn(BaseModel): 
"""A column in a dataset that can be indexed in the Matchbox database.""" model_config = ConfigDict(arbitrary_types_allowed=True) - literal: str = Field(description="The literal name of the column in the database.") - alias: str = Field( + literal: SourceColumnName = Field( + description="The literal name of the column in the database." + ) + alias: SourceColumnName = Field( default_factory=lambda data: data["literal"], description="The alias to use when hashing the dataset in Matchbox.", ) @@ -156,26 +164,27 @@ def __eq__(self, other: object) -> bool: Returns: bool: True if the objects are considered equal, False otherwise """ - literal_hash = HASH_FUNC(str(self.literal).encode("utf-8")).digest() - alias_hash = HASH_FUNC(str(self.alias).encode("utf-8")).digest() - if isinstance(other, SourceColumn): if self.literal == other.literal or self.alias == other.alias: return True - other_literal_hash = HASH_FUNC(str(other.literal).encode("utf-8")).digest() - other_alias_hash = HASH_FUNC(str(other.alias).encode("utf-8")).digest() - - self_hashes = {literal_hash, alias_hash} - other_hashes = {other_literal_hash, other_alias_hash} + self_hashes = {self.literal.hash, self.alias.hash} + other_hashes = {other.literal.hash, other.alias.hash} return bool(self_hashes & other_hashes) if isinstance(other, bytes): - return other in {literal_hash, alias_hash} + return other in {self.literal.hash, self.alias.hash} return NotImplemented + @field_validator("literal", "alias", mode="before") + def string_to_name(cls: "SourceColumn", value: str) -> SourceColumnName: + if isinstance(value, str): + return SourceColumnName(name=value) + else: + raise ValueError("Column name must be a string.") + class SourceIndex(BaseModel): """The hashes of column names in the Matchbox database.""" diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 3a80da61..54dbe451 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -358,7 +358,6 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, - index=None, database=SourceWarehouse.from_engine(engine), ) else: diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py index 8f1c6674..fa9a61fa 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -9,7 +9,7 @@ ForeignKey, select, ) -from sqlalchemy.dialects.postgresql import ARRAY, BYTEA +from sqlalchemy.dialects.postgresql import ARRAY, BYTEA, JSONB from sqlalchemy.orm import Session, relationship from matchbox.server.postgresql.db import MBDB @@ -155,6 +155,7 @@ class Sources(CountMixin, MBDB.MatchboxBase): schema = Column(VARCHAR, nullable=False) table = Column(VARCHAR, nullable=False) id = Column(VARCHAR, nullable=False) + indices = Column(JSONB, nullable=False) # Relationships dataset_model = relationship("Models", back_populates="source") From bf2afc084477fdd9dea3790652b6c76358b4f298 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 17:05:53 +0000 Subject: [PATCH 04/14] Honoured ordering of columns in settings to give total control of hash creation --- src/matchbox/admin.py | 2 +- src/matchbox/common/db.py | 32 +++++++++++++------ src/matchbox/server/postgresql/orm.py | 1 + .../server/postgresql/utils/insert.py | 5 +++ 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/matchbox/admin.py b/src/matchbox/admin.py index 
a2067594..585ebc1c 100644 --- a/src/matchbox/admin.py +++ b/src/matchbox/admin.py @@ -32,7 +32,7 @@ def load_datasets_from_config(datasets: Path) -> dict[str, Source]: for dataset_name, dataset_config in config["datasets"].items(): warehouse_alias = dataset_config.get("database") dataset_config["database"] = warehouses[warehouse_alias] - datasets[dataset_name] = Source(**dataset_config) + datasets[dataset_name] = Source(alias=dataset_name, **dataset_config) return datasets diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index c810b380..4ded824b 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -201,6 +201,9 @@ class Source(BaseModel): ) database: SourceWarehouse + alias: str = Field( + default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" + ) db_pk: str db_schema: str db_table: str @@ -235,8 +238,8 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": # Column logic # Get all locally specified columns, or remotely specified hashes local_columns: list[SourceColumn] = [] + star_index: int | None = None local_hashes: SourceIndex | None = None - select_all = False if isinstance(data["index"], dict): # Came from Matchbox database local_hashes = SourceIndex(**data["index"]) @@ -244,18 +247,20 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": # Came from TOML for column in data["index"]: if column["literal"] == "*": - select_all = True + star_index = len(local_columns) continue local_columns.append(SourceColumn(**column, indexed=True)) - continue # Get all remote columns using the user's creds and merge with local spec remote_columns = [ - SourceColumn(literal=col.name, type=str(col.type), indexed=select_all) + SourceColumn(literal=col.name, type=str(col.type), indexed=False) for col in table.columns if col.name not in data["db_pk"] ] db_columns: list[SourceColumn] = [] + db_indexed_columns: list[SourceColumn] = [] + db_non_indexed_columns: list[SourceColumn] = [] + for remote_column in remote_columns: if local_columns: # Came from TOML, index and alias are configured from TOML @@ -263,16 +268,26 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": if remote_column == local_column: if local_column.type is None: local_column.type = remote_column.type - db_columns.append(local_column) + db_indexed_columns.append(local_column) break else: - db_columns.append(remote_column) + db_non_indexed_columns.append(remote_column) elif local_hashes: # Came from database, index is true when hashes match if remote_column in local_hashes.literal + local_hashes.alias: remote_column.indexed = True db_columns.append(remote_column) + if local_columns: + # Concatenate with TOML order, honouring star location (if present) + if star_index: + db_columns = db_indexed_columns + for c in db_non_indexed_columns: + c.indexed = True + db_columns.insert(star_index, c) + else: + db_columns = db_indexed_columns + db_non_indexed_columns + data["db_columns"] = db_columns return data @@ -320,9 +335,8 @@ def _get_column(col_name: str) -> ColumnElement: def to_hash(self) -> bytes: """Generate a unique hash based on the table's columns and datatypes.""" - table = self.to_table() - schema_representation = f"{str(self)}: " + ",".join( - f"{col.name}:{str(col.type)}" for col in table.columns + schema_representation = f"{self.alias}: " + ",".join( + f"{col.alias.name}:{col.type}" for col in self.db_columns if col.indexed ) return HASH_FUNC(schema_representation.encode("utf-8")).digest() diff --git a/src/matchbox/server/postgresql/orm.py 
b/src/matchbox/server/postgresql/orm.py index fa9a61fa..fc54871e 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -152,6 +152,7 @@ class Sources(CountMixin, MBDB.MatchboxBase): model = Column( BYTEA, ForeignKey("models.hash", ondelete="CASCADE"), primary_key=True ) + alias = Column(VARCHAR, nullable=False, unique=True) schema = Column(VARCHAR, nullable=False) table = Column(VARCHAR, nullable=False) id = Column(VARCHAR, nullable=False) diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index c58abd84..87eb1410 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -47,9 +47,14 @@ def insert_dataset(dataset: Source, engine: Engine, batch_size: int) -> None: source_data = { "model": model_hash, + "alias": dataset.alias, "schema": dataset.db_schema, "table": dataset.db_table, "id": dataset.db_pk, + "indices": { + "literal": [c.literal.hash for c in dataset.db_columns if c.indexed], + "alias": [c.alias.hash for c in dataset.db_columns if c.indexed], + }, } clusters = dataset_to_hashlist(dataset=dataset, model_hash=model_hash) From 98bd13d6cfad03c47fd364deb3e4bba962f8c420 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 17:52:48 +0000 Subject: [PATCH 05/14] Saving and loading hashes from the database successfully --- src/matchbox/common/db.py | 25 ++++++++++++++----- src/matchbox/common/hash.py | 10 +++----- src/matchbox/server/postgresql/adapter.py | 8 ++++++ .../server/postgresql/utils/insert.py | 4 +-- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 4ded824b..23f608b4 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -1,3 +1,4 @@ +import base64 from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload import connectorx as cx @@ -126,6 +127,11 @@ def hash(self) -> bytes: """Generate a unique hash based on the column name.""" return HASH_FUNC(self.name.encode("utf-8")).digest() + @property + def base64(self) -> str: + """Generate a base64 encoded hash based on the column name.""" + return base64.b64encode(self.hash).decode("utf-8") + class SourceColumn(BaseModel): """A column in a dataset that can be indexed in the Matchbox database.""" @@ -201,13 +207,13 @@ class Source(BaseModel): ) database: SourceWarehouse - alias: str = Field( - default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" - ) db_pk: str db_schema: str db_table: str db_columns: list[SourceColumn] + alias: str = Field( + default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" + ) def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" @@ -240,12 +246,13 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": local_columns: list[SourceColumn] = [] star_index: int | None = None local_hashes: SourceIndex | None = None - if isinstance(data["index"], dict): + index_data: dict | list | None = data.get("index") + if isinstance(index_data, dict): # Came from Matchbox database - local_hashes = SourceIndex(**data["index"]) + local_hashes = SourceIndex(**index_data) else: # Came from TOML - for column in data["index"]: + for column in index_data or []: if column["literal"] == "*": star_index = len(local_columns) continue @@ -288,6 +295,12 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": else: db_columns = db_indexed_columns + db_non_indexed_columns + if not local_columns and not 
local_hashes: + # No columns specified, index all columns except the primary key + for col in remote_columns: + col.indexed = True + db_columns = remote_columns + data["db_columns"] = db_columns return data diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py index 4ce9c072..c9440d21 100644 --- a/src/matchbox/common/hash.py +++ b/src/matchbox/common/hash.py @@ -21,16 +21,14 @@ def dataset_to_hashlist(dataset: Source, model_hash: bytes) -> list[dict[str, An """Retrieve and hash a dataset from its warehouse, ready to be inserted.""" with Session(dataset.database.engine) as warehouse_session: source_table = dataset.to_table() - - # Exclude the primary key from the columns to be hashed - cols = tuple( - [col for col in list(source_table.c.keys()) if col != dataset.db_pk] + cols_to_index = tuple( + [col.literal.name for col in dataset.db_columns if col.indexed] ) slct_stmt = select( - func.concat(*source_table.c[cols]).label("raw"), + func.concat(*source_table.c[cols_to_index]).label("raw"), func.array_agg(source_table.c[dataset.db_pk].cast(String)).label("id"), - ).group_by(*source_table.c[cols]) + ).group_by(*source_table.c[cols_to_index]) raw_result = warehouse_session.execute(slct_stmt) diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 54dbe451..6b906307 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -1,3 +1,4 @@ +import base64 from typing import TYPE_CHECKING, Any, Literal from pydantic import BaseModel @@ -354,10 +355,17 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: .first() ) if dataset: + dataset_indices: dict[str, list[bytes]] = {} + for index_type, index_b64_list in dataset.indices.items(): + dataset_indices[index_type] = [ + base64.b64decode(b64.encode("utf-8")) for b64 in index_b64_list + ] return Source( + alias=dataset.alias, db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, + db_columns=dataset_indices, database=SourceWarehouse.from_engine(engine), ) else: diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index 87eb1410..99b841c1 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -52,8 +52,8 @@ def insert_dataset(dataset: Source, engine: Engine, batch_size: int) -> None: "table": dataset.db_table, "id": dataset.db_pk, "indices": { - "literal": [c.literal.hash for c in dataset.db_columns if c.indexed], - "alias": [c.alias.hash for c in dataset.db_columns if c.indexed], + "literal": [c.literal.base64 for c in dataset.db_columns if c.indexed], + "alias": [c.alias.base64 for c in dataset.db_columns if c.indexed], }, } From 4b13c182954c0a250043f14e846d25d1bba9ffdc Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 18:03:20 +0000 Subject: [PATCH 06/14] More robust checking of Source insertion and retrieval --- test/server/test_adapter.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py index c369691e..2e650c65 100644 --- a/test/server/test_adapter.py +++ b/test/server/test_adapter.py @@ -3,7 +3,7 @@ import pytest import rustworkx as rx from dotenv import find_dotenv, load_dotenv -from matchbox.common.db import Source +from matchbox.common.db import Source, SourceColumn from matchbox.common.exceptions import ( MatchboxDataError, MatchboxDatasetError, @@ -114,9 +114,24 @@ def
test_get_dataset(self): crn = self.warehouse_data[0] - self.backend.get_dataset( + crn_retrieved = self.backend.get_dataset( db_schema=crn.db_schema, db_table=crn.db_table, engine=crn.database.engine ) + + assert crn.db_columns == crn_retrieved.db_columns + + cols: dict[str, list[SourceColumn]] = {} + + # Indexing isn't used in the custom equality check + for col in crn.db_columns + crn_retrieved.db_columns: + if col.literal.name not in cols: + cols[col.literal.name] = [col] + else: + cols[col.literal.name].append(col) + + for c1, c2 in cols.values(): + assert c1.indexed == c2.indexed + with pytest.raises(MatchboxDatasetError): self.backend.get_dataset( db_schema="nonexistent", From 248dda73ef5102450043f14e846d25d1bba9ffdc Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 18:22:32 +0000 Subject: [PATCH 07/14] Added warning to query function --- src/matchbox/server/postgresql/utils/query.py | 12 ++++++++++++ test/server/test_adapter.py | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/matchbox/server/postgresql/utils/query.py b/src/matchbox/server/postgresql/utils/query.py index ff492bf5..dbc235a2 100644 --- a/src/matchbox/server/postgresql/utils/query.py +++ b/src/matchbox/server/postgresql/utils/query.py @@ -1,4 +1,5 @@ import logging +import warnings from typing import TYPE_CHECKING, Any, Literal, TypeVar import pyarrow as pa @@ -290,6 +291,17 @@ def query( db_schema=source.db_schema, db_table=source.db_table ) + # Warn if non-indexed fields have been requested + not_indexed = set(fields) - set( + c.literal.name for c in source.db_columns if c.indexed + ) + if not_indexed: + warnings.warn( + "Found non-indexed fields. Do not use these fields in match jobs: " + f"{', '.join(sorted(not_indexed))}", + stacklevel=2, + ) + hash_query = _resolve_cluster_hierarchy( dataset_hash=dataset.hash, model=truth_model if truth_model else dataset, diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py index 2e650c65..5002647a 100644 --- a/test/server/test_adapter.py +++ b/test/server/test_adapter.py @@ -413,6 +413,25 @@ def count_deduplicates(df: DataFrame) -> int: assert self.backend.data.count() == unique + def test_query_warning(self): + """Tests that querying non-indexed fields warns the user.""" + self.setup_database("index") + + crn = self.warehouse_data[0] + select_crn = selector( + table=str(crn), + fields=["id", "crn"], + engine=crn.database.engine, + ) + with pytest.warns(Warning): + query( + selector=select_crn, + backend=self.backend, + model=None, + return_type="pandas", + limit=10, + ) + def test_query_single_table(self): """Test querying data from the database.""" self.setup_database("index") From c34f10d469e624d1213f2a5ae87195b0345587c7 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Sun, 8 Dec 2024 12:16:23 +0000 Subject: [PATCH 08/14] Added unit test to show TOML reading works as expected --- pyproject.toml | 1 + sample.datasets.toml | 25 ++++---- src/matchbox/common/db.py | 29 ++++++--- test/client/test_admin.py | 126 ++++++++++++++++++++++++++++++++++++++ uv.lock | 11 ++++ 5 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 test/client/test_admin.py diff --git a/pyproject.toml b/pyproject.toml index 08063a37..f4d33280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dev = [ "pytest-env>=1.1.5", "ruff>=0.6.8", "docker>=7.1.0", + "tomli-w>=1.1.0", ] typing = [ "polars>=1.11.0", diff --git a/sample.datasets.toml b/sample.datasets.toml index 343e5a2e..dd41a62a 100644 --- a/sample.datasets.toml +++
b/sample.datasets.toml @@ -12,9 +12,10 @@ db_schema = "companieshouse" db_table = "companies" db_pk = "id" index = [ - { literal = "crn", type = "text" }, - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" } + { literal = "crn", alias = "crn_id", type = "VARCHAR" }, + { literal = "company_name", alias = "name" }, + { literal = "*" }, + { literal = "postcode" } ] [datasets.data_hub_companies] @@ -23,9 +24,9 @@ db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" index = [ - { literal = "cdms", type = "text" }, - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, + { literal = "cdms", alias = "cdms_id", type = "VARCHAR" }, + { literal = "company_name", alias = "name" }, + { literal = "postcode" }, { literal = "*" } ] @@ -35,8 +36,8 @@ db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" index = [ - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, + { literal = "company_name", alias = "name" }, + { literal = "postcode" }, ] [datasets.export_wins] @@ -45,8 +46,8 @@ db_schema = "dbt" db_table = "export_wins__wins_dataset" db_pk = "id" index = [ - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, - { literal = "cdms", type = "text" }, - { literal = "data_hub_company_id", alias = "dh_id", type = "text" }, + { literal = "company_name" }, + { literal = "postcode" }, + { literal = "cdms", alias = "cdms_id", type = "VARCHAR" }, + { literal = "data_hub_company_id", alias = "dh_id", type = "VARCHAR" }, ] diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 23f608b4..ee864eb4 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -84,6 +84,25 @@ def engine(self) -> Engine: self.test_connection() return self._engine + def __str__(self): + return ( + f"SourceWarehouse(alias={self.alias}, type={self.db_type}, " + f"host={self.host}, port={self.port}, database={self.database})" + ) + + def __eq__(self, other): + if not isinstance(other, SourceWarehouse): + return False + return ( + self.alias == other.alias + and self.db_type == other.db_type + and self.user == other.user + and self.password == other.password + and self.host == other.host + and self.port == other.port + and self.database == other.database + ) + def test_connection(self): try: with self.engine.connect() as connection: @@ -92,12 +111,6 @@ def test_connection(self): self._engine = None raise - def __str__(self): - return ( - f"SourceWarehouse(alias={self.alias}, type={self.db_type}, " - f"host={self.host}, port={self.port}, database={self.database})" - ) - @classmethod def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehouse": """Create a SourceWarehouse instance from an SQLAlchemy Engine object.""" @@ -142,7 +155,7 @@ class SourceColumn(BaseModel): description="The literal name of the column in the database." 
) alias: SourceColumnName = Field( - default_factory=lambda data: data["literal"], + default_factory=lambda data: SourceColumnName(name=data["literal"].name), description="The alias to use when hashing the dataset in Matchbox.", ) type: str | None = Field( @@ -287,7 +300,7 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": if local_columns: # Concatenate with TOML order, honouring star location (if present) - if star_index: + if star_index is not None: db_columns = db_indexed_columns for c in db_non_indexed_columns: c.indexed = True diff --git a/test/client/test_admin.py b/test/client/test_admin.py new file mode 100644 index 00000000..e4a69ede --- /dev/null +++ b/test/client/test_admin.py @@ -0,0 +1,126 @@ +import re +from pathlib import Path +from tempfile import NamedTemporaryFile +from textwrap import dedent + +import pytest +from matchbox.admin import load_datasets_from_config +from matchbox.common.db import Source, SourceWarehouse +from tomli_w import dumps + + +def warehouse_toml(warehouse: SourceWarehouse) -> str: + return dedent(f""" + [warehouses.{warehouse.alias}] + db_type = "{warehouse.db_type}" + user = "{warehouse.user}" + password = "{warehouse.password}" + host = "{warehouse.host}" + port = {warehouse.port} + database = "{warehouse.database}" + """).strip() + + +def source_toml(source: Source, index: list[dict[str, str]]) -> str: + index_str = dumps({"index": index}).replace("\n", "\n ") + return dedent(f""" + [datasets.{re.sub(r"[^a-zA-Z0-9]", "", source.alias)}] + database = "test_warehouse" + db_schema = "{source.db_schema}" + db_table = "{source.db_table}" + db_pk = "{source.db_pk}" + {index_str} + """) + + +@pytest.mark.parametrize( + "index", + ( + [ + {"literal": "company_name"}, + {"literal": "crn"}, + ], + [{"literal": "company_name", "type": "VARCHAR", "alias": "name"}], + [ + {"literal": "company_name"}, + {"literal": "*"}, + ], + [ + {"literal": "*"}, + {"literal": "company_name"}, + ], + ), + ids=["vanilla", "alias_and_type", "star_end", "star_start"], +) +def test_load_datasets_from_config( + index: list[dict[str, str]], + warehouse: SourceWarehouse, + warehouse_data: list[Source], +): + """Tests loading datasets from a TOML file.""" + # Construct TOML from CRN data + # Columns: "id", "company_name", "crn" + crn = warehouse_data[0] + raw_toml = dedent(f""" + {warehouse_toml(warehouse)} + {source_toml(crn, index)} + """).strip() + + with NamedTemporaryFile(suffix=".toml", delete=False) as temp_file: + temp_file.write(raw_toml.encode()) + temp_file.flush() + temp_file_path = Path(temp_file.name) + + # Ingest + config = load_datasets_from_config(temp_file_path) + + # Helper variables + source = config.get(re.sub(r"[^a-zA-Z0-9]", "", crn.alias)) + named = [idx["literal"] for idx in index if idx["literal"] != "*"] + has_star = any(idx["literal"] == "*" for idx in index) + star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None) + col_names = [col.literal.name for col in source.db_columns] + + # Test 1: Core attributes match + assert source.database == warehouse + assert source.alias == re.sub(r"[^a-zA-Z0-9]", "", crn.alias) + assert source.db_schema == crn.db_schema + assert source.db_table == crn.db_table + assert source.db_pk == crn.db_pk + + # Test 2: All non-pk columns present + assert set(col_names) == {"company_name", "crn", "id"} - {source.db_pk} + + # Test 3: Column indexing + for col in source.db_columns: + assert col.indexed == (has_star or col.literal.name in named) + + # Test 4: Aliases and types match + for idx in 
index: + if idx["literal"] == "*": + continue + col = next(c for c in source.db_columns if c.literal.name == idx["literal"]) + assert col.alias.name == idx.get("alias", idx["literal"]) + assert col.type == idx.get("type", col.type) + + # Test 5: Column ordering + if star_pos is None: + for i, name in enumerate(named): + assert col_names[i] == name + else: + for i, idx in enumerate(index): + if idx["literal"] != "*": + if i < star_pos: + assert col_names[i] == idx["literal"] + else: + star_col_count = len(col_names) - len(index) + 1 + assert col_names[i + star_col_count - 1] == idx["literal"] + + # Test 6: column equalities + + assert source.db_columns[0] != source.db_columns[1] + assert source.db_columns[0] == source.db_columns[0] + assert source.db_columns[1].literal.hash == source.db_columns[1] + assert source.db_columns[1].alias.hash == source.db_columns[1] + assert source.db_columns[0].literal.hash != source.db_columns[1] + assert source.db_columns[0].alias.hash != source.db_columns[1] diff --git a/uv.lock b/uv.lock index 8e9e11a1..7934496f 100644 --- a/uv.lock +++ b/uv.lock @@ -943,6 +943,7 @@ dev = [ { name = "pytest-cov" }, { name = "pytest-env" }, { name = "ruff" }, + { name = "tomli-w" }, ] typing = [ { name = "polars" }, @@ -977,6 +978,7 @@ dev = [ { name = "pytest-cov", specifier = ">=5.0.0" }, { name = "pytest-env", specifier = ">=1.1.5" }, { name = "ruff", specifier = ">=0.6.8" }, + { name = "tomli-w", specifier = ">=1.1.0" }, ] typing = [{ name = "polars", specifier = ">=1.11.0" }] @@ -1945,6 +1947,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", size = 12757 }, ] +[[package]] +name = "tomli-w" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/19/b65f1a088ee23e37cdea415b357843eca8b1422a7b11a9eee6e35d4ec273/tomli_w-1.1.0.tar.gz", hash = "sha256:49e847a3a304d516a169a601184932ef0f6b61623fe680f836a2aa7128ed0d33", size = 6929 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/ac/ce90573ba446a9bbe65838ded066a805234d159b4446ae9f8ec5bbd36cbd/tomli_w-1.1.0-py3-none-any.whl", hash = "sha256:1403179c78193e3184bfaade390ddbd071cba48a32a2e62ba11aae47490c63f7", size = 6440 }, +] + [[package]] name = "tornado" version = "6.4.1" From 67cdc171d3b7e8c8a8ccaa6ca8a2fccd92f9054c Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Sun, 8 Dec 2024 12:22:56 +0000 Subject: [PATCH 09/14] Refactored hash_columns with cleaner logic --- src/matchbox/common/db.py | 133 ++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index ee864eb4..5d16bd5f 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -241,80 +241,87 @@ def __hash__(self) -> int: def hash_columns(cls, data: dict[str, Any]) -> "Source": """Shapes indices data from either the backend or TOML. - Handles: - * From TOML, no columns specified - * From TOML, some or all columns specified - * From the database, indices already present + Handles three scenarios: + 1. No columns specified - all columns except primary key are indexed + 2. Columns specified in TOML - uses specified columns with optional '*' + 3. 
Indices from database - uses existing column hash information """ - # Database setup - if isinstance(data["database"], SourceWarehouse): - warehouse = data["database"] - else: - warehouse = SourceWarehouse(**data["database"]) + # Initialise warehouse and get table metadata + warehouse = ( + data["database"] + if isinstance(data["database"], SourceWarehouse) + else SourceWarehouse(**data["database"]) + ) + metadata = MetaData(schema=data["db_schema"]) table = Table(data["db_table"], metadata, autoload_with=warehouse.engine) - # Column logic - # Get all locally specified columns, or remotely specified hashes - local_columns: list[SourceColumn] = [] - star_index: int | None = None - local_hashes: SourceIndex | None = None - index_data: dict | list | None = data.get("index") - if isinstance(index_data, dict): - # Came from Matchbox database - local_hashes = SourceIndex(**index_data) - else: - # Came from TOML - for column in index_data or []: - if column["literal"] == "*": - star_index = len(local_columns) - continue - local_columns.append(SourceColumn(**column, indexed=True)) - - # Get all remote columns using the user's creds and merge with local spec + # Get all columns except primary key remote_columns = [ SourceColumn(literal=col.name, type=str(col.type), indexed=False) for col in table.columns if col.name not in data["db_pk"] ] - db_columns: list[SourceColumn] = [] - db_indexed_columns: list[SourceColumn] = [] - db_non_indexed_columns: list[SourceColumn] = [] - - for remote_column in remote_columns: - if local_columns: - # Came from TOML, index and alias are configured from TOML - for local_column in local_columns: - if remote_column == local_column: - if local_column.type is None: - local_column.type = remote_column.type - db_indexed_columns.append(local_column) - break - else: - db_non_indexed_columns.append(remote_column) - elif local_hashes: - # Came from database, index is true when hashes match - if remote_column in local_hashes.literal + local_hashes.alias: - remote_column.indexed = True - db_columns.append(remote_column) - - if local_columns: - # Concatenate with TOML order, honouring star location (if present) - if star_index is not None: - db_columns = db_indexed_columns - for c in db_non_indexed_columns: - c.indexed = True - db_columns.insert(star_index, c) - else: - db_columns = db_indexed_columns + db_non_indexed_columns - - if not local_columns and not local_hashes: - # No columns specified, index all columns except the primary key - for col in remote_columns: + + index_data = data.get("index") + + # Case 1: No columns specified - index everything + if not index_data: + data["db_columns"] = [ + SourceColumn(literal=col.literal.name, type=col.type, indexed=True) + for col in remote_columns + ] + return data + + # Case 2: Columns from database + if isinstance(index_data, dict): + source_index = SourceIndex(**index_data) + data["db_columns"] = [ + SourceColumn( + literal=col.literal.name, + type=col.type, + indexed=col in source_index.literal + source_index.alias, + ) + for col in remote_columns + ] + return data + + # Case 3: Columns from TOML + local_columns = [] + star_index = None + + # Process TOML column specifications + for i, column in enumerate(index_data): + if column["literal"] == "*": + star_index = i + continue + local_columns.append(SourceColumn(**column, indexed=True)) + + # Match remote columns with local specifications + indexed_columns = [] + non_indexed_columns = [] + + for remote_col in remote_columns: + matched = False + for local_col in local_columns: + 
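+                # SourceColumn equality is customised: it ignores "indexed" and
+                # matches on literal or alias names and their hashes, as the
+                # equality asserts in test_admin.py exercise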
+                if remote_col == local_col:
+                    if local_col.type is None:
+                        local_col.type = remote_col.type
+                    indexed_columns.append(local_col)
+                    matched = True
+                    break
+            if not matched:
+                non_indexed_columns.append(remote_col)
+
+        # Handle wildcard insertion
+        if star_index is not None:
+            for col in non_indexed_columns:
                 col.indexed = True
+            indexed_columns[star_index:star_index] = non_indexed_columns
+            data["db_columns"] = indexed_columns
+        else:
+            data["db_columns"] = indexed_columns + non_indexed_columns
-        data["db_columns"] = db_columns
 
         return data

From 28c01f7a7fffdce998c10954342c14004a6fc0aa Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Wed, 11 Dec 2024 15:29:12 +0000
Subject: [PATCH 10/14] Dealt with Leo's comments

---
 src/matchbox/common/db.py             | 33 +++++++++----------
 src/matchbox/server/postgresql/orm.py |  8 ++++++-
 test/client/test_admin.py             |  9 ++++----
 test/server/test_adapter.py           |  8 +++----
 4 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index 5d16bd5f..02fff10b 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -7,7 +7,14 @@
 from matchbox.common.hash import HASH_FUNC
 from pandas import DataFrame
 from pyarrow import Table as ArrowTable
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    SecretStr,
+    field_validator,
+    model_validator,
+)
 from sqlalchemy import (
     LABEL_STYLE_TABLENAME_PLUS_COL,
     ColumnElement,
@@ -70,7 +77,7 @@ class SourceWarehouse(BaseModel):
     alias: str
     db_type: str
     user: str
-    password: str = Field(repr=False)
+    password: SecretStr
     host: str
     port: int
     database: str
@@ -79,17 +86,11 @@ class SourceWarehouse(BaseModel):
     @property
     def engine(self) -> Engine:
         if self._engine is None:
-            connection_string = f"{self.db_type}://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}"
+            connection_string = f"{self.db_type}://{self.user}:{self.password.get_secret_value()}@{self.host}:{self.port}/{self.database}"
             self._engine = create_engine(connection_string)
             self.test_connection()
         return self._engine
 
-    def __str__(self):
-        return (
-            f"SourceWarehouse(alias={self.alias}, type={self.db_type}, "
-            f"host={self.host}, port={self.port}, database={self.database})"
-        )
-
     def __eq__(self, other):
         if not isinstance(other, SourceWarehouse):
             return False
@@ -205,13 +206,6 @@ def string_to_name(cls: "SourceColumn", value: str) -> SourceColumnName:
         raise ValueError("Column name must be a string.")
 
 
-class SourceIndex(BaseModel):
-    """The hashes of column names in the Matchbox database."""
-
-    literal: list[bytes]
-    alias: list[bytes]
-
-
 class Source(BaseModel):
     """A dataset that can be indexed in the Matchbox database."""
 
@@ -243,8 +237,8 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         Handles three scenarios:
         1. No columns specified - all columns except primary key are indexed
-        2. Columns specified in TOML - uses specified columns with optional '*'
-        3. Indices from database - uses existing column hash information
+        2. Indices from database - uses existing column hash information
+        3. Columns specified in TOML - uses specified columns with optional '*'
         """
@@ -269,12 +263,11 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         # Case 2: Columns from database
         if isinstance(index_data, dict):
-            source_index = SourceIndex(**index_data)
             data["db_columns"] = [
                 SourceColumn(
                     literal=col.literal.name,
                     type=col.type,
-                    indexed=col in source_index.literal + source_index.alias,
+                    indexed=col in index_data["literal"] + index_data["alias"],
                 )
                 for col in remote_columns
             ]
diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py
index fc54871e..e6d82b30 100644
--- a/src/matchbox/server/postgresql/orm.py
+++ b/src/matchbox/server/postgresql/orm.py
@@ -7,6 +7,7 @@
     CheckConstraint,
     Column,
     ForeignKey,
+    UniqueConstraint,
     select,
 )
 from sqlalchemy.dialects.postgresql import ARRAY, BYTEA, JSONB
@@ -152,7 +153,7 @@ class Sources(CountMixin, MBDB.MatchboxBase):
     model = Column(
         BYTEA, ForeignKey("models.hash", ondelete="CASCADE"), primary_key=True
     )
-    alias = Column(VARCHAR, nullable=False, unique=True)
+    alias = Column(VARCHAR, nullable=False)
     schema = Column(VARCHAR, nullable=False)
     table = Column(VARCHAR, nullable=False)
     id = Column(VARCHAR, nullable=False)
@@ -162,6 +163,11 @@ class Sources(CountMixin, MBDB.MatchboxBase):
     dataset_model = relationship("Models", back_populates="source")
     clusters = relationship("Clusters", back_populates="source")
 
+    # Constraints
+    __table_args__ = (
+        UniqueConstraint("alias", "schema", "table", name="unique_alias_schema_table"),
+    )
+
     @classmethod
     def list(cls) -> list["Sources"]:
         with Session(MBDB.get_engine()) as session:
diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index e4a69ede..f54c3900 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -1,4 +1,3 @@
-import re
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
@@ -14,7 +13,7 @@ def warehouse_toml(warehouse: SourceWarehouse) -> str:
     [warehouses.{warehouse.alias}]
     db_type = "{warehouse.db_type}"
     user = "{warehouse.user}"
-    password = "{warehouse.password}"
+    password = "{warehouse.password.get_secret_value()}"
     host = "{warehouse.host}"
     port = {warehouse.port}
     database = "{warehouse.database}"
@@ -24,7 +23,7 @@ def source_toml(source: Source, index: list[dict[str, str]]) -> str:
     index_str = dumps({"index": index}).replace("\n", "\n    ")
     return dedent(f"""
-    [datasets.{re.sub(r"[^a-zA-Z0-9]", "", source.alias)}]
+    [datasets.{source.alias.replace(".", "")}]
     database = "test_warehouse"
     db_schema = "{source.db_schema}"
     db_table = "{source.db_table}"
     db_pk = "{source.db_pk}"
@@ -75,7 +74,7 @@ def test_load_datasets_from_config(
     config = load_datasets_from_config(temp_file_path)
 
     # Helper variables
-    source = config.get(re.sub(r"[^a-zA-Z0-9]", "", crn.alias))
+    source = config.get(crn.alias.replace(".", ""))
     named = [idx["literal"] for idx in index if idx["literal"] != "*"]
     has_star = any(idx["literal"] == "*" for idx in index)
     star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
@@ -83,7 +82,7 @@ def test_load_datasets_from_config(
 
     # Test 1: Core attributes match
     assert source.database == warehouse
-    assert source.alias == re.sub(r"[^a-zA-Z0-9]", "", crn.alias)
+    assert source.alias == crn.alias.replace(".", "")
     assert source.db_schema == crn.db_schema
     assert source.db_table == crn.db_table
     assert source.db_pk == crn.db_pk
diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py
index 5002647a..b1af8c2c 100644
--- a/test/server/test_adapter.py
+++ b/test/server/test_adapter.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import Callable
 
 import pytest
@@ -120,14 +121,11 @@ def test_get_dataset(self):
 
         assert crn.db_columns == crn_retrieved.db_columns
 
-        cols: dict[str, list[SourceColumn]] = {}
+        cols: defaultdict[str, list[SourceColumn]] = defaultdict(list)
 
         # Indexing isn't used in the custom equality check
         for col in crn.db_columns + crn_retrieved.db_columns:
-            if col.literal.name not in cols:
-                cols[col.literal.name] = [col]
-            else:
-                cols[col.literal.name].append(col)
+            cols[col.literal.name].append(col)
 
         for c1, c2 in cols.values():
             assert c1.indexed == c2.indexed

From 8c6e1b61ad0540daa083dbec761f9cad933fcd23 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:09:29 +0000
Subject: [PATCH 11/14] Fixed warning unit tests

---
 test/server/test_adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py
index 5997517f..26388c22 100644
--- a/test/server/test_adapter.py
+++ b/test/server/test_adapter.py
@@ -430,7 +430,7 @@ def test_query_warning(self):
             query(
                 selector=select_crn,
                 backend=self.backend,
-                model=None,
+                resolution=None,
                 return_type="pandas",
                 limit=10,
             )

From f75e311ff8a63a6383d6e4b407606ef81dad761a Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:11:52 +0000
Subject: [PATCH 12/14] Refactored hash to base64 code

---
 src/matchbox/common/db.py    | 5 ++---
 src/matchbox/common/graph.py | 8 ++++++--
 src/matchbox/common/hash.py  | 2 +-
 test/fixtures/graph.py       | 4 ++--
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index 02fff10b..cdf42d7a 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -1,10 +1,9 @@
-import base64
 from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload
 
 import connectorx as cx
 import pyarrow as pa
 from matchbox.common.exceptions import MatchboxValidatonError
-from matchbox.common.hash import HASH_FUNC
+from matchbox.common.hash import HASH_FUNC, hash_to_base64
 from pandas import DataFrame
 from pyarrow import Table as ArrowTable
 from pydantic import (
@@ -144,7 +143,7 @@ def hash(self) -> bytes:
     @property
     def base64(self) -> str:
         """Generate a base64 encoded hash based on the column name."""
-        return base64.b64encode(self.hash).decode("utf-8")
+        return hash_to_base64(self.hash)
 
 
 class SourceColumn(BaseModel):
diff --git a/src/matchbox/common/graph.py b/src/matchbox/common/graph.py
index 1c9005de..0ba6b451 100644
--- a/src/matchbox/common/graph.py
+++ b/src/matchbox/common/graph.py
@@ -1,7 +1,7 @@
 from enum import StrEnum
 
 import rustworkx as rx
-from matchbox.common.hash import hash_to_str
+from matchbox.common.hash import hash_to_base64
 from pydantic import BaseModel
 
 
@@ -36,7 +36,11 @@ def to_rx(self) -> rx.PyDiGraph:
         nodes = {}
         G = rx.PyDiGraph()
         for n in self.nodes:
-            node_data = {"id": hash_to_str(n.hash), "name": n.name, "type": str(n.type)}
+            node_data = {
+                "id": hash_to_base64(n.hash),
+                "name": n.name,
+                "type": str(n.type),
+            }
             nodes[n.hash] = G.add_node(node_data)
         for e in self.edges:
             G.add_edge(nodes[e.parent], nodes[e.child], {})
diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index 0fff7fd4..fd6990cf 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -18,7 +18,7 @@
 HASH_FUNC = hashlib.sha256
 
 
-def hash_to_str(hash: bytes) -> str:
+def hash_to_base64(hash: bytes) -> str:
     return base64.b64encode(hash).decode("utf-8")
 
 
diff --git a/test/fixtures/graph.py b/test/fixtures/graph.py
index dda47b0e..d2f038f7 100644
--- a/test/fixtures/graph.py
+++ b/test/fixtures/graph.py
@@ -7,7 +7,7 @@
 from matchbox.common.graph import (
     ResolutionNodeType as ResType,
 )
-from matchbox.common.hash import hash_to_str
+from matchbox.common.hash import hash_to_base64
 from rustworkx import PyDiGraph
 
 
@@ -35,7 +35,7 @@ def resolution_graph() -> ResolutionGraph:
 @pytest.fixture
 def pydigraph() -> PyDiGraph:
     def make_id(n: int) -> str:
-        return hash_to_str(bytes(n))
+        return hash_to_base64(bytes(n))
 
     G = PyDiGraph()
     n1 = G.add_node({"id": make_id(1), "name": "1", "type": str(ResType.DATASET)})

From f55866e26da8b87eede82efe4cd781aeba4b6a06 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:15:13 +0000
Subject: [PATCH 13/14] Dealt with allowing periods in a source's alias

---
 test/client/test_admin.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index f54c3900..483ae0b8 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -22,8 +22,9 @@ def warehouse_toml(warehouse: SourceWarehouse) -> str:
 
 def source_toml(source: Source, index: list[dict[str, str]]) -> str:
     index_str = dumps({"index": index}).replace("\n", "\n    ")
+    alias = source.alias if "." not in source.alias else f'"{source.alias}"'
     return dedent(f"""
-    [datasets.{source.alias.replace(".", "")}]
+    [datasets.{alias}]
     database = "test_warehouse"
     db_schema = "{source.db_schema}"
     db_table = "{source.db_table}"
     db_pk = "{source.db_pk}"
@@ -74,7 +75,7 @@ def test_load_datasets_from_config(
     config = load_datasets_from_config(temp_file_path)
 
     # Helper variables
-    source = config.get(crn.alias.replace(".", ""))
+    source = config.get(crn.alias)
     named = [idx["literal"] for idx in index if idx["literal"] != "*"]
     has_star = any(idx["literal"] == "*" for idx in index)
     star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
@@ -82,7 +83,7 @@ def test_load_datasets_from_config(
 
     # Test 1: Core attributes match
     assert source.database == warehouse
-    assert source.alias == crn.alias.replace(".", "")
+    assert source.alias == crn.alias
     assert source.db_schema == crn.db_schema
     assert source.db_table == crn.db_table
     assert source.db_pk == crn.db_pk

From 3e75fb5e1fd7de4ffe496b0b1de7108636d956d2 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:30:03 +0000
Subject: [PATCH 14/14] Removed wildcard selector and declaring column type

---
 src/matchbox/common/db.py | 29 ++++++++++----------------
 test/client/test_admin.py | 43 ++++++++-------------------------------
 2 files changed, 19 insertions(+), 53 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index cdf42d7a..0c913b8a 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -237,7 +237,7 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         Handles three scenarios:
         1. No columns specified - all columns except primary key are indexed
         2. Indices from database - uses existing column hash information
-        3. Columns specified in TOML - uses specified columns with optional '*'
+        3. Columns specified in TOML - specified columns are indexed
         """
@@ -280,14 +280,16 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         # Case 3: Columns from TOML
         local_columns = []
-        star_index = None
 
         # Process TOML column specifications
-        for i, column in enumerate(index_data):
-            if column["literal"] == "*":
-                star_index = i
-                continue
-            local_columns.append(SourceColumn(**column, indexed=True))
+        for column in index_data:
+            local_columns.append(
+                SourceColumn(
+                    literal=column["literal"],
+                    alias=column.get("alias", column["literal"]),
+                    indexed=True,
+                )
+            )
 
         # Match remote columns with local specifications
         indexed_columns = []
@@ -296,22 +298,13 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
         for remote_col in remote_columns:
             matched = False
             for local_col in local_columns:
-                if remote_col == local_col:
-                    if local_col.type is None:
-                        local_col.type = remote_col.type
+                if remote_col.literal == local_col.literal:
                     indexed_columns.append(local_col)
                     matched = True
                     break
             if not matched:
                 non_indexed_columns.append(remote_col)
 
-        # Handle wildcard insertion
-        if star_index is not None:
-            for col in non_indexed_columns:
-                col.indexed = True
-            indexed_columns[star_index:star_index] = non_indexed_columns
-            data["db_columns"] = indexed_columns
-        else:
-            data["db_columns"] = indexed_columns + non_indexed_columns
+        data["db_columns"] = indexed_columns + non_indexed_columns
 
         return data
diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index 483ae0b8..fc4f3f53 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -40,17 +40,9 @@ def source_toml(source: Source, index: list[dict[str, str]]) -> str:
             {"literal": "company_name"},
             {"literal": "crn"},
         ],
-        [{"literal": "company_name", "type": "VARCHAR", "alias": "name"}],
-        [
-            {"literal": "company_name"},
-            {"literal": "*"},
-        ],
-        [
-            {"literal": "*"},
-            {"literal": "company_name"},
-        ],
+        [{"literal": "company_name", "alias": "name"}],
     ),
-    ids=["vanilla", "alias_and_type", "star_end", "star_start"],
+    ids=["vanilla", "alias"],
 )
 def test_load_datasets_from_config(
     index: list[dict[str, str]],
@@ -68,9 +60,7 @@ def test_load_datasets_from_config(
 
     # Helper variables
     source = config.get(crn.alias)
-    named = [idx["literal"] for idx in index if idx["literal"] != "*"]
-    has_star = any(idx["literal"] == "*" for idx in index)
-    star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
+    named = [idx["literal"] for idx in index]
     col_names = [col.literal.name for col in source.db_columns]
 
@@ -81,33 +71,16 @@ def test_load_datasets_from_config(
     # Test 2: All non-pk columns present
     assert set(col_names) == {"company_name", "crn", "id"} - {source.db_pk}
 
-    # Test 3: Column indexing
-    for col in source.db_columns:
-        assert col.indexed == (has_star or col.literal.name in named)
-
-    # Test 4: Aliases and types match
+    # Test 3: Aliases match
     for idx in index:
-        if idx["literal"] == "*":
-            continue
         col = next(c for c in source.db_columns if c.literal.name == idx["literal"])
         assert col.alias.name == idx.get("alias", idx["literal"])
-        assert col.type == idx.get("type", col.type)
-
-    # Test 5: Column ordering
-    if star_pos is None:
-        for i, name in enumerate(named):
-            assert col_names[i] == name
-    else:
-        for i, idx in enumerate(index):
-            if idx["literal"] != "*":
-                if i < star_pos:
-                    assert col_names[i] == idx["literal"]
-                else:
-                    star_col_count = len(col_names) - len(index) + 1
-                    assert col_names[i + star_col_count - 1] == idx["literal"]
 
-    # Test 6: column equalities
+    # Test 4: Column ordering
+    for i, name in enumerate(named):
+        assert col_names[i] == name
 
+    # Test 5: column equalities
     assert source.db_columns[0] != source.db_columns[1]
     assert source.db_columns[0] == source.db_columns[0]
     assert source.db_columns[1].literal.hash == source.db_columns[1]
     assert source.db_columns[1].alias.hash == source.db_columns[1]
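
Note on the end state of this series: after PATCH 14, an index entry in the datasets TOML is just a literal column name with an optional alias. Wildcard ("*") entries and explicit type casts are no longer read, every listed column is indexed, and any remaining warehouse columns are ingested unindexed. A minimal sketch of the resulting configuration; the warehouse credentials, schema and table names, and the "companies" alias below are illustrative values modelled on the fixtures in test/client/test_admin.py, not part of the patches themselves:

    [warehouses.test_warehouse]
    db_type = "postgresql"
    user = "warehouse_user"
    password = "warehouse_password"
    host = "localhost"
    port = 7654
    database = "warehouse"

    [datasets.companies]
    database = "test_warehouse"
    db_schema = "test"
    db_table = "crn"
    db_pk = "id"
    index = [
        { literal = "crn" },
        { literal = "company_name", alias = "name" },
    ]

Loading it then mirrors test_load_datasets_from_config:

    from pathlib import Path

    from matchbox.admin import load_datasets_from_config

    datasets = load_datasets_from_config(Path("datasets.toml"))
    source = datasets.get("companies")
    # Listed columns come first, in TOML order, with aliases falling back to
    # the literal name; unlisted warehouse columns follow, unindexed.
    print([(col.alias.name, col.indexed) for col in source.db_columns])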