From c561201550126fe9644d144fee8b8301c0dd6f88 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Thu, 5 Dec 2024 17:09:22 +0000 Subject: [PATCH 01/14] Initial structuring of config files --- sample.datasets.toml | 22 +++++++++++++++++++++- src/matchbox/admin.py | 4 +--- src/matchbox/common/db.py | 11 ++++++++++- src/matchbox/server/postgresql/adapter.py | 1 + 4 files changed, 33 insertions(+), 5 deletions(-) diff --git a/sample.datasets.toml b/sample.datasets.toml index e0dda57e..585cce7b 100644 --- a/sample.datasets.toml +++ b/sample.datasets.toml @@ -11,21 +11,41 @@ database = "pg_warehouse" db_schema = "companieshouse" db_table = "companies" db_pk = "id" +index = [ + { name = "crn", type = "text", literal = "crn" }, + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.data_hub_companies] database = "pg_warehouse" db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" +index = [ + { name = "cdms", type = "text", literal = "cdms" }, + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.hmrc_exporters] database = "pg_warehouse" db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" +index = [ + { name = "company_name", type = "text", literal = "company_name" }, + { name = "postcode", type = "text", literal = "postcode" } +] [datasets.export_wins] database = "pg_warehouse" db_schema = "dbt" db_table = "export_wins__wins_dataset" -db_pk = "id" \ No newline at end of file +db_pk = "id" +index = [ + { name = "company_name", type = "text", literal = "company_name" }, + { name = "cdms", type = "text", literal = "cdms" }, + { name = "dh_id", type = "text", literal = "data_hub_company_id" }, + { name = "postcode", type = "text", literal = "postcode" } +] diff --git a/src/matchbox/admin.py b/src/matchbox/admin.py index 1dc6c246..a2067594 100644 --- a/src/matchbox/admin.py +++ b/src/matchbox/admin.py @@ -7,9 +7,7 @@ from matchbox.common.db import SourceWarehouse from matchbox.server import MatchboxDBAdapter, inject_backend -from matchbox.server.base import ( - Source, -) +from matchbox.server.base import Source logger = logging.getLogger("mb_logic") diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 61b56693..4b98b5ec 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -116,6 +116,14 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou return warehouse +class SourceColumn(BaseModel): + """A column in a dataset that can be indexed in the Matchbox database.""" + + name: str + type: str | None = None + literal: str | None = None + + class Source(BaseModel): """A dataset that can be indexed in the Matchbox database.""" @@ -123,10 +131,11 @@ class Source(BaseModel): populate_by_name=True, ) - database: SourceWarehouse | None = None + database: SourceWarehouse db_pk: str db_schema: str db_table: str + index: list[SourceColumn | bytes] def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 54dbe451..3a80da61 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -358,6 +358,7 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, + index=None, 
database=SourceWarehouse.from_engine(engine), ) else: From e22f7c6251136b36ababb97cafc7ea83f2507ad1 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 15:59:54 +0000 Subject: [PATCH 02/14] Working ingestion of Source objects with configured columns --- sample.datasets.toml | 25 ++++---- src/matchbox/common/db.py | 132 ++++++++++++++++++++++++++++++++++++-- uv.lock | 90 ++++++++++++++------------ 3 files changed, 188 insertions(+), 59 deletions(-) diff --git a/sample.datasets.toml b/sample.datasets.toml index 585cce7b..343e5a2e 100644 --- a/sample.datasets.toml +++ b/sample.datasets.toml @@ -12,9 +12,9 @@ db_schema = "companieshouse" db_table = "companies" db_pk = "id" index = [ - { name = "crn", type = "text", literal = "crn" }, - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "crn", type = "text" }, + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" } ] [datasets.data_hub_companies] @@ -23,9 +23,10 @@ db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" index = [ - { name = "cdms", type = "text", literal = "cdms" }, - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "cdms", type = "text" }, + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, + { literal = "*" } ] [datasets.hmrc_exporters] @@ -34,8 +35,8 @@ db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" index = [ - { name = "company_name", type = "text", literal = "company_name" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, ] [datasets.export_wins] @@ -44,8 +45,8 @@ db_schema = "dbt" db_table = "export_wins__wins_dataset" db_pk = "id" index = [ - { name = "company_name", type = "text", literal = "company_name" }, - { name = "cdms", type = "text", literal = "cdms" }, - { name = "dh_id", type = "text", literal = "data_hub_company_id" }, - { name = "postcode", type = "text", literal = "postcode" } + { literal = "company_name", type = "text" }, + { literal = "postcode", type = "text" }, + { literal = "cdms", type = "text" }, + { literal = "data_hub_company_id", alias = "dh_id", type = "text" }, ] diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 4b98b5ec..033b0c6c 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -6,7 +6,12 @@ from matchbox.common.hash import HASH_FUNC from pandas import DataFrame from pyarrow import Table as ArrowTable -from pydantic import BaseModel, ConfigDict, Field +from pydantic import ( + BaseModel, + ConfigDict, + Field, + model_validator, +) from sqlalchemy import ( LABEL_STYLE_TABLENAME_PLUS_COL, ColumnElement, @@ -119,9 +124,64 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou class SourceColumn(BaseModel): """A column in a dataset that can be indexed in the Matchbox database.""" - name: str - type: str | None = None - literal: str | None = None + model_config = ConfigDict(arbitrary_types_allowed=True) + + literal: str = Field(description="The literal name of the column in the database.") + alias: str = Field( + default_factory=lambda data: data["literal"], + description="The alias to use when hashing the dataset in Matchbox.", + ) + type: str | None = Field( + default=None, description="The type to cast the column to before 
hashing data." + ) + indexed: bool = Field(description="Whether the column is indexed in the database.") + + def __eq__(self, other: object) -> bool: + """Compare SourceColumn with another SourceColumn or bytes object. + + Two SourceColumns are equal if: + + * Their literal names match, or + * Their alias names match, or + * The hash of either their literal or alias matches the other object's + corresponding hash + + A SourceColumn is equal to a bytes object if: + + * The hash of either its literal or alias matches the bytes object + + Args: + other: Another SourceColumn or a bytes object to compare against + + Returns: + bool: True if the objects are considered equal, False otherwise + """ + literal_hash = HASH_FUNC(str(self.literal).encode("utf-8")).digest() + alias_hash = HASH_FUNC(str(self.alias).encode("utf-8")).digest() + + if isinstance(other, SourceColumn): + if self.literal == other.literal or self.alias == other.alias: + return True + + other_literal_hash = HASH_FUNC(str(other.literal).encode("utf-8")).digest() + other_alias_hash = HASH_FUNC(str(other.alias).encode("utf-8")).digest() + + self_hashes = {literal_hash, alias_hash} + other_hashes = {other_literal_hash, other_alias_hash} + + return bool(self_hashes & other_hashes) + + if isinstance(other, bytes): + return other in {literal_hash, alias_hash} + + return NotImplemented + + +class SourceIndex(BaseModel): + """The hashes of column names in the Matchbox database.""" + + literal: list[bytes] + alias: list[bytes] class Source(BaseModel): @@ -135,7 +195,7 @@ class Source(BaseModel): db_pk: str db_schema: str db_table: str - index: list[SourceColumn | bytes] + db_columns: list[SourceColumn] def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" @@ -145,6 +205,68 @@ def __hash__(self) -> int: (type(self), self.db_pk, self.db_schema, self.db_table, self.database.alias) ) + @model_validator(mode="before") + @classmethod + def hash_columns(cls, data: dict[str, Any]) -> "Source": + """Shapes indices data from either the backend or TOML. 
+ + Handles: + * From TOML, no columns specified + * From TOML, some or all columns specified + * From the database, indices already present + """ + # Database setup + if isinstance(data["database"], SourceWarehouse): + warehouse = data["database"] + else: + warehouse = SourceWarehouse(**data["database"]) + metadata = MetaData(schema=data["db_schema"]) + table = Table(data["db_table"], metadata, autoload_with=warehouse.engine) + + # Column logic + # Get all locally specified columns, or remotely specified hashes + local_columns: list[SourceColumn] = [] + local_hashes: SourceIndex | None = None + select_all = False + if isinstance(data["index"], dict): + # Came from Matchbox database + local_hashes = SourceIndex(**data["index"]) + else: + # Came from TOML + for column in data["index"]: + if column["literal"] == "*": + select_all = True + continue + local_columns.append(SourceColumn(**column, indexed=True)) + continue + + # Get all remote columns using the user's creds and merge with local spec + remote_columns = [ + SourceColumn(literal=col.name, type=str(col.type), indexed=select_all) + for col in table.columns + if col.name not in data["db_pk"] + ] + db_columns: list[SourceColumn] = [] + for remote_column in remote_columns: + if local_columns: + # Came from TOML, index and alias are configured from TOML + for local_column in local_columns: + if remote_column == local_column: + if local_column.type is None: + local_column.type = remote_column.type + db_columns.append(local_column) + break + else: + db_columns.append(remote_column) + elif local_hashes: + # Came from database, index is true when hashes match + if remote_column in local_hashes.literal + local_hashes.alias: + remote_column.indexed = True + db_columns.append(remote_column) + + data["db_columns"] = db_columns + return data + def to_table(self) -> Table: """Returns the dataset as a SQLAlchemy Table object.""" metadata = MetaData(schema=self.db_schema) diff --git a/uv.lock b/uv.lock index 2fdc60eb..8e9e11a1 100644 --- a/uv.lock +++ b/uv.lock @@ -1394,63 +1394,69 @@ wheels = [ [[package]] name = "pydantic" -version = "2.9.2" +version = "2.10.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "annotated-types" }, { name = "pydantic-core" }, { name = "typing-extensions" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/a9/b7/d9e3f12af310e1120c21603644a1cd86f59060e040ec5c3a80b8f05fae30/pydantic-2.9.2.tar.gz", hash = "sha256:d155cef71265d1e9807ed1c32b4c8deec042a44a50a4188b25ac67ecd81a9c0f", size = 769917 } +sdist = { url = "https://files.pythonhosted.org/packages/45/0f/27908242621b14e649a84e62b133de45f84c255eecb350ab02979844a788/pydantic-2.10.3.tar.gz", hash = "sha256:cb5ac360ce894ceacd69c403187900a02c4b20b693a9dd1d643e1effab9eadf9", size = 786486 } wheels = [ - { url = "https://files.pythonhosted.org/packages/df/e4/ba44652d562cbf0bf320e0f3810206149c8a4e99cdbf66da82e97ab53a15/pydantic-2.9.2-py3-none-any.whl", hash = "sha256:f048cec7b26778210e28a0459867920654d48e5e62db0958433636cde4254f12", size = 434928 }, + { url = "https://files.pythonhosted.org/packages/62/51/72c18c55cf2f46ff4f91ebcc8f75aa30f7305f3d726be3f4ebffb4ae972b/pydantic-2.10.3-py3-none-any.whl", hash = "sha256:be04d85bbc7b65651c5f8e6b9976ed9c6f41782a55524cef079a34a0bb82144d", size = 456997 }, ] [[package]] name = "pydantic-core" -version = "2.23.4" +version = "2.27.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "typing-extensions" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/e2/aa/6b6a9b9f8537b872f552ddd46dd3da230367754b6f707b8e1e963f515ea3/pydantic_core-2.23.4.tar.gz", hash = "sha256:2584f7cf844ac4d970fba483a717dbe10c1c1c96a969bf65d61ffe94df1b2863", size = 402156 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/5d/30/890a583cd3f2be27ecf32b479d5d615710bb926d92da03e3f7838ff3e58b/pydantic_core-2.23.4-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:77733e3892bb0a7fa797826361ce8a9184d25c8dffaec60b7ffe928153680ba8", size = 1865160 }, - { url = "https://files.pythonhosted.org/packages/1d/9a/b634442e1253bc6889c87afe8bb59447f106ee042140bd57680b3b113ec7/pydantic_core-2.23.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1b84d168f6c48fabd1f2027a3d1bdfe62f92cade1fb273a5d68e621da0e44e6d", size = 1776777 }, - { url = "https://files.pythonhosted.org/packages/75/9a/7816295124a6b08c24c96f9ce73085032d8bcbaf7e5a781cd41aa910c891/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:df49e7a0861a8c36d089c1ed57d308623d60416dab2647a4a17fe050ba85de0e", size = 1799244 }, - { url = "https://files.pythonhosted.org/packages/a9/8f/89c1405176903e567c5f99ec53387449e62f1121894aa9fc2c4fdc51a59b/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ff02b6d461a6de369f07ec15e465a88895f3223eb75073ffea56b84d9331f607", size = 1805307 }, - { url = "https://files.pythonhosted.org/packages/d5/a5/1a194447d0da1ef492e3470680c66048fef56fc1f1a25cafbea4bc1d1c48/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:996a38a83508c54c78a5f41456b0103c30508fed9abcad0a59b876d7398f25fd", size = 2000663 }, - { url = "https://files.pythonhosted.org/packages/13/a5/1df8541651de4455e7d587cf556201b4f7997191e110bca3b589218745a5/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d97683ddee4723ae8c95d1eddac7c192e8c552da0c73a925a89fa8649bf13eea", size = 2655941 }, - { url = "https://files.pythonhosted.org/packages/44/31/a3899b5ce02c4316865e390107f145089876dff7e1dfc770a231d836aed8/pydantic_core-2.23.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:216f9b2d7713eb98cb83c80b9c794de1f6b7e3145eef40400c62e86cee5f4e1e", size = 2052105 }, - { url = "https://files.pythonhosted.org/packages/1b/aa/98e190f8745d5ec831f6d5449344c48c0627ac5fed4e5340a44b74878f8e/pydantic_core-2.23.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6f783e0ec4803c787bcea93e13e9932edab72068f68ecffdf86a99fd5918878b", size = 1919967 }, - { url = "https://files.pythonhosted.org/packages/ae/35/b6e00b6abb2acfee3e8f85558c02a0822e9a8b2f2d812ea8b9079b118ba0/pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d0776dea117cf5272382634bd2a5c1b6eb16767c223c6a5317cd3e2a757c61a0", size = 1964291 }, - { url = "https://files.pythonhosted.org/packages/13/46/7bee6d32b69191cd649bbbd2361af79c472d72cb29bb2024f0b6e350ba06/pydantic_core-2.23.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d5f7a395a8cf1621939692dba2a6b6a830efa6b3cee787d82c7de1ad2930de64", size = 2109666 }, - { url = "https://files.pythonhosted.org/packages/39/ef/7b34f1b122a81b68ed0a7d0e564da9ccdc9a2924c8d6c6b5b11fa3a56970/pydantic_core-2.23.4-cp311-none-win32.whl", hash = "sha256:74b9127ffea03643e998e0c5ad9bd3811d3dac8c676e47db17b0ee7c3c3bf35f", size = 1732940 }, - { url = 
"https://files.pythonhosted.org/packages/2f/76/37b7e76c645843ff46c1d73e046207311ef298d3f7b2f7d8f6ac60113071/pydantic_core-2.23.4-cp311-none-win_amd64.whl", hash = "sha256:98d134c954828488b153d88ba1f34e14259284f256180ce659e8d83e9c05eaa3", size = 1916804 }, - { url = "https://files.pythonhosted.org/packages/74/7b/8e315f80666194b354966ec84b7d567da77ad927ed6323db4006cf915f3f/pydantic_core-2.23.4-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:f3e0da4ebaef65158d4dfd7d3678aad692f7666877df0002b8a522cdf088f231", size = 1856459 }, - { url = "https://files.pythonhosted.org/packages/14/de/866bdce10ed808323d437612aca1ec9971b981e1c52e5e42ad9b8e17a6f6/pydantic_core-2.23.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f69a8e0b033b747bb3e36a44e7732f0c99f7edd5cea723d45bc0d6e95377ffee", size = 1770007 }, - { url = "https://files.pythonhosted.org/packages/dc/69/8edd5c3cd48bb833a3f7ef9b81d7666ccddd3c9a635225214e044b6e8281/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:723314c1d51722ab28bfcd5240d858512ffd3116449c557a1336cbe3919beb87", size = 1790245 }, - { url = "https://files.pythonhosted.org/packages/80/33/9c24334e3af796ce80d2274940aae38dd4e5676298b4398eff103a79e02d/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bb2802e667b7051a1bebbfe93684841cc9351004e2badbd6411bf357ab8d5ac8", size = 1801260 }, - { url = "https://files.pythonhosted.org/packages/a5/6f/e9567fd90104b79b101ca9d120219644d3314962caa7948dd8b965e9f83e/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d18ca8148bebe1b0a382a27a8ee60350091a6ddaf475fa05ef50dc35b5df6327", size = 1996872 }, - { url = "https://files.pythonhosted.org/packages/2d/ad/b5f0fe9e6cfee915dd144edbd10b6e9c9c9c9d7a56b69256d124b8ac682e/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:33e3d65a85a2a4a0dc3b092b938a4062b1a05f3a9abde65ea93b233bca0e03f2", size = 2661617 }, - { url = "https://files.pythonhosted.org/packages/06/c8/7d4b708f8d05a5cbfda3243aad468052c6e99de7d0937c9146c24d9f12e9/pydantic_core-2.23.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:128585782e5bfa515c590ccee4b727fb76925dd04a98864182b22e89a4e6ed36", size = 2071831 }, - { url = "https://files.pythonhosted.org/packages/89/4d/3079d00c47f22c9a9a8220db088b309ad6e600a73d7a69473e3a8e5e3ea3/pydantic_core-2.23.4-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:68665f4c17edcceecc112dfed5dbe6f92261fb9d6054b47d01bf6371a6196126", size = 1917453 }, - { url = "https://files.pythonhosted.org/packages/e9/88/9df5b7ce880a4703fcc2d76c8c2d8eb9f861f79d0c56f4b8f5f2607ccec8/pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:20152074317d9bed6b7a95ade3b7d6054845d70584216160860425f4fbd5ee9e", size = 1968793 }, - { url = "https://files.pythonhosted.org/packages/e3/b9/41f7efe80f6ce2ed3ee3c2dcfe10ab7adc1172f778cc9659509a79518c43/pydantic_core-2.23.4-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:9261d3ce84fa1d38ed649c3638feefeae23d32ba9182963e465d58d62203bd24", size = 2116872 }, - { url = "https://files.pythonhosted.org/packages/63/08/b59b7a92e03dd25554b0436554bf23e7c29abae7cce4b1c459cd92746811/pydantic_core-2.23.4-cp312-none-win32.whl", hash = "sha256:4ba762ed58e8d68657fc1281e9bb72e1c3e79cc5d464be146e260c541ec12d84", size = 1738535 }, - { url = 
"https://files.pythonhosted.org/packages/88/8d/479293e4d39ab409747926eec4329de5b7129beaedc3786eca070605d07f/pydantic_core-2.23.4-cp312-none-win_amd64.whl", hash = "sha256:97df63000f4fea395b2824da80e169731088656d1818a11b95f3b173747b6cd9", size = 1917992 }, - { url = "https://files.pythonhosted.org/packages/ad/ef/16ee2df472bf0e419b6bc68c05bf0145c49247a1095e85cee1463c6a44a1/pydantic_core-2.23.4-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:7530e201d10d7d14abce4fb54cfe5b94a0aefc87da539d0346a484ead376c3cc", size = 1856143 }, - { url = "https://files.pythonhosted.org/packages/da/fa/bc3dbb83605669a34a93308e297ab22be82dfb9dcf88c6cf4b4f264e0a42/pydantic_core-2.23.4-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:df933278128ea1cd77772673c73954e53a1c95a4fdf41eef97c2b779271bd0bd", size = 1770063 }, - { url = "https://files.pythonhosted.org/packages/4e/48/e813f3bbd257a712303ebdf55c8dc46f9589ec74b384c9f652597df3288d/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0cb3da3fd1b6a5d0279a01877713dbda118a2a4fc6f0d821a57da2e464793f05", size = 1790013 }, - { url = "https://files.pythonhosted.org/packages/b4/e0/56eda3a37929a1d297fcab1966db8c339023bcca0b64c5a84896db3fcc5c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:42c6dcb030aefb668a2b7009c85b27f90e51e6a3b4d5c9bc4c57631292015b0d", size = 1801077 }, - { url = "https://files.pythonhosted.org/packages/04/be/5e49376769bfbf82486da6c5c1683b891809365c20d7c7e52792ce4c71f3/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:696dd8d674d6ce621ab9d45b205df149399e4bb9aa34102c970b721554828510", size = 1996782 }, - { url = "https://files.pythonhosted.org/packages/bc/24/e3ee6c04f1d58cc15f37bcc62f32c7478ff55142b7b3e6d42ea374ea427c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2971bb5ffe72cc0f555c13e19b23c85b654dd2a8f7ab493c262071377bfce9f6", size = 2661375 }, - { url = "https://files.pythonhosted.org/packages/c1/f8/11a9006de4e89d016b8de74ebb1db727dc100608bb1e6bbe9d56a3cbbcce/pydantic_core-2.23.4-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8394d940e5d400d04cad4f75c0598665cbb81aecefaca82ca85bd28264af7f9b", size = 2071635 }, - { url = "https://files.pythonhosted.org/packages/7c/45/bdce5779b59f468bdf262a5bc9eecbae87f271c51aef628d8c073b4b4b4c/pydantic_core-2.23.4-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:0dff76e0602ca7d4cdaacc1ac4c005e0ce0dcfe095d5b5259163a80d3a10d327", size = 1916994 }, - { url = "https://files.pythonhosted.org/packages/d8/fa/c648308fe711ee1f88192cad6026ab4f925396d1293e8356de7e55be89b5/pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:7d32706badfe136888bdea71c0def994644e09fff0bfe47441deaed8e96fdbc6", size = 1968877 }, - { url = "https://files.pythonhosted.org/packages/16/16/b805c74b35607d24d37103007f899abc4880923b04929547ae68d478b7f4/pydantic_core-2.23.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ed541d70698978a20eb63d8c5d72f2cc6d7079d9d90f6b50bad07826f1320f5f", size = 2116814 }, - { url = "https://files.pythonhosted.org/packages/d1/58/5305e723d9fcdf1c5a655e6a4cc2a07128bf644ff4b1d98daf7a9dbf57da/pydantic_core-2.23.4-cp313-none-win32.whl", hash = "sha256:3d5639516376dce1940ea36edf408c554475369f5da2abd45d44621cb616f769", size = 1738360 }, - { url = 
"https://files.pythonhosted.org/packages/a5/ae/e14b0ff8b3f48e02394d8acd911376b7b66e164535687ef7dc24ea03072f/pydantic_core-2.23.4-cp313-none-win_amd64.whl", hash = "sha256:5a1504ad17ba4210df3a045132a7baeeba5a200e930f57512ee02909fc5c4cb5", size = 1919411 }, +sdist = { url = "https://files.pythonhosted.org/packages/a6/9f/7de1f19b6aea45aeb441838782d68352e71bfa98ee6fa048d5041991b33e/pydantic_core-2.27.1.tar.gz", hash = "sha256:62a763352879b84aa31058fc931884055fd75089cccbd9d58bb6afd01141b235", size = 412785 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/27/39/46fe47f2ad4746b478ba89c561cafe4428e02b3573df882334bd2964f9cb/pydantic_core-2.27.1-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac3b20653bdbe160febbea8aa6c079d3df19310d50ac314911ed8cc4eb7f8cb8", size = 1895553 }, + { url = "https://files.pythonhosted.org/packages/1c/00/0804e84a78b7fdb394fff4c4f429815a10e5e0993e6ae0e0b27dd20379ee/pydantic_core-2.27.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a5a8e19d7c707c4cadb8c18f5f60c843052ae83c20fa7d44f41594c644a1d330", size = 1807220 }, + { url = "https://files.pythonhosted.org/packages/01/de/df51b3bac9820d38371f5a261020f505025df732ce566c2a2e7970b84c8c/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7f7059ca8d64fea7f238994c97d91f75965216bcbe5f695bb44f354893f11d52", size = 1829727 }, + { url = "https://files.pythonhosted.org/packages/5f/d9/c01d19da8f9e9fbdb2bf99f8358d145a312590374d0dc9dd8dbe484a9cde/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bed0f8a0eeea9fb72937ba118f9db0cb7e90773462af7962d382445f3005e5a4", size = 1854282 }, + { url = "https://files.pythonhosted.org/packages/5f/84/7db66eb12a0dc88c006abd6f3cbbf4232d26adfd827a28638c540d8f871d/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3cb37038123447cf0f3ea4c74751f6a9d7afef0eb71aa07bf5f652b5e6a132c", size = 2037437 }, + { url = "https://files.pythonhosted.org/packages/34/ac/a2537958db8299fbabed81167d58cc1506049dba4163433524e06a7d9f4c/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:84286494f6c5d05243456e04223d5a9417d7f443c3b76065e75001beb26f88de", size = 2780899 }, + { url = "https://files.pythonhosted.org/packages/4a/c1/3e38cd777ef832c4fdce11d204592e135ddeedb6c6f525478a53d1c7d3e5/pydantic_core-2.27.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:acc07b2cfc5b835444b44a9956846b578d27beeacd4b52e45489e93276241025", size = 2135022 }, + { url = "https://files.pythonhosted.org/packages/7a/69/b9952829f80fd555fe04340539d90e000a146f2a003d3fcd1e7077c06c71/pydantic_core-2.27.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:4fefee876e07a6e9aad7a8c8c9f85b0cdbe7df52b8a9552307b09050f7512c7e", size = 1987969 }, + { url = "https://files.pythonhosted.org/packages/05/72/257b5824d7988af43460c4e22b63932ed651fe98804cc2793068de7ec554/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:258c57abf1188926c774a4c94dd29237e77eda19462e5bb901d88adcab6af919", size = 1994625 }, + { url = "https://files.pythonhosted.org/packages/73/c3/78ed6b7f3278a36589bcdd01243189ade7fc9b26852844938b4d7693895b/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_armv7l.whl", hash = "sha256:35c14ac45fcfdf7167ca76cc80b2001205a8d5d16d80524e13508371fb8cdd9c", size = 2090089 }, + { url = 
"https://files.pythonhosted.org/packages/8d/c8/b4139b2f78579960353c4cd987e035108c93a78371bb19ba0dc1ac3b3220/pydantic_core-2.27.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d1b26e1dff225c31897696cab7d4f0a315d4c0d9e8666dbffdb28216f3b17fdc", size = 2142496 }, + { url = "https://files.pythonhosted.org/packages/3e/f8/171a03e97eb36c0b51981efe0f78460554a1d8311773d3d30e20c005164e/pydantic_core-2.27.1-cp311-none-win32.whl", hash = "sha256:2cdf7d86886bc6982354862204ae3b2f7f96f21a3eb0ba5ca0ac42c7b38598b9", size = 1811758 }, + { url = "https://files.pythonhosted.org/packages/6a/fe/4e0e63c418c1c76e33974a05266e5633e879d4061f9533b1706a86f77d5b/pydantic_core-2.27.1-cp311-none-win_amd64.whl", hash = "sha256:3af385b0cee8df3746c3f406f38bcbfdc9041b5c2d5ce3e5fc6637256e60bbc5", size = 1980864 }, + { url = "https://files.pythonhosted.org/packages/50/fc/93f7238a514c155a8ec02fc7ac6376177d449848115e4519b853820436c5/pydantic_core-2.27.1-cp311-none-win_arm64.whl", hash = "sha256:81f2ec23ddc1b476ff96563f2e8d723830b06dceae348ce02914a37cb4e74b89", size = 1864327 }, + { url = "https://files.pythonhosted.org/packages/be/51/2e9b3788feb2aebff2aa9dfbf060ec739b38c05c46847601134cc1fed2ea/pydantic_core-2.27.1-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:9cbd94fc661d2bab2bc702cddd2d3370bbdcc4cd0f8f57488a81bcce90c7a54f", size = 1895239 }, + { url = "https://files.pythonhosted.org/packages/7b/9e/f8063952e4a7d0127f5d1181addef9377505dcce3be224263b25c4f0bfd9/pydantic_core-2.27.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5f8c4718cd44ec1580e180cb739713ecda2bdee1341084c1467802a417fe0f02", size = 1805070 }, + { url = "https://files.pythonhosted.org/packages/2c/9d/e1d6c4561d262b52e41b17a7ef8301e2ba80b61e32e94520271029feb5d8/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15aae984e46de8d376df515f00450d1522077254ef6b7ce189b38ecee7c9677c", size = 1828096 }, + { url = "https://files.pythonhosted.org/packages/be/65/80ff46de4266560baa4332ae3181fffc4488ea7d37282da1a62d10ab89a4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1ba5e3963344ff25fc8c40da90f44b0afca8cfd89d12964feb79ac1411a260ac", size = 1857708 }, + { url = "https://files.pythonhosted.org/packages/d5/ca/3370074ad758b04d9562b12ecdb088597f4d9d13893a48a583fb47682cdf/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:992cea5f4f3b29d6b4f7f1726ed8ee46c8331c6b4eed6db5b40134c6fe1768bb", size = 2037751 }, + { url = "https://files.pythonhosted.org/packages/b1/e2/4ab72d93367194317b99d051947c071aef6e3eb95f7553eaa4208ecf9ba4/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0325336f348dbee6550d129b1627cb8f5351a9dc91aad141ffb96d4937bd9529", size = 2733863 }, + { url = "https://files.pythonhosted.org/packages/8a/c6/8ae0831bf77f356bb73127ce5a95fe115b10f820ea480abbd72d3cc7ccf3/pydantic_core-2.27.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7597c07fbd11515f654d6ece3d0e4e5093edc30a436c63142d9a4b8e22f19c35", size = 2161161 }, + { url = "https://files.pythonhosted.org/packages/f1/f4/b2fe73241da2429400fc27ddeaa43e35562f96cf5b67499b2de52b528cad/pydantic_core-2.27.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3bbd5d8cc692616d5ef6fbbbd50dbec142c7e6ad9beb66b78a96e9c16729b089", size = 1993294 }, + { url = 
"https://files.pythonhosted.org/packages/77/29/4bb008823a7f4cc05828198153f9753b3bd4c104d93b8e0b1bfe4e187540/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:dc61505e73298a84a2f317255fcc72b710b72980f3a1f670447a21efc88f8381", size = 2001468 }, + { url = "https://files.pythonhosted.org/packages/f2/a9/0eaceeba41b9fad851a4107e0cf999a34ae8f0d0d1f829e2574f3d8897b0/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:e1f735dc43da318cad19b4173dd1ffce1d84aafd6c9b782b3abc04a0d5a6f5bb", size = 2091413 }, + { url = "https://files.pythonhosted.org/packages/d8/36/eb8697729725bc610fd73940f0d860d791dc2ad557faaefcbb3edbd2b349/pydantic_core-2.27.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:f4e5658dbffe8843a0f12366a4c2d1c316dbe09bb4dfbdc9d2d9cd6031de8aae", size = 2154735 }, + { url = "https://files.pythonhosted.org/packages/52/e5/4f0fbd5c5995cc70d3afed1b5c754055bb67908f55b5cb8000f7112749bf/pydantic_core-2.27.1-cp312-none-win32.whl", hash = "sha256:672ebbe820bb37988c4d136eca2652ee114992d5d41c7e4858cdd90ea94ffe5c", size = 1833633 }, + { url = "https://files.pythonhosted.org/packages/ee/f2/c61486eee27cae5ac781305658779b4a6b45f9cc9d02c90cb21b940e82cc/pydantic_core-2.27.1-cp312-none-win_amd64.whl", hash = "sha256:66ff044fd0bb1768688aecbe28b6190f6e799349221fb0de0e6f4048eca14c16", size = 1986973 }, + { url = "https://files.pythonhosted.org/packages/df/a6/e3f12ff25f250b02f7c51be89a294689d175ac76e1096c32bf278f29ca1e/pydantic_core-2.27.1-cp312-none-win_arm64.whl", hash = "sha256:9a3b0793b1bbfd4146304e23d90045f2a9b5fd5823aa682665fbdaf2a6c28f3e", size = 1883215 }, + { url = "https://files.pythonhosted.org/packages/0f/d6/91cb99a3c59d7b072bded9959fbeab0a9613d5a4935773c0801f1764c156/pydantic_core-2.27.1-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:f216dbce0e60e4d03e0c4353c7023b202d95cbaeff12e5fd2e82ea0a66905073", size = 1895033 }, + { url = "https://files.pythonhosted.org/packages/07/42/d35033f81a28b27dedcade9e967e8a40981a765795c9ebae2045bcef05d3/pydantic_core-2.27.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a2e02889071850bbfd36b56fd6bc98945e23670773bc7a76657e90e6b6603c08", size = 1807542 }, + { url = "https://files.pythonhosted.org/packages/41/c2/491b59e222ec7e72236e512108ecad532c7f4391a14e971c963f624f7569/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42b0e23f119b2b456d07ca91b307ae167cc3f6c846a7b169fca5326e32fdc6cf", size = 1827854 }, + { url = "https://files.pythonhosted.org/packages/e3/f3/363652651779113189cefdbbb619b7b07b7a67ebb6840325117cc8cc3460/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:764be71193f87d460a03f1f7385a82e226639732214b402f9aa61f0d025f0737", size = 1857389 }, + { url = "https://files.pythonhosted.org/packages/5f/97/be804aed6b479af5a945daec7538d8bf358d668bdadde4c7888a2506bdfb/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1c00666a3bd2f84920a4e94434f5974d7bbc57e461318d6bb34ce9cdbbc1f6b2", size = 2037934 }, + { url = "https://files.pythonhosted.org/packages/42/01/295f0bd4abf58902917e342ddfe5f76cf66ffabfc57c2e23c7681a1a1197/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3ccaa88b24eebc0f849ce0a4d09e8a408ec5a94afff395eb69baf868f5183107", size = 2735176 }, + { url = 
"https://files.pythonhosted.org/packages/9d/a0/cd8e9c940ead89cc37812a1a9f310fef59ba2f0b22b4e417d84ab09fa970/pydantic_core-2.27.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c65af9088ac534313e1963443d0ec360bb2b9cba6c2909478d22c2e363d98a51", size = 2160720 }, + { url = "https://files.pythonhosted.org/packages/73/ae/9d0980e286627e0aeca4c352a60bd760331622c12d576e5ea4441ac7e15e/pydantic_core-2.27.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:206b5cf6f0c513baffaeae7bd817717140770c74528f3e4c3e1cec7871ddd61a", size = 1992972 }, + { url = "https://files.pythonhosted.org/packages/bf/ba/ae4480bc0292d54b85cfb954e9d6bd226982949f8316338677d56541b85f/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:062f60e512fc7fff8b8a9d680ff0ddaaef0193dba9fa83e679c0c5f5fbd018bc", size = 2001477 }, + { url = "https://files.pythonhosted.org/packages/55/b7/e26adf48c2f943092ce54ae14c3c08d0d221ad34ce80b18a50de8ed2cba8/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_armv7l.whl", hash = "sha256:a0697803ed7d4af5e4c1adf1670af078f8fcab7a86350e969f454daf598c4960", size = 2091186 }, + { url = "https://files.pythonhosted.org/packages/ba/cc/8491fff5b608b3862eb36e7d29d36a1af1c945463ca4c5040bf46cc73f40/pydantic_core-2.27.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:58ca98a950171f3151c603aeea9303ef6c235f692fe555e883591103da709b23", size = 2154429 }, + { url = "https://files.pythonhosted.org/packages/78/d8/c080592d80edd3441ab7f88f865f51dae94a157fc64283c680e9f32cf6da/pydantic_core-2.27.1-cp313-none-win32.whl", hash = "sha256:8065914ff79f7eab1599bd80406681f0ad08f8e47c880f17b416c9f8f7a26d05", size = 1833713 }, + { url = "https://files.pythonhosted.org/packages/83/84/5ab82a9ee2538ac95a66e51f6838d6aba6e0a03a42aa185ad2fe404a4e8f/pydantic_core-2.27.1-cp313-none-win_amd64.whl", hash = "sha256:ba630d5e3db74c79300d9a5bdaaf6200172b107f263c98a0539eeecb857b2337", size = 1987897 }, + { url = "https://files.pythonhosted.org/packages/df/c3/b15fb833926d91d982fde29c0624c9f225da743c7af801dace0d4e187e71/pydantic_core-2.27.1-cp313-none-win_arm64.whl", hash = "sha256:45cf8588c066860b623cd11c4ba687f8d7175d5f7ef65f7129df8a394c502de5", size = 1882983 }, ] [[package]] From 9fde1e929a935d35f1d463924d4eeec9960fcf4e Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 16:22:03 +0000 Subject: [PATCH 03/14] Added hashing method to column names --- src/matchbox/common/db.py | 43 ++++++++++++++--------- src/matchbox/server/postgresql/adapter.py | 1 - src/matchbox/server/postgresql/orm.py | 3 +- 3 files changed, 28 insertions(+), 19 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 033b0c6c..c810b380 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -6,12 +6,7 @@ from matchbox.common.hash import HASH_FUNC from pandas import DataFrame from pyarrow import Table as ArrowTable -from pydantic import ( - BaseModel, - ConfigDict, - Field, - model_validator, -) +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from sqlalchemy import ( LABEL_STYLE_TABLENAME_PLUS_COL, ColumnElement, @@ -121,13 +116,26 @@ def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehou return warehouse +class SourceColumnName(BaseModel): + """A column name in the Matchbox database.""" + + name: str + + @property + def hash(self) -> bytes: + """Generate a unique hash based on the column name.""" + return HASH_FUNC(self.name.encode("utf-8")).digest() + + class SourceColumn(BaseModel): 
"""A column in a dataset that can be indexed in the Matchbox database.""" model_config = ConfigDict(arbitrary_types_allowed=True) - literal: str = Field(description="The literal name of the column in the database.") - alias: str = Field( + literal: SourceColumnName = Field( + description="The literal name of the column in the database." + ) + alias: SourceColumnName = Field( default_factory=lambda data: data["literal"], description="The alias to use when hashing the dataset in Matchbox.", ) @@ -156,26 +164,27 @@ def __eq__(self, other: object) -> bool: Returns: bool: True if the objects are considered equal, False otherwise """ - literal_hash = HASH_FUNC(str(self.literal).encode("utf-8")).digest() - alias_hash = HASH_FUNC(str(self.alias).encode("utf-8")).digest() - if isinstance(other, SourceColumn): if self.literal == other.literal or self.alias == other.alias: return True - other_literal_hash = HASH_FUNC(str(other.literal).encode("utf-8")).digest() - other_alias_hash = HASH_FUNC(str(other.alias).encode("utf-8")).digest() - - self_hashes = {literal_hash, alias_hash} - other_hashes = {other_literal_hash, other_alias_hash} + self_hashes = {self.literal.hash, self.alias.hash} + other_hashes = {other.literal.hash, other.alias.hash} return bool(self_hashes & other_hashes) if isinstance(other, bytes): - return other in {literal_hash, alias_hash} + return other in {self.literal.hash, self.alias.hash} return NotImplemented + @field_validator("literal", "alias", mode="before") + def string_to_name(cls: "SourceColumn", value: str) -> SourceColumnName: + if isinstance(value, str): + return SourceColumnName(name=value) + else: + raise ValueError("Column name must be a string.") + class SourceIndex(BaseModel): """The hashes of column names in the Matchbox database.""" diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 3a80da61..54dbe451 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -358,7 +358,6 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, - index=None, database=SourceWarehouse.from_engine(engine), ) else: diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py index 8f1c6674..fa9a61fa 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -9,7 +9,7 @@ ForeignKey, select, ) -from sqlalchemy.dialects.postgresql import ARRAY, BYTEA +from sqlalchemy.dialects.postgresql import ARRAY, BYTEA, JSONB from sqlalchemy.orm import Session, relationship from matchbox.server.postgresql.db import MBDB @@ -155,6 +155,7 @@ class Sources(CountMixin, MBDB.MatchboxBase): schema = Column(VARCHAR, nullable=False) table = Column(VARCHAR, nullable=False) id = Column(VARCHAR, nullable=False) + indices = Column(JSONB, nullable=False) # Relationships dataset_model = relationship("Models", back_populates="source") From bf2afc084477fdd9dea3790652b6c76358b4f298 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 17:05:53 +0000 Subject: [PATCH 04/14] Honoured ordering of columns in settings to give total control of hash creation --- src/matchbox/admin.py | 2 +- src/matchbox/common/db.py | 32 +++++++++++++------ src/matchbox/server/postgresql/orm.py | 1 + .../server/postgresql/utils/insert.py | 5 +++ 4 files changed, 30 insertions(+), 10 deletions(-) diff --git a/src/matchbox/admin.py b/src/matchbox/admin.py index 
a2067594..585ebc1c 100644 --- a/src/matchbox/admin.py +++ b/src/matchbox/admin.py @@ -32,7 +32,7 @@ def load_datasets_from_config(datasets: Path) -> dict[str, Source]: for dataset_name, dataset_config in config["datasets"].items(): warehouse_alias = dataset_config.get("database") dataset_config["database"] = warehouses[warehouse_alias] - datasets[dataset_name] = Source(**dataset_config) + datasets[dataset_name] = Source(alias=dataset_name, **dataset_config) return datasets diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index c810b380..4ded824b 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -201,6 +201,9 @@ class Source(BaseModel): ) database: SourceWarehouse + alias: str = Field( + default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" + ) db_pk: str db_schema: str db_table: str @@ -235,8 +238,8 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": # Column logic # Get all locally specified columns, or remotely specified hashes local_columns: list[SourceColumn] = [] + star_index: int | None = None local_hashes: SourceIndex | None = None - select_all = False if isinstance(data["index"], dict): # Came from Matchbox database local_hashes = SourceIndex(**data["index"]) @@ -244,18 +247,20 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": # Came from TOML for column in data["index"]: if column["literal"] == "*": - select_all = True + star_index = len(local_columns) continue local_columns.append(SourceColumn(**column, indexed=True)) - continue # Get all remote columns using the user's creds and merge with local spec remote_columns = [ - SourceColumn(literal=col.name, type=str(col.type), indexed=select_all) + SourceColumn(literal=col.name, type=str(col.type), indexed=False) for col in table.columns if col.name not in data["db_pk"] ] db_columns: list[SourceColumn] = [] + db_indexed_columns: list[SourceColumn] = [] + db_non_indexed_columns: list[SourceColumn] = [] + for remote_column in remote_columns: if local_columns: # Came from TOML, index and alias are configured from TOML @@ -263,16 +268,26 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": if remote_column == local_column: if local_column.type is None: local_column.type = remote_column.type - db_columns.append(local_column) + db_indexed_columns.append(local_column) break else: - db_columns.append(remote_column) + db_non_indexed_columns.append(remote_column) elif local_hashes: # Came from database, index is true when hashes match if remote_column in local_hashes.literal + local_hashes.alias: remote_column.indexed = True db_columns.append(remote_column) + if local_columns: + # Concatenate with TOML order, honouring star location (if present) + if star_index: + db_columns = db_indexed_columns + for c in db_non_indexed_columns: + c.indexed = True + db_columns.insert(star_index, c) + else: + db_columns = db_indexed_columns + db_non_indexed_columns + data["db_columns"] = db_columns return data @@ -320,9 +335,8 @@ def _get_column(col_name: str) -> ColumnElement: def to_hash(self) -> bytes: """Generate a unique hash based on the table's columns and datatypes.""" - table = self.to_table() - schema_representation = f"{str(self)}: " + ",".join( - f"{col.name}:{str(col.type)}" for col in table.columns + schema_representation = f"{self.alias}: " + ",".join( + f"{col.alias.name}:{col.type}" for col in self.db_columns if col.indexed ) return HASH_FUNC(schema_representation.encode("utf-8")).digest() diff --git a/src/matchbox/server/postgresql/orm.py 
b/src/matchbox/server/postgresql/orm.py index fa9a61fa..fc54871e 100644 --- a/src/matchbox/server/postgresql/orm.py +++ b/src/matchbox/server/postgresql/orm.py @@ -152,6 +152,7 @@ class Sources(CountMixin, MBDB.MatchboxBase): model = Column( BYTEA, ForeignKey("models.hash", ondelete="CASCADE"), primary_key=True ) + alias = Column(VARCHAR, nullable=False, unique=True) schema = Column(VARCHAR, nullable=False) table = Column(VARCHAR, nullable=False) id = Column(VARCHAR, nullable=False) diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index c58abd84..87eb1410 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -47,9 +47,14 @@ def insert_dataset(dataset: Source, engine: Engine, batch_size: int) -> None: source_data = { "model": model_hash, + "alias": dataset.alias, "schema": dataset.db_schema, "table": dataset.db_table, "id": dataset.db_pk, + "indices": { + "literal": [c.literal.hash for c in dataset.db_columns if c.indexed], + "alias": [c.alias.hash for c in dataset.db_columns if c.indexed], + }, } clusters = dataset_to_hashlist(dataset=dataset, model_hash=model_hash) From 98bd13d6cfad03c47fd364deb3e4bba962f8c420 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 17:52:48 +0000 Subject: [PATCH 05/14] Saving and loading hashes from the database successfully --- src/matchbox/common/db.py | 25 ++++++++++++++----- src/matchbox/common/hash.py | 10 +++----- src/matchbox/server/postgresql/adapter.py | 8 ++++++ .../server/postgresql/utils/insert.py | 4 +-- 4 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 4ded824b..23f608b4 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -1,3 +1,4 @@ +import base64 from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload import connectorx as cx @@ -126,6 +127,11 @@ def hash(self) -> bytes: """Generate a unique hash based on the column name.""" return HASH_FUNC(self.name.encode("utf-8")).digest() + @property + def base64(self) -> str: + """Generate a base64 encoded hash based on the column name.""" + return base64.b64encode(self.hash).decode("utf-8") + class SourceColumn(BaseModel): """A column in a dataset that can be indexed in the Matchbox database.""" @@ -201,13 +207,13 @@ class Source(BaseModel): ) database: SourceWarehouse - alias: str = Field( - default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" - ) db_pk: str db_schema: str db_table: str db_columns: list[SourceColumn] + alias: str = Field( + default_factory=lambda data: f"{data['db_schema']}.{data['db_table']}" + ) def __str__(self) -> str: return f"{self.db_schema}.{self.db_table}" @@ -240,12 +246,13 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": local_columns: list[SourceColumn] = [] star_index: int | None = None local_hashes: SourceIndex | None = None - if isinstance(data["index"], dict): + index_data: dict | list | None = data.get("index") + if isinstance(index_data, dict): # Came from Matchbox database - local_hashes = SourceIndex(**data["index"]) + local_hashes = SourceIndex(**index_data) else: # Came from TOML - for column in data["index"]: + for column in index_data or []: if column["literal"] == "*": star_index = len(local_columns) continue @@ -288,6 +295,12 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": else: db_columns = db_indexed_columns + db_non_indexed_columns + if not local_columns and not 
local_hashes: + # No columns specified, index all columns except the primary key + for col in remote_columns: + col.indexed = True + db_columns = remote_columns + data["db_columns"] = db_columns return data diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py index 4ce9c072..c9440d21 100644 --- a/src/matchbox/common/hash.py +++ b/src/matchbox/common/hash.py @@ -21,16 +21,14 @@ def dataset_to_hashlist(dataset: Source, model_hash: bytes) -> list[dict[str, An """Retrieve and hash a dataset from its warehouse, ready to be inserted.""" with Session(dataset.database.engine) as warehouse_session: source_table = dataset.to_table() - - # Exclude the primary key from the columns to be hashed - cols = tuple( - [col for col in list(source_table.c.keys()) if col != dataset.db_pk] + cols_to_index = tuple( + [col.literal.name for col in dataset.db_columns if col.indexed] ) slct_stmt = select( - func.concat(*source_table.c[cols]).label("raw"), + func.concat(*source_table.c[cols_to_index]).label("raw"), func.array_agg(source_table.c[dataset.db_pk].cast(String)).label("id"), - ).group_by(*source_table.c[cols]) + ).group_by(*source_table.c[cols_to_index]) raw_result = warehouse_session.execute(slct_stmt) diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py index 54dbe451..6b906307 100644 --- a/src/matchbox/server/postgresql/adapter.py +++ b/src/matchbox/server/postgresql/adapter.py @@ -1,3 +1,4 @@ +import base64 from typing import TYPE_CHECKING, Any, Literal from pydantic import BaseModel @@ -354,10 +355,17 @@ def get_dataset(self, db_schema: str, db_table: str, engine: Engine) -> Source: .first() ) if dataset: + dataset_indices: dict[str, list[bytes]] = {} + for index_type, index_b64_list in dataset.indices.items(): + dataset_indices[index_type] = [ + base64.b64decode(b64.encode("utf-8")) for b64 in index_b64_list + ] return Source( + alias=dataset.alias, db_schema=dataset.schema, db_table=dataset.table, db_pk=dataset.id, + db_columns=dataset_indices, database=SourceWarehouse.from_engine(engine), ) else: diff --git a/src/matchbox/server/postgresql/utils/insert.py b/src/matchbox/server/postgresql/utils/insert.py index 87eb1410..99b841c1 100644 --- a/src/matchbox/server/postgresql/utils/insert.py +++ b/src/matchbox/server/postgresql/utils/insert.py @@ -52,8 +52,8 @@ def insert_dataset(dataset: Source, engine: Engine, batch_size: int) -> None: "table": dataset.db_table, "id": dataset.db_pk, "indices": { - "literal": [c.literal.hash for c in dataset.db_columns if c.indexed], - "alias": [c.alias.hash for c in dataset.db_columns if c.indexed], + "literal": [c.literal.base64 for c in dataset.db_columns if c.indexed], + "alias": [c.alias.base64 for c in dataset.db_columns if c.indexed], }, } From 4b13c182954c0a250043f14e846d25d1bba9ffdc Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 18:03:20 +0000 Subject: [PATCH 06/14] More robust checking of Source insertion and retrieval --- test/server/test_adapter.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py index c369691e..2e650c65 100644 --- a/test/server/test_adapter.py +++ b/test/server/test_adapter.py @@ -3,7 +3,7 @@ import pytest import rustworkx as rx from dotenv import find_dotenv, load_dotenv -from matchbox.common.db import Source +from matchbox.common.db import Source, SourceColumn from matchbox.common.exceptions import ( MatchboxDataError, MatchboxDatasetError, @@ -114,9 +114,24 @@ def
test_get_dataset(self): crn = self.warehouse_data[0] - self.backend.get_dataset( + crn_retrieved = self.backend.get_dataset( db_schema=crn.db_schema, db_table=crn.db_table, engine=crn.database.engine ) + + assert crn.db_columns == crn_retrieved.db_columns + + cols: dict[str, list[SourceColumn]] = {} + + # Indexing isn't used in the custom equality check + for col in crn.db_columns + crn_retrieved.db_columns: + if col.literal.name not in cols: + cols[col.literal.name] = [col] + else: + cols[col.literal.name].append(col) + + for c1, c2 in cols.values(): + assert c1.indexed == c2.indexed + with pytest.raises(MatchboxDatasetError): self.backend.get_dataset( db_schema="nonexistent", From 248dda73ef5102450043f14e846d25d1bba9ffdc Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Fri, 6 Dec 2024 18:22:32 +0000 Subject: [PATCH 07/14] Added warning to query function --- src/matchbox/server/postgresql/utils/query.py | 12 ++++++++++++ test/server/test_adapter.py | 19 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/matchbox/server/postgresql/utils/query.py b/src/matchbox/server/postgresql/utils/query.py index ff492bf5..dbc235a2 100644 --- a/src/matchbox/server/postgresql/utils/query.py +++ b/src/matchbox/server/postgresql/utils/query.py @@ -1,4 +1,5 @@ import logging +import warnings from typing import TYPE_CHECKING, Any, Literal, TypeVar import pyarrow as pa @@ -290,6 +291,17 @@ def query( db_schema=source.db_schema, db_table=source.db_table ) + # Warn if non-indexed fields have been requested + not_indexed = set(fields) - set( + c.literal.name for c in source.db_columns if c.indexed + ) + if not_indexed: + warnings.warn( + "Found non-indexed fields. Do not use these fields in match jobs: " + f"{', '.join(sorted(not_indexed))}", + stacklevel=2, + ) + hash_query = _resolve_cluster_hierarchy( dataset_hash=dataset.hash, model=truth_model if truth_model else dataset, diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py index 2e650c65..5002647a 100644 --- a/test/server/test_adapter.py +++ b/test/server/test_adapter.py @@ -413,6 +413,25 @@ def count_deduplicates(df: DataFrame) -> int: assert self.backend.data.count() == unique + def test_query_warning(self): + """Tests that querying non-indexed fields warns the user.""" + self.setup_database("index") + + crn = self.warehouse_data[0] + select_crn = selector( + table=str(crn), + fields=["id", "crn"], + engine=crn.database.engine, + ) + with pytest.warns(Warning): + query( + selector=select_crn, + backend=self.backend, + model=None, + return_type="pandas", + limit=10, + ) + def test_query_single_table(self): """Test querying data from the database.""" self.setup_database("index") From c34f10d469e624d1213f2a5ae87195b0345587c7 Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Sun, 8 Dec 2024 12:16:23 +0000 Subject: [PATCH 08/14] Added unit test to show TOML reading works as expected --- pyproject.toml | 1 + sample.datasets.toml | 25 ++++---- src/matchbox/common/db.py | 29 ++++++--- test/client/test_admin.py | 126 ++++++++++++++++++++++++++++++++++++++ uv.lock | 11 ++++ 5 files changed, 172 insertions(+), 20 deletions(-) create mode 100644 test/client/test_admin.py diff --git a/pyproject.toml b/pyproject.toml index 08063a37..f4d33280 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -38,6 +38,7 @@ dev = [ "pytest-env>=1.1.5", "ruff>=0.6.8", "docker>=7.1.0", + "tomli-w>=1.1.0", ] typing = [ "polars>=1.11.0", diff --git a/sample.datasets.toml b/sample.datasets.toml index 343e5a2e..dd41a62a 100644 --- a/sample.datasets.toml +++
b/sample.datasets.toml @@ -12,9 +12,10 @@ db_schema = "companieshouse" db_table = "companies" db_pk = "id" index = [ - { literal = "crn", type = "text" }, - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" } + { literal = "crn", alias = "crn_id", type = "VARCHAR" }, + { literal = "company_name", alias = "name" }, + { literal = "*" }, + { literal = "postcode" } ] [datasets.data_hub_companies] @@ -23,9 +24,9 @@ db_schema = "dbt" db_table = "data_hub__companies" db_pk = "id" index = [ - { literal = "cdms", type = "text" }, - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, + { literal = "cdms", alias = "cdms_id", type = "VARCHAR" }, + { literal = "company_name", alias = "name" }, + { literal = "postcode" }, { literal = "*" } ] @@ -35,8 +36,8 @@ db_schema = "hmrc" db_table = "trade__exporters" db_pk = "id" index = [ - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, + { literal = "company_name", alias = "name" }, + { literal = "postcode" }, ] [datasets.export_wins] @@ -45,8 +46,8 @@ db_schema = "dbt" db_table = "export_wins__wins_dataset" db_pk = "id" index = [ - { literal = "company_name", type = "text" }, - { literal = "postcode", type = "text" }, - { literal = "cdms", type = "text" }, - { literal = "data_hub_company_id", alias = "dh_id", type = "text" }, + { literal = "company_name" }, + { literal = "postcode" }, + { literal = "cdms", alias = "cdms_id", type = "VARCHAR" }, + { literal = "data_hub_company_id", alias = "dh_id", type = "VARCHAR" }, ] diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index 23f608b4..ee864eb4 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -84,6 +84,25 @@ def engine(self) -> Engine: self.test_connection() return self._engine + def __str__(self): + return ( + f"SourceWarehouse(alias={self.alias}, type={self.db_type}, " + f"host={self.host}, port={self.port}, database={self.database})" + ) + + def __eq__(self, other): + if not isinstance(other, SourceWarehouse): + return False + return ( + self.alias == other.alias + and self.db_type == other.db_type + and self.user == other.user + and self.password == other.password + and self.host == other.host + and self.port == other.port + and self.database == other.database + ) + def test_connection(self): try: with self.engine.connect() as connection: @@ -92,12 +111,6 @@ def test_connection(self): self._engine = None raise - def __str__(self): - return ( - f"SourceWarehouse(alias={self.alias}, type={self.db_type}, " - f"host={self.host}, port={self.port}, database={self.database})" - ) - @classmethod def from_engine(cls, engine: Engine, alias: str | None = None) -> "SourceWarehouse": """Create a SourceWarehouse instance from an SQLAlchemy Engine object.""" @@ -142,7 +155,7 @@ class SourceColumn(BaseModel): description="The literal name of the column in the database." 
) alias: SourceColumnName = Field( - default_factory=lambda data: data["literal"], + default_factory=lambda data: SourceColumnName(name=data["literal"].name), description="The alias to use when hashing the dataset in Matchbox.", ) type: str | None = Field( @@ -287,7 +300,7 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source": if local_columns: # Concatenate with TOML order, honouring star location (if present) - if star_index: + if star_index is not None: db_columns = db_indexed_columns for c in db_non_indexed_columns: c.indexed = True diff --git a/test/client/test_admin.py b/test/client/test_admin.py new file mode 100644 index 00000000..e4a69ede --- /dev/null +++ b/test/client/test_admin.py @@ -0,0 +1,126 @@ +import re +from pathlib import Path +from tempfile import NamedTemporaryFile +from textwrap import dedent + +import pytest +from matchbox.admin import load_datasets_from_config +from matchbox.common.db import Source, SourceWarehouse +from tomli_w import dumps + + +def warehouse_toml(warehouse: SourceWarehouse) -> str: + return dedent(f""" + [warehouses.{warehouse.alias}] + db_type = "{warehouse.db_type}" + user = "{warehouse.user}" + password = "{warehouse.password}" + host = "{warehouse.host}" + port = {warehouse.port} + database = "{warehouse.database}" + """).strip() + + +def source_toml(source: Source, index: list[dict[str, str]]) -> str: + index_str = dumps({"index": index}).replace("\n", "\n ") + return dedent(f""" + [datasets.{re.sub(r"[^a-zA-Z0-9]", "", source.alias)}] + database = "test_warehouse" + db_schema = "{source.db_schema}" + db_table = "{source.db_table}" + db_pk = "{source.db_pk}" + {index_str} + """) + + +@pytest.mark.parametrize( + "index", + ( + [ + {"literal": "company_name"}, + {"literal": "crn"}, + ], + [{"literal": "company_name", "type": "VARCHAR", "alias": "name"}], + [ + {"literal": "company_name"}, + {"literal": "*"}, + ], + [ + {"literal": "*"}, + {"literal": "company_name"}, + ], + ), + ids=["vanilla", "alias_and_type", "star_end", "star_start"], +) +def test_load_datasets_from_config( + index: list[dict[str, str]], + warehouse: SourceWarehouse, + warehouse_data: list[Source], +): + """Tests loading datasets from a TOML file.""" + # Construct TOML from CRN data + # Columns: "id", "company_name", "crn" + crn = warehouse_data[0] + raw_toml = dedent(f""" + {warehouse_toml(warehouse)} + {source_toml(crn, index)} + """).strip() + + with NamedTemporaryFile(suffix=".toml", delete=False) as temp_file: + temp_file.write(raw_toml.encode()) + temp_file.flush() + temp_file_path = Path(temp_file.name) + + # Ingest + config = load_datasets_from_config(temp_file_path) + + # Helper variables + source = config.get(re.sub(r"[^a-zA-Z0-9]", "", crn.alias)) + named = [idx["literal"] for idx in index if idx["literal"] != "*"] + has_star = any(idx["literal"] == "*" for idx in index) + star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None) + col_names = [col.literal.name for col in source.db_columns] + + # Test 1: Core attributes match + assert source.database == warehouse + assert source.alias == re.sub(r"[^a-zA-Z0-9]", "", crn.alias) + assert source.db_schema == crn.db_schema + assert source.db_table == crn.db_table + assert source.db_pk == crn.db_pk + + # Test 2: All non-pk columns present + assert set(col_names) == {"company_name", "crn", "id"} - {source.db_pk} + + # Test 3: Column indexing + for col in source.db_columns: + assert col.indexed == (has_star or col.literal.name in named) + + # Test 4: Aliases and types match + for idx in 
index: + if idx["literal"] == "*": + continue + col = next(c for c in source.db_columns if c.literal.name == idx["literal"]) + assert col.alias.name == idx.get("alias", idx["literal"]) + assert col.type == idx.get("type", col.type) + + # Test 5: Column ordering + if star_pos is None: + for i, name in enumerate(named): + assert col_names[i] == name + else: + for i, idx in enumerate(index): + if idx["literal"] != "*": + if i < star_pos: + assert col_names[i] == idx["literal"] + else: + star_col_count = len(col_names) - len(index) + 1 + assert col_names[i + star_col_count - 1] == idx["literal"] + + # Test 6: column equalities + + assert source.db_columns[0] != source.db_columns[1] + assert source.db_columns[0] == source.db_columns[0] + assert source.db_columns[1].literal.hash == source.db_columns[1] + assert source.db_columns[1].alias.hash == source.db_columns[1] + assert source.db_columns[0].literal.hash != source.db_columns[1] + assert source.db_columns[0].alias.hash != source.db_columns[1] diff --git a/uv.lock b/uv.lock index 8e9e11a1..7934496f 100644 --- a/uv.lock +++ b/uv.lock @@ -943,6 +943,7 @@ dev = [ { name = "pytest-cov" }, { name = "pytest-env" }, { name = "ruff" }, + { name = "tomli-w" }, ] typing = [ { name = "polars" }, @@ -977,6 +978,7 @@ dev = [ { name = "pytest-cov", specifier = ">=5.0.0" }, { name = "pytest-env", specifier = ">=1.1.5" }, { name = "ruff", specifier = ">=0.6.8" }, + { name = "tomli-w", specifier = ">=1.1.0" }, ] typing = [{ name = "polars", specifier = ">=1.11.0" }] @@ -1945,6 +1947,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/97/75/10a9ebee3fd790d20926a90a2547f0bf78f371b2f13aa822c759680ca7b9/tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc", size = 12757 }, ] +[[package]] +name = "tomli-w" +version = "1.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/19/b65f1a088ee23e37cdea415b357843eca8b1422a7b11a9eee6e35d4ec273/tomli_w-1.1.0.tar.gz", hash = "sha256:49e847a3a304d516a169a601184932ef0f6b61623fe680f836a2aa7128ed0d33", size = 6929 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c4/ac/ce90573ba446a9bbe65838ded066a805234d159b4446ae9f8ec5bbd36cbd/tomli_w-1.1.0-py3-none-any.whl", hash = "sha256:1403179c78193e3184bfaade390ddbd071cba48a32a2e62ba11aae47490c63f7", size = 6440 }, +] + [[package]] name = "tornado" version = "6.4.1" From 67cdc171d3b7e8c8a8ccaa6ca8a2fccd92f9054c Mon Sep 17 00:00:00 2001 From: Will Langdale Date: Sun, 8 Dec 2024 12:22:56 +0000 Subject: [PATCH 09/14] Refactored hash_columns with cleaner logic --- src/matchbox/common/db.py | 133 ++++++++++++++++++++------------------ 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py index ee864eb4..5d16bd5f 100644 --- a/src/matchbox/common/db.py +++ b/src/matchbox/common/db.py @@ -241,80 +241,87 @@ def __hash__(self) -> int: def hash_columns(cls, data: dict[str, Any]) -> "Source": """Shapes indices data from either the backend or TOML. - Handles: - * From TOML, no columns specified - * From TOML, some or all columns specified - * From the database, indices already present + Handles three scenarios: + 1. No columns specified - all columns except primary key are indexed + 2. Columns specified in TOML - uses specified columns with optional '*' + 3. 
Indices from database - uses existing column hash information """ - # Database setup - if isinstance(data["database"], SourceWarehouse): - warehouse = data["database"] - else: - warehouse = SourceWarehouse(**data["database"]) + # Initialise warehouse and get table metadata + warehouse = ( + data["database"] + if isinstance(data["database"], SourceWarehouse) + else SourceWarehouse(**data["database"]) + ) + metadata = MetaData(schema=data["db_schema"]) table = Table(data["db_table"], metadata, autoload_with=warehouse.engine) - # Column logic - # Get all locally specified columns, or remotely specified hashes - local_columns: list[SourceColumn] = [] - star_index: int | None = None - local_hashes: SourceIndex | None = None - index_data: dict | list | None = data.get("index") - if isinstance(index_data, dict): - # Came from Matchbox database - local_hashes = SourceIndex(**index_data) - else: - # Came from TOML - for column in index_data or []: - if column["literal"] == "*": - star_index = len(local_columns) - continue - local_columns.append(SourceColumn(**column, indexed=True)) - - # Get all remote columns using the user's creds and merge with local spec + # Get all columns except primary key remote_columns = [ SourceColumn(literal=col.name, type=str(col.type), indexed=False) for col in table.columns if col.name not in data["db_pk"] ] - db_columns: list[SourceColumn] = [] - db_indexed_columns: list[SourceColumn] = [] - db_non_indexed_columns: list[SourceColumn] = [] - - for remote_column in remote_columns: - if local_columns: - # Came from TOML, index and alias are configured from TOML - for local_column in local_columns: - if remote_column == local_column: - if local_column.type is None: - local_column.type = remote_column.type - db_indexed_columns.append(local_column) - break - else: - db_non_indexed_columns.append(remote_column) - elif local_hashes: - # Came from database, index is true when hashes match - if remote_column in local_hashes.literal + local_hashes.alias: - remote_column.indexed = True - db_columns.append(remote_column) - - if local_columns: - # Concatenate with TOML order, honouring star location (if present) - if star_index is not None: - db_columns = db_indexed_columns - for c in db_non_indexed_columns: - c.indexed = True - db_columns.insert(star_index, c) - else: - db_columns = db_indexed_columns + db_non_indexed_columns - - if not local_columns and not local_hashes: - # No columns specified, index all columns except the primary key - for col in remote_columns: + + index_data = data.get("index") + + # Case 1: No columns specified - index everything + if not index_data: + data["db_columns"] = [ + SourceColumn(literal=col.literal.name, type=col.type, indexed=True) + for col in remote_columns + ] + return data + + # Case 2: Columns from database + if isinstance(index_data, dict): + source_index = SourceIndex(**index_data) + data["db_columns"] = [ + SourceColumn( + literal=col.literal.name, + type=col.type, + indexed=col in source_index.literal + source_index.alias, + ) + for col in remote_columns + ] + return data + + # Case 3: Columns from TOML + local_columns = [] + star_index = None + + # Process TOML column specifications + for i, column in enumerate(index_data): + if column["literal"] == "*": + star_index = i + continue + local_columns.append(SourceColumn(**column, indexed=True)) + + # Match remote columns with local specifications + indexed_columns = [] + non_indexed_columns = [] + + for remote_col in remote_columns: + matched = False + for local_col in local_columns: + 
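+                # SourceColumn equality is customised: it ignores "indexed" and
+                # matches on literal or alias names and their hashes, as the
+                # equality asserts in test_admin.py exercise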
+                if remote_col == local_col:
+                    if local_col.type is None:
+                        local_col.type = remote_col.type
+                    indexed_columns.append(local_col)
+                    matched = True
+                    break
+            if not matched:
+                non_indexed_columns.append(remote_col)
+
+        # Handle wildcard insertion
+        if star_index is not None:
+            for col in non_indexed_columns:
                 col.indexed = True
+            indexed_columns[star_index:star_index] = non_indexed_columns
+            data["db_columns"] = indexed_columns
+        else:
+            data["db_columns"] = indexed_columns + non_indexed_columns
-        data["db_columns"] = db_columns
 
         return data

From 28c01f7a7fffdce998c10954342c14004a6fc0aa Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Wed, 11 Dec 2024 15:29:12 +0000
Subject: [PATCH 10/14] Dealt with Leo's comments

---
 src/matchbox/common/db.py             | 33 +++++++++----------
 src/matchbox/server/postgresql/orm.py |  8 ++++++-
 test/client/test_admin.py             |  9 ++++----
 test/server/test_adapter.py           |  8 +++----
 4 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index 5d16bd5f..02fff10b 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -7,7 +7,14 @@
 from matchbox.common.hash import HASH_FUNC
 from pandas import DataFrame
 from pyarrow import Table as ArrowTable
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
+from pydantic import (
+    BaseModel,
+    ConfigDict,
+    Field,
+    SecretStr,
+    field_validator,
+    model_validator,
+)
 from sqlalchemy import (
     LABEL_STYLE_TABLENAME_PLUS_COL,
     ColumnElement,
@@ -70,7 +77,7 @@ class SourceWarehouse(BaseModel):
     alias: str
     db_type: str
     user: str
-    password: str = Field(repr=False)
+    password: SecretStr
     host: str
     port: int
     database: str
@@ -79,17 +86,11 @@ class SourceWarehouse(BaseModel):
     @property
     def engine(self) -> Engine:
         if self._engine is None:
-            connection_string = f"{self.db_type}://{self.user}:{self.password}@{self.host}:{self.port}/{self.database}"
+            connection_string = f"{self.db_type}://{self.user}:{self.password.get_secret_value()}@{self.host}:{self.port}/{self.database}"
             self._engine = create_engine(connection_string)
             self.test_connection()
         return self._engine
 
-    def __str__(self):
-        return (
-            f"SourceWarehouse(alias={self.alias}, type={self.db_type}, "
-            f"host={self.host}, port={self.port}, database={self.database})"
-        )
-
     def __eq__(self, other):
         if not isinstance(other, SourceWarehouse):
             return False
@@ -205,13 +206,6 @@ def string_to_name(cls: "SourceColumn", value: str) -> SourceColumnName:
         raise ValueError("Column name must be a string.")
 
 
-class SourceIndex(BaseModel):
-    """The hashes of column names in the Matchbox database."""
-
-    literal: list[bytes]
-    alias: list[bytes]
-
-
 class Source(BaseModel):
     """A dataset that can be indexed in the Matchbox database."""
 
@@ -243,8 +237,8 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         Handles three scenarios:
         1. No columns specified - all columns except primary key are indexed
-        2. Columns specified in TOML - uses specified columns with optional '*'
-        3. Indices from database - uses existing column hash information
+        2. Indices from database - uses existing column hash information
+        3. Columns specified in TOML - uses specified columns with optional '*'
         """
@@ -269,12 +263,11 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         # Case 2: Columns from database
         if isinstance(index_data, dict):
-            source_index = SourceIndex(**index_data)
             data["db_columns"] = [
                 SourceColumn(
                     literal=col.literal.name,
                     type=col.type,
-                    indexed=col in source_index.literal + source_index.alias,
+                    indexed=col in index_data["literal"] + index_data["alias"],
                 )
                 for col in remote_columns
             ]
diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py
index fc54871e..e6d82b30 100644
--- a/src/matchbox/server/postgresql/orm.py
+++ b/src/matchbox/server/postgresql/orm.py
@@ -7,6 +7,7 @@
     CheckConstraint,
     Column,
     ForeignKey,
+    UniqueConstraint,
     select,
 )
 from sqlalchemy.dialects.postgresql import ARRAY, BYTEA, JSONB
@@ -152,7 +153,7 @@ class Sources(CountMixin, MBDB.MatchboxBase):
     model = Column(
         BYTEA, ForeignKey("models.hash", ondelete="CASCADE"), primary_key=True
     )
-    alias = Column(VARCHAR, nullable=False, unique=True)
+    alias = Column(VARCHAR, nullable=False)
     schema = Column(VARCHAR, nullable=False)
     table = Column(VARCHAR, nullable=False)
     id = Column(VARCHAR, nullable=False)
@@ -162,6 +163,11 @@ class Sources(CountMixin, MBDB.MatchboxBase):
     dataset_model = relationship("Models", back_populates="source")
     clusters = relationship("Clusters", back_populates="source")
 
+    # Constraints
+    __table_args__ = (
+        UniqueConstraint("alias", "schema", "table", name="unique_alias_schema_table"),
+    )
+
     @classmethod
     def list(cls) -> list["Sources"]:
         with Session(MBDB.get_engine()) as session:
diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index e4a69ede..f54c3900 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -1,4 +1,3 @@
-import re
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from textwrap import dedent
@@ -14,7 +13,7 @@ def warehouse_toml(warehouse: SourceWarehouse) -> str:
     [warehouses.{warehouse.alias}]
     db_type = "{warehouse.db_type}"
     user = "{warehouse.user}"
-    password = "{warehouse.password}"
+    password = "{warehouse.password.get_secret_value()}"
     host = "{warehouse.host}"
     port = {warehouse.port}
     database = "{warehouse.database}"
@@ -24,7 +23,7 @@ def source_toml(source: Source, index: list[dict[str, str]]) -> str:
     index_str = dumps({"index": index}).replace("\n", "\n    ")
     return dedent(f"""
-    [datasets.{re.sub(r"[^a-zA-Z0-9]", "", source.alias)}]
+    [datasets.{source.alias.replace(".", "")}]
     database = "test_warehouse"
     db_schema = "{source.db_schema}"
     db_table = "{source.db_table}"
     db_pk = "{source.db_pk}"
@@ -75,7 +74,7 @@ def test_load_datasets_from_config(
     config = load_datasets_from_config(temp_file_path)
 
     # Helper variables
-    source = config.get(re.sub(r"[^a-zA-Z0-9]", "", crn.alias))
+    source = config.get(crn.alias.replace(".", ""))
     named = [idx["literal"] for idx in index if idx["literal"] != "*"]
     has_star = any(idx["literal"] == "*" for idx in index)
     star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
@@ -83,7 +82,7 @@ def test_load_datasets_from_config(
 
     # Test 1: Core attributes match
     assert source.database == warehouse
-    assert source.alias == re.sub(r"[^a-zA-Z0-9]", "", crn.alias)
+    assert source.alias == crn.alias.replace(".", "")
     assert source.db_schema == crn.db_schema
     assert source.db_table == crn.db_table
     assert source.db_pk == crn.db_pk
diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py
index 5002647a..b1af8c2c 100644
--- a/test/server/test_adapter.py
+++ b/test/server/test_adapter.py
@@ -1,3 +1,4 @@
+from collections import defaultdict
 from typing import Callable
 
 import pytest
@@ -120,14 +121,11 @@ def test_get_dataset(self):
 
         assert crn.db_columns == crn_retrieved.db_columns
 
-        cols: dict[str, list[SourceColumn]] = {}
+        cols: defaultdict[str, list[SourceColumn]] = defaultdict(list)
 
         # Indexing isn't used in the custom equality check
         for col in crn.db_columns + crn_retrieved.db_columns:
-            if col.literal.name not in cols:
-                cols[col.literal.name] = [col]
-            else:
-                cols[col.literal.name].append(col)
+            cols[col.literal.name].append(col)
 
         for c1, c2 in cols.values():
             assert c1.indexed == c2.indexed

From 8c6e1b61ad0540daa083dbec761f9cad933fcd23 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:09:29 +0000
Subject: [PATCH 11/14] Fixed warning unit tests

---
 test/server/test_adapter.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/server/test_adapter.py b/test/server/test_adapter.py
index 5997517f..26388c22 100644
--- a/test/server/test_adapter.py
+++ b/test/server/test_adapter.py
@@ -430,7 +430,7 @@ def test_query_warning(self):
             query(
                 selector=select_crn,
                 backend=self.backend,
-                model=None,
+                resolution=None,
                 return_type="pandas",
                 limit=10,
             )

From f75e311ff8a63a6383d6e4b407606ef81dad761a Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:11:52 +0000
Subject: [PATCH 12/14] Refactored hash to base64 code

---
 src/matchbox/common/db.py    | 5 ++---
 src/matchbox/common/graph.py | 8 ++++++--
 src/matchbox/common/hash.py  | 2 +-
 test/fixtures/graph.py       | 4 ++--
 4 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index 02fff10b..cdf42d7a 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -1,10 +1,9 @@
-import base64
 from typing import TYPE_CHECKING, Any, Literal, TypeVar, Union, overload
 
 import connectorx as cx
 import pyarrow as pa
 from matchbox.common.exceptions import MatchboxValidatonError
-from matchbox.common.hash import HASH_FUNC
+from matchbox.common.hash import HASH_FUNC, hash_to_base64
 from pandas import DataFrame
 from pyarrow import Table as ArrowTable
 from pydantic import (
@@ -144,7 +143,7 @@ def hash(self) -> bytes:
     @property
     def base64(self) -> str:
         """Generate a base64 encoded hash based on the column name."""
-        return base64.b64encode(self.hash).decode("utf-8")
+        return hash_to_base64(self.hash)
 
 
 class SourceColumn(BaseModel):
diff --git a/src/matchbox/common/graph.py b/src/matchbox/common/graph.py
index 1c9005de..0ba6b451 100644
--- a/src/matchbox/common/graph.py
+++ b/src/matchbox/common/graph.py
@@ -1,7 +1,7 @@
 from enum import StrEnum
 
 import rustworkx as rx
-from matchbox.common.hash import hash_to_str
+from matchbox.common.hash import hash_to_base64
 from pydantic import BaseModel
 
 
@@ -36,7 +36,11 @@ def to_rx(self) -> rx.PyDiGraph:
         nodes = {}
         G = rx.PyDiGraph()
         for n in self.nodes:
-            node_data = {"id": hash_to_str(n.hash), "name": n.name, "type": str(n.type)}
+            node_data = {
+                "id": hash_to_base64(n.hash),
+                "name": n.name,
+                "type": str(n.type),
+            }
             nodes[n.hash] = G.add_node(node_data)
         for e in self.edges:
             G.add_edge(nodes[e.parent], nodes[e.child], {})
diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index 0fff7fd4..fd6990cf 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -18,7 +18,7 @@
 HASH_FUNC = hashlib.sha256
 
 
-def hash_to_str(hash: bytes) -> str:
+def hash_to_base64(hash: bytes) -> str:
     return base64.b64encode(hash).decode("utf-8")
 
 
diff --git a/test/fixtures/graph.py b/test/fixtures/graph.py
index dda47b0e..d2f038f7 100644
--- a/test/fixtures/graph.py
+++ b/test/fixtures/graph.py
@@ -7,7 +7,7 @@
 from matchbox.common.graph import (
     ResolutionNodeType as ResType,
 )
-from matchbox.common.hash import hash_to_str
+from matchbox.common.hash import hash_to_base64
 from rustworkx import PyDiGraph
 
 
@@ -35,7 +35,7 @@ def resolution_graph() -> ResolutionGraph:
 @pytest.fixture
 def pydigraph() -> PyDiGraph:
     def make_id(n: int) -> str:
-        return hash_to_str(bytes(n))
+        return hash_to_base64(bytes(n))
 
     G = PyDiGraph()
     n1 = G.add_node({"id": make_id(1), "name": "1", "type": str(ResType.DATASET)})

From f55866e26da8b87eede82efe4cd781aeba4b6a06 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:15:13 +0000
Subject: [PATCH 13/14] Dealt with allowing periods in a source's alias

---
 test/client/test_admin.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index f54c3900..483ae0b8 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -22,8 +22,9 @@ def warehouse_toml(warehouse: SourceWarehouse) -> str:
 
 def source_toml(source: Source, index: list[dict[str, str]]) -> str:
     index_str = dumps({"index": index}).replace("\n", "\n    ")
+    alias = source.alias if "." not in source.alias else f'"{source.alias}"'
     return dedent(f"""
-    [datasets.{source.alias.replace(".", "")}]
+    [datasets.{alias}]
     database = "test_warehouse"
     db_schema = "{source.db_schema}"
     db_table = "{source.db_table}"
     db_pk = "{source.db_pk}"
@@ -74,7 +75,7 @@ def test_load_datasets_from_config(
     config = load_datasets_from_config(temp_file_path)
 
     # Helper variables
-    source = config.get(crn.alias.replace(".", ""))
+    source = config.get(crn.alias)
     named = [idx["literal"] for idx in index if idx["literal"] != "*"]
     has_star = any(idx["literal"] == "*" for idx in index)
     star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
@@ -82,7 +83,7 @@ def test_load_datasets_from_config(
 
     # Test 1: Core attributes match
     assert source.database == warehouse
-    assert source.alias == crn.alias.replace(".", "")
+    assert source.alias == crn.alias
     assert source.db_schema == crn.db_schema
     assert source.db_table == crn.db_table
     assert source.db_pk == crn.db_pk

From 3e75fb5e1fd7de4ffe496b0b1de7108636d956d2 Mon Sep 17 00:00:00 2001
From: Will Langdale
Date: Thu, 12 Dec 2024 14:30:03 +0000
Subject: [PATCH 14/14] Removed wildcard selector and declaring column type

---
 src/matchbox/common/db.py | 29 ++++++++++----------------
 test/client/test_admin.py | 43 ++++++++-------------------------------
 2 files changed, 19 insertions(+), 53 deletions(-)

diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
index cdf42d7a..0c913b8a 100644
--- a/src/matchbox/common/db.py
+++ b/src/matchbox/common/db.py
@@ -237,7 +237,7 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         Handles three scenarios:
         1. No columns specified - all columns except primary key are indexed
         2. Indices from database - uses existing column hash information
-        3. Columns specified in TOML - uses specified columns with optional '*'
+        3. Columns specified in TOML - specified columns are indexed
         """
@@ -280,14 +280,16 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
 
         # Case 3: Columns from TOML
         local_columns = []
-        star_index = None
 
         # Process TOML column specifications
-        for i, column in enumerate(index_data):
-            if column["literal"] == "*":
-                star_index = i
-                continue
-            local_columns.append(SourceColumn(**column, indexed=True))
+        for column in index_data:
+            local_columns.append(
+                SourceColumn(
+                    literal=column["literal"],
+                    alias=column.get("alias", column["literal"]),
+                    indexed=True,
+                )
+            )
 
         # Match remote columns with local specifications
         indexed_columns = []
@@ -296,22 +298,13 @@ def hash_columns(cls, data: dict[str, Any]) -> "Source":
         for remote_col in remote_columns:
             matched = False
             for local_col in local_columns:
-                if remote_col == local_col:
-                    if local_col.type is None:
-                        local_col.type = remote_col.type
+                if remote_col.literal == local_col.literal:
                     indexed_columns.append(local_col)
                     matched = True
                     break
             if not matched:
                 non_indexed_columns.append(remote_col)
 
-        # Handle wildcard insertion
-        if star_index is not None:
-            for col in non_indexed_columns:
-                col.indexed = True
-            indexed_columns[star_index:star_index] = non_indexed_columns
-            data["db_columns"] = indexed_columns
-        else:
-            data["db_columns"] = indexed_columns + non_indexed_columns
+        data["db_columns"] = indexed_columns + non_indexed_columns
 
         return data
diff --git a/test/client/test_admin.py b/test/client/test_admin.py
index 483ae0b8..fc4f3f53 100644
--- a/test/client/test_admin.py
+++ b/test/client/test_admin.py
@@ -40,17 +40,9 @@ def source_toml(source: Source, index: list[dict[str, str]]) -> str:
             {"literal": "company_name"},
             {"literal": "crn"},
         ],
-        [{"literal": "company_name", "type": "VARCHAR", "alias": "name"}],
-        [
-            {"literal": "company_name"},
-            {"literal": "*"},
-        ],
-        [
-            {"literal": "*"},
-            {"literal": "company_name"},
-        ],
+        [{"literal": "company_name", "alias": "name"}],
     ),
-    ids=["vanilla", "alias_and_type", "star_end", "star_start"],
+    ids=["vanilla", "alias"],
 )
 def test_load_datasets_from_config(
     index: list[dict[str, str]],
@@ -68,9 +60,7 @@ def test_load_datasets_from_config(
 
     # Helper variables
     source = config.get(crn.alias)
-    named = [idx["literal"] for idx in index if idx["literal"] != "*"]
-    has_star = any(idx["literal"] == "*" for idx in index)
-    star_pos = next((i for i, idx in enumerate(index) if idx["literal"] == "*"), None)
+    named = [idx["literal"] for idx in index]
     col_names = [col.literal.name for col in source.db_columns]
 
@@ -81,33 +71,16 @@ def test_load_datasets_from_config(
     # Test 2: All non-pk columns present
     assert set(col_names) == {"company_name", "crn", "id"} - {source.db_pk}
 
-    # Test 3: Column indexing
-    for col in source.db_columns:
-        assert col.indexed == (has_star or col.literal.name in named)
-
-    # Test 4: Aliases and types match
+    # Test 3: Aliases match
     for idx in index:
-        if idx["literal"] == "*":
-            continue
         col = next(c for c in source.db_columns if c.literal.name == idx["literal"])
         assert col.alias.name == idx.get("alias", idx["literal"])
-        assert col.type == idx.get("type", col.type)
-
-    # Test 5: Column ordering
-    if star_pos is None:
-        for i, name in enumerate(named):
-            assert col_names[i] == name
-    else:
-        for i, idx in enumerate(index):
-            if idx["literal"] != "*":
-                if i < star_pos:
-                    assert col_names[i] == idx["literal"]
-                else:
-                    star_col_count = len(col_names) - len(index) + 1
-                    assert col_names[i + star_col_count - 1] == idx["literal"]
 
-    # Test 6: column equalities
+    # Test 4: Column ordering
+    for i, name in enumerate(named):
+        assert col_names[i] == name
 
+    # Test 5: column equalities
     assert source.db_columns[0] != source.db_columns[1]
     assert source.db_columns[0] == source.db_columns[0]
     assert source.db_columns[1].literal.hash == source.db_columns[1]
     assert source.db_columns[1].alias.hash == source.db_columns[1]
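
Note on the end state of this series: after PATCH 14, an index entry in the datasets TOML is just a literal column name with an optional alias. Wildcard ("*") entries and explicit type casts are no longer read, every listed column is indexed, and any remaining warehouse columns are ingested unindexed. A minimal sketch of the resulting configuration; the warehouse credentials, schema and table names, and the "companies" alias below are illustrative values modelled on the fixtures in test/client/test_admin.py, not part of the patches themselves:

    [warehouses.test_warehouse]
    db_type = "postgresql"
    user = "warehouse_user"
    password = "warehouse_password"
    host = "localhost"
    port = 7654
    database = "warehouse"

    [datasets.companies]
    database = "test_warehouse"
    db_schema = "test"
    db_table = "crn"
    db_pk = "id"
    index = [
        { literal = "crn" },
        { literal = "company_name", alias = "name" },
    ]

Loading it then mirrors test_load_datasets_from_config:

    from pathlib import Path

    from matchbox.admin import load_datasets_from_config

    datasets = load_datasets_from_config(Path("datasets.toml"))
    source = datasets.get("companies")
    # Listed columns come first, in TOML order, with aliases falling back to
    # the literal name; unlisted warehouse columns follow, unindexed.
    print([(col.alias.name, col.indexed) for col in source.db_columns])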