uktrade · wpfl-dbt · Dec 13, 2024 · Dec 3, 2024 · Dec 3, 2024 · Dec 3, 2024
diff --git a/src/matchbox/client/visualisation.py b/src/matchbox/client/visualisation.py
@@ -1,3 +1,6 @@
+from collections import defaultdict
+from itertools import count
+
 import rustworkx as rx
 from matplotlib.figure import Figure
 from rustworkx.visualization import mpl_draw
@@ -39,3 +42,92 @@ def draw_resolution_graph() -> Figure:
         labels=lambda node: node["name"],
         font_size=8,
     )
+
+
+def draw_data_tree(graph: rx.PyDiGraph) -> str:
+    """
+    Convert a rustworkx PyDiGraph to Mermaid graph visualization code.
+
+    Only nodes whose payload is a dict with 'type' equal to "source", "data" or
+    "cluster" are drawn. Other nodes, and any edge touching one, are left out
+    rather than raising KeyError.
+
+    Args:
+        graph (rx.PyDiGraph): A rustworkx directed graph with nodes containing 'id' and
+            'type' attributes
+
+    Returns:
+        str: Mermaid graph definition code
+    """
+    mermaid_lines = ["graph LR"]
+
+    counters = defaultdict(count, {"hash": count(1)})
+    node_to_var = {}
+    node_types = {}
+    data_nodes = set()
+
+    def format_id(id_value):
+        """Format ID value, converting bytes to hex if needed."""
+        if isinstance(id_value, bytes):
+            return f"\\x{id_value.hex()}"
+        return f"['{str(id_value)}']"
+
+    # Record only drawable nodes, so every later lookup in node_to_var succeeds.
+    for node_idx in graph.node_indices():
+        node_data = graph.get_node_data(node_idx)
+        if not isinstance(node_data, dict):
+            continue
+        node_type = node_data.get("type", "")
+        if node_type in ("source", "data", "cluster"):
+            node_types[node_idx] = node_type
+            if node_type == "data":
+                data_nodes.add(node_idx)
+
+    # A source is named after its table; data nodes pointing at it get the table
+    # name plus a running number.
+    for node_idx, node_type in node_types.items():
+        if node_type != "source":
+            continue
+        table_name = graph.get_node_data(node_idx)["id"].split(".")[-1]
+        node_to_var[node_idx] = table_name
+        counter = count(1)
+        for predecessor in graph.predecessor_indices(node_idx):
+            if predecessor in data_nodes:
+                node_to_var[predecessor] = f"{table_name}{next(counter)}"
+                data_nodes.remove(predecessor)
+
+    # Data nodes that do not point at any source get plain numeric names.
+    remaining_counter = count(len(node_to_var) + 1)
+    for node_idx in data_nodes:
+        node_to_var[node_idx] = str(next(remaining_counter))
+
+    # Clusters are numbered hash1, hash2, ... in node order.
+    for node_idx, node_type in node_types.items():
+        if node_type == "cluster":
+            node_to_var[node_idx] = f"hash{next(counters['hash'])}"
+
+    sources, data_defs, clusters = [], [], []
+    for node_idx, node_type in node_types.items():
+        node_id = graph.get_node_data(node_idx)["id"]
+        var_name = node_to_var[node_idx]
+        if node_type == "source":
+            sources.append(f'    {var_name}["{node_id}"]')
+        elif node_type == "data":
+            # Data labels drop the list-style wrapper that format_id adds.
+            node_label = format_id(node_id).strip("[]'")
+            data_defs.append(f'    {var_name}["{node_label}"]')
+        else:
+            clusters.append(f'    {var_name}["{format_id(node_id)}"]')
+
+    mermaid_lines.extend(sources)
+    mermaid_lines.extend(data_defs)
+    mermaid_lines.extend(clusters)
+    mermaid_lines.append("")
+
+    for source, target in graph.edge_list():
+        # An edge is drawn only when both of its endpoints were.
+        if source in node_to_var and target in node_to_var:
+            edge_line = f"    {node_to_var[source]} --> {node_to_var[target]}"
+            mermaid_lines.append(edge_line)
+
+    return "\n".join(mermaid_lines)
diff --git a/src/matchbox/common/db.py b/src/matchbox/common/db.py
@@ -40,6 +40,26 @@
 T = TypeVar("T")
 
 
+class Match(BaseModel):
+    """A match between primary keys in the Matchbox database."""
+
+    cluster: bytes | None
+    source: str
+    source_id: set[str] = Field(default_factory=set)
+    target: str
+    target_id: set[str] = Field(default_factory=set)
+
+    @model_validator(mode="after")
+    def found_or_none(self) -> "Match":
+        """Ensure targets imply cluster and sources, and cluster implies sources."""
+        if self.target_id and (not self.source_id or not self.cluster):
+            msg = "A match must have sources and a cluster if target was found."
+            raise ValueError(msg)
+        if self.cluster and not self.source_id:
+            raise ValueError("A match must have source if cluster is set.")
+        return self
+
+
 class Probability(BaseModel):
     """A probability of a match in the Matchbox database.
 

diff --git a/src/matchbox/helpers/selector.py b/src/matchbox/helpers/selector.py
@@ -4,7 +4,7 @@
 from pyarrow import Table as ArrowTable
 from sqlalchemy import Engine, inspect
 
-from matchbox.common.db import Source, get_schema_table_names
+from matchbox.common.db import Match, Source, get_schema_table_names
 from matchbox.server import MatchboxDBAdapter, inject_backend
 
 
@@ -91,3 +91,37 @@ def query(
         return_type="pandas" if not return_type else return_type,
         limit=limit,
     )
+
+
+@inject_backend
+def match(
+    backend: MatchboxDBAdapter,
+    source_id: str,
+    source: str,
+    target: str | list[str],
+    resolution: str,
+    threshold: float | dict[str, float] | None = None,
+) -> Match | list[Match]:
+    """Matches IDs against the selected backend.
+
+    Args:
+        backend: The backend to query.
+        source_id: The ID of the source to match.
+        source: The name of the source dataset.
+        target: The name of the target dataset(s).
+        resolution: The resolution to use for filtering results.
+        threshold (optional): The threshold to use for creating clusters.
+            None: the resolution's default. Float: used for this resolution, with
+            cached thresholds for ancestors. Dict: shaped like resolution.ancestors
+            (resolution name -> threshold), used instead of the cached thresholds.
+
+    Returns:
+        Whatever backend.match() returns: a Match or a list of Match.
+    """
+    return backend.match(
+        source_id=source_id,
+        source=source,
+        target=target,
+        resolution=resolution,
+        threshold=threshold,
+    )
diff --git a/src/matchbox/server/api.py b/src/matchbox/server/api.py
@@ -150,6 +150,11 @@ async def query():
     raise HTTPException(status_code=501, detail="Not implemented")
 
 
+@app.get("/match")
+async def match():  # Stub route: always raises HTTP 501 ("Not implemented").
+    raise HTTPException(status_code=501, detail="Not implemented")
+
+
 @app.get("/validate/hash")
 async def validate_hashes():
     raise HTTPException(status_code=501, detail="Not implemented")

diff --git a/src/matchbox/server/base.py b/src/matchbox/server/base.py
@@ -19,7 +19,7 @@
 from pydantic_settings import BaseSettings, SettingsConfigDict
 from sqlalchemy import Engine
 
-from matchbox.common.db import Source
+from matchbox.common.db import Match, Source
 from matchbox.common.graph import ResolutionGraph
 
 if TYPE_CHECKING:
@@ -251,6 +251,16 @@ def query(
         limit: int = None,
     ) -> PandasDataFrame | ArrowTable | PolarsDataFrame: ...
 
+    @abstractmethod  # body is only `...`; subclasses are expected to override it
+    def match(
+        self,
+        source_id: str,
+        source: str,
+        target: str | list[str],
+        resolution: str,
+        threshold: float | dict[str, float] | None = None,
+    ) -> Match | list[Match]: ...
+
     @abstractmethod
     def index(self, dataset: Source) -> None: ...
 

diff --git a/src/matchbox/server/postgresql/adapter.py b/src/matchbox/server/postgresql/adapter.py
@@ -5,7 +5,7 @@
 from sqlalchemy import Engine, and_, bindparam, delete, func, or_, select
 from sqlalchemy.orm import Session
 
-from matchbox.common.db import Source, SourceWarehouse
+from matchbox.common.db import Match, Source, SourceWarehouse
 from matchbox.common.exceptions import (
     MatchboxDataError,
     MatchboxDatasetError,
@@ -29,7 +29,7 @@
     insert_model,
     insert_results,
 )
-from matchbox.server.postgresql.utils.query import query
+from matchbox.server.postgresql.utils.query import match, query
 from matchbox.server.postgresql.utils.results import (
     get_model_clusters,
     get_model_probabilities,
@@ -300,6 +300,39 @@ def query(
             limit=limit,
         )
 
+    def match(
+        self,
+        source_id: str,
+        source: str,
+        target: str | list[str],
+        resolution: str,
+        threshold: float | dict[str, float] | None = None,
+    ) -> Match | list[Match]:
+        """Matches an ID in a source dataset and returns the keys in the targets.
+
+        Args:
+            source_id: The ID of the source to match.
+            source: The name of the source dataset.
+            target: The name of the target dataset(s).
+            resolution: The name of the resolution to use for matching.
+            threshold (optional): The threshold to use for creating clusters.
+                If None, uses the resolution's default threshold.
+                If a float, uses it for this resolution and the cached thresholds for
+                its ancestors. If a dict, expects a shape like resolution.ancestors:
+                resolution name -> threshold, used instead of the cached thresholds.
+
+        Returns:
+            Whatever utils.query.match returns for these arguments and MBDB's engine.
+        """
+        return match(  # the imported utils.query.match, not this method
+            source_id=source_id,
+            source=source,
+            target=target,
+            resolution=resolution,
+            engine=MBDB.get_engine(),
+            threshold=threshold,
+        )
+
     def index(self, dataset: Source) -> None:
         """Indexes a data from your data warehouse within Matchbox.
 

diff --git a/src/matchbox/server/postgresql/orm.py b/src/matchbox/server/postgresql/orm.py
@@ -5,6 +5,7 @@
     CheckConstraint,
     Column,
     ForeignKey,
+    Index,
     UniqueConstraint,
     select,
 )
@@ -98,6 +99,22 @@ def descendants(self) -> set["Resolutions"]:
             )
             return set(session.execute(descendant_query).scalars().all())
 
+    def get_lineage(self) -> dict[bytes, float]:
+        """Map every ancestor hash, and this resolution's own, to a truth value.
+
+        Ancestors carry their cached truth and are inserted in descending level
+        order; this resolution's own hash and truth are written last.
+        """
+        ancestors_stmt = (
+            select(ResolutionFrom.parent, ResolutionFrom.truth_cache)
+            .where(ResolutionFrom.child == self.hash)
+            .order_by(ResolutionFrom.level.desc())
+        )
+        with Session(MBDB.get_engine()) as session:
+            rows = session.execute(ancestors_stmt).all()
+
+        return {**{parent: truth for parent, truth in rows}, self.hash: self.truth}
+
     def get_lineage_to_dataset(
         self, dataset: "Resolutions"
     ) -> tuple[bytes, dict[bytes, float]]:
@@ -108,31 +125,24 @@ def get_lineage_to_dataset(
             )
 
         if self.hash == dataset.hash:
-            return {}
+            return {dataset.hash: None}
 
         with Session(MBDB.get_engine()) as session:
             path_query = (
-                select(
-                    ResolutionFrom.parent, ResolutionFrom.truth_cache, Resolutions.type
-                )
+                select(ResolutionFrom.parent, ResolutionFrom.truth_cache)
                 .join(Resolutions, Resolutions.hash == ResolutionFrom.parent)
                 .where(ResolutionFrom.child == self.hash)
                 .order_by(ResolutionFrom.level.desc())
             )
 
             results = session.execute(path_query).all()
 
-            if not any(parent == dataset.hash for parent, _, _ in results):
+            if not any(parent == dataset.hash for parent, _ in results):
                 raise ValueError(
                     f"No path between resolution {self.name}, dataset {dataset.name}"
                 )
 
-            lineage = {
-                parent: truth
-                for parent, truth, type in results
-                if type != ResolutionNodeType.DATASET.value
-            }
-
+            lineage = {parent: truth for parent, truth in results}
             lineage[self.hash] = self.truth
 
             return lineage
@@ -181,8 +191,12 @@ class Contains(CountMixin, MBDB.MatchboxBase):
         BYTEA, ForeignKey("clusters.hash", ondelete="CASCADE"), primary_key=True
     )
 
-    # Constraints
-    __table_args__ = (CheckConstraint("parent != child", name="no_self_containment"),)
+    # Constraints and indices
+    __table_args__ = (
+        CheckConstraint("parent != child", name="no_self_containment"),
+        Index("ix_contains_parent_child", "parent", "child"),
+        Index("ix_contains_child_parent", "child", "parent"),
+    )
 
 
 class Clusters(CountMixin, MBDB.MatchboxBase):
@@ -211,6 +225,9 @@ class Clusters(CountMixin, MBDB.MatchboxBase):
         backref="parents",
     )
 
+    # Constraints and indices
+    __table_args__ = (Index("ix_clusters_id_gin", id, postgresql_using="gin"),)
+
 
 class Probabilities(CountMixin, MBDB.MatchboxBase):
     """Table of probabilities that a cluster is correct, according to a resolution."""