From 987b7e8f72de383fa3c1d70cd47b8558ec69350b Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 14:57:29 +0000
Subject: [PATCH 1/5] Start exploring alternative int mapping

---
 src/matchbox/common/hash.py    | 30 ++++++++++++++++++++++++++++++
 src/matchbox/common/results.py | 23 ++++++++++++++---------
 test/common/test_hash.py       | 28 ++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 9 deletions(-)
 create mode 100644 test/common/test_hash.py

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index aa2a2799..c2242013 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -112,6 +112,36 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
     return Series(hashed_records)
 
 
+class IntMap:
+    def __init__(self, salt: int | None = None):
+        self.keys: list[int] = []
+        self.values: list[tuple[int]] = []
+        if salt and salt < 0:
+            raise ValueError("The salt must be a positive int")
+        self.salt: int | None = salt
+
+    def _add_salt(self, val: int) -> int:
+        """
+        If given a positive int, return as is, otherwise use Cantor pairing function
+        to combine the salt with a negative int key, and minus it.
+        """
+        if val >= 0:
+            return val
+        return -int(0.5 * (self.salt - val) * (self.salt - val + 1) - val)
+
+    def index(self, *refs: int) -> int:
+        new_key: int = -len(self.values) - 1
+        if self.salt:
+            new_key = self._add_salt(new_key)
+
+        self.keys.append(new_key)
+        self.values.append(refs)
+        return new_key
+
+    def export(self) -> tuple[list[int], list[list[int]]]:
+        return self.keys, self.values
+
+
 @lru_cache(maxsize=None)
 def combine_integers(*n: int) -> int:
     """
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 3c8b2e78..0335ed57 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -17,8 +17,8 @@
 from matchbox.common.db import Cluster, Probability
 from matchbox.common.hash import (
+    IntMap,
     columns_to_value_ordered_hash,
-    combine_integers,
     list_to_value_ordered_hash,
 )
 from matchbox.server.base import MatchboxDBAdapter, inject_backend
 
@@ -404,6 +404,7 @@ def find(self, x: T, parent_dict: dict[T, T] | None = None) -> T:
             self._shadow_parent[x] = x
             self._shadow_rank[x] = 0
 
+        # TODO: Instead of being a `while`, could this be an `if`?
         while parent_dict[x] != x:
             parent_dict[x] = parent_dict[parent_dict[x]]
             x = parent_dict[x]
@@ -503,7 +504,9 @@ def diff(self) -> Iterator[tuple[set[T], set[T]]]:
         self._shadow_rank = self.rank.copy()
 
 
-def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa.Table:
+def component_to_hierarchy(
+    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int | None = None
+) -> pa.Table:
     """
     Convert pairwise probabilities into a hierarchical representation.
 
@@ -517,6 +520,7 @@ def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa
     """
     hierarchy: list[tuple[int, int, float]] = []
     uf = UnionFindWithDiff[int]()
+    im = IntMap(salt=salt)
 
     probs = pc.unique(table["probability"])
     for threshold in probs:
@@ -532,27 +536,28 @@ def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa
         ):
             left, right = row
             uf.union(left, right)
-            parent = combine_integers(left, right)
+            parent = im.index(left, right)
             hierarchy.extend([(parent, left, threshold), (parent, right, threshold)])
 
         # Process union-find diffs
         for old_comp, new_comp in uf.diff():
             if len(old_comp) > 1:
-                parent = combine_integers(*new_comp)
-                child = combine_integers(*old_comp)
+                parent = im.index(*new_comp)
+                child = im.index(*old_comp)
                 hierarchy.extend([(parent, child, threshold)])
             else:
-                parent = combine_integers(*new_comp)
+                parent = im.index(*new_comp)
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
     parents, children, probs = zip(*hierarchy, strict=False)
-    return pa.table(
+    hierarchy_results = pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
             "child": pa.array(children, type=dtype()),
             "probability": pa.array(probs, type=pa.uint8()),
         }
     )
+    return hierarchy_results
 
 
 def to_hierarchical_clusters(
@@ -598,8 +603,8 @@ def to_hierarchical_clusters(
     results = []
     with ProcessPoolExecutor(max_workers=n_cores) as executor:
         futures = [
-            executor.submit(component_to_hierarchy, component_table, dtype)
-            for component_table in component_tables
+            executor.submit(component_to_hierarchy, component_table, dtype, salt)
+            for salt, component_table in enumerate(component_tables)
         ]
 
         for future in futures:
diff --git a/test/common/test_hash.py b/test/common/test_hash.py
new file mode 100644
index 00000000..2c31d979
--- /dev/null
+++ b/test/common/test_hash.py
@@ -0,0 +1,28 @@
+from matchbox.common.hash import IntMap
+
+
+def test_core_intmap():
+    im = IntMap()
+    im.index(1, 2)
+    im.index(3, 4)
+    im.index(-1, -2)
+
+    keys, values = im.export()
+    assert keys == [-1, -2, -3]
+    assert values == [(1, 2), (3, 4), (-1, -2)]
+
+
+def test_salted_intmap():
+    im = IntMap(salt=10)
+    a = im.index(1, 2)
+    b = im.index(3, 4)
+    c = im.index(a, b)
+
+    keys, values = im.export()
+
+    assert keys == [a, b, c]
+    assert a < 0 and a != -1
+    assert b < 0 and a != -2
+    assert c < 0 and c != -3
+
+    assert values == [(1, 2), (3, 4), (a, b)]

From 1bab1f5a6476473b8bbec9817f6c3a247873dd1e Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 16:04:00 +0000
Subject: [PATCH 2/5] Get unit tests to pass

---
 src/matchbox/common/hash.py   | 38 -----------------------------------
 test/client/test_hierarchy.py | 27 ++++++-------------------
 2 files changed, 6 insertions(+), 59 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index c2242013..a0af7a8d 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -1,6 +1,5 @@
 import base64
 import hashlib
-from functools import lru_cache
 from typing import TYPE_CHECKING, Any, TypeVar
 from uuid import UUID
 
@@ -140,40 +139,3 @@ def index(self, *refs: int) -> int:
 
     def export(self) -> tuple[list[int], list[list[int]]]:
         return self.keys, self.values
-
-
-@lru_cache(maxsize=None)
-def combine_integers(*n: int) -> int:
-    """
-    Combine n integers into a single negative integer.
-
-    Used to create a symmetric deterministic hash of two integers that populates the
-    range of integers efficiently and reduces the likelihood of collisions.
-
-    Aims to vectorise amazingly when used in Arrow.
-
-    Does this by:
-
-    * Using a Mersenne prime as a modulus
-    * Making negative integers positive with modulo, sped up with bitwise operations
-    * Combining using symmetric operations with coprime multipliers
-
-    Args:
-        *args: Variable number of integers to combine
-
-    Returns:
-        A negative integer
-    """
-    P = 2147483647
-
-    total = 0
-    product = 1
-
-    for x in sorted(n):
-        x_pos = (x ^ (x >> 31)) - (x >> 31)
-        total = (total + x_pos) % P
-        product = (product * x_pos) % P
-
-    result = (31 * total + 37 * product) % P
-
-    return -result
diff --git a/test/client/test_hierarchy.py b/test/client/test_hierarchy.py
index 6396a7b8..4f73820f 100644
--- a/test/client/test_hierarchy.py
+++ b/test/client/test_hierarchy.py
@@ -9,7 +9,6 @@
 import pyarrow.compute as pc
 import pytest
 
-from matchbox.common.hash import combine_integers
 from matchbox.common.results import (
     attach_components_to_probabilities,
     component_to_hierarchy,
@@ -119,22 +118,6 @@ def test_attach_components_to_probabilities(parameters: dict[str, Any]):
     assert len(pc.unique(with_components["component"])) == parameters["num_components"]
 
 
-@pytest.mark.parametrize(
-    ("integer_list"),
-    [
-        [1, 2, 3],
-        [9, 0],
-        [-4, -5, -6],
-        [7, -8, 9],
-    ],
-    ids=["positive", "pair_only", "negative", "mixed"],
-)
-def test_combine_integers(integer_list: list[int]):
-    res = combine_integers(*integer_list)
-    assert isinstance(res, int)
-    assert res < 0
-
-
 @pytest.mark.parametrize(
     ("probabilities", "hierarchy"),
     [
@@ -195,9 +178,9 @@ def test_component_to_hierarchy(
     probabilities: dict[str, list[str | float]], hierarchy: set[tuple[str, str, int]]
 ):
-    with patch(
-        "matchbox.common.results.combine_integers", side_effect=_combine_strings
-    ):
+    with patch("matchbox.common.results.IntMap") as MockIntMap:
+        instance = MockIntMap.return_value
+        instance.index.side_effect = _combine_strings
         probabilities_table = (
             pa.Table.from_pydict(probabilities)
             .cast(
@@ -340,8 +323,10 @@ def test_hierarchical_clusters(input_data, expected_hierarchy):
             "matchbox.common.results.ProcessPoolExecutor",
             lambda *args, **kwargs: parallel_pool_for_tests(timeout=30),
         ),
-        patch("matchbox.common.results.combine_integers", side_effect=_combine_strings),
+        patch("matchbox.common.results.IntMap") as MockIntMap,
     ):
+        instance = MockIntMap.return_value
+        instance.index.side_effect = _combine_strings
         result = to_hierarchical_clusters(
             probabilities, dtype=pa.string, proc_func=component_to_hierarchy
         )

From 83d5d0c478e6bc16fc405bdf4dc01575ca8740ea Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 17:32:30 +0000
Subject: [PATCH 3/5] Make order of intmap values not matter

---
 src/matchbox/common/hash.py    | 61 ++++++++++++++++++++++------------
 src/matchbox/common/results.py |  8 ++---
 test/common/test_hash.py       | 58 +++++++++++++++++++++-----------
 3 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index a0af7a8d..8e10e00c 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -112,30 +112,47 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
 
 
 class IntMap:
-    def __init__(self, salt: int | None = None):
-        self.keys: list[int] = []
-        self.values: list[tuple[int]] = []
-        if salt and salt < 0:
-            raise ValueError("The salt must be a positive int")
-        self.salt: int | None = salt
-
-    def _add_salt(self, val: int) -> int:
+    """
+    A data structure taking unordered sets of integers, and mapping them a to a key that
+    1) is a negative integer; 2) does not collide with other keys generated by other
+    instances of this class, as long as they are initialised with a different salt.
+
+    The fact that keys are always negative means that it's possible to build a hierarchy
+    where keys are themselves parts of keyed sets, and it's easy to distinguish integers
+    mapped to raw data points (which will be non-negative), to integers that are keys to
+    sets (which will be negative). The salt allows to work with a parallel execution
+    model, where each worker maintains their separate key space, as long as each worker
+    operates on disjoint subsets of positive integers.
+    """
+
+    def __init__(self, salt: int):
+        self.mapping: dict[frozenset[int], int] = {}
+        if salt < 0:
+            raise ValueError("The salt must be a positive integer")
+        self.salt: int = salt
+
+    def _salt_key(self, k: int) -> int:
         """
-        If given a positive int, return as is, otherwise use Cantor pairing function
-        to combine the salt with a negative int key, and minus it.
+        Use Cantor pairing function on the salt and a negative int key, and minus it.
         """
-        if val >= 0:
-            return val
-        return -int(0.5 * (self.salt - val) * (self.salt - val + 1) - val)
+        if k >= 0:
+            raise ValueError("Key must be a negative integer")
+        return -int(0.5 * (self.salt - k) * (self.salt - k + 1) - k)
 
-    def index(self, *refs: int) -> int:
-        new_key: int = -len(self.values) - 1
-        if self.salt:
-            new_key = self._add_salt(new_key)
+    def index(self, *values: int) -> int:
+        """
+        Args:
+            values: the integers in the set you want to index
+
+        Returns:
+            The old or new key corresponding to the set
+        """
+        value_set = frozenset(values)
+        if value_set in self.mapping:
+            return self.mapping[value_set]
 
-        self.keys.append(new_key)
-        self.values.append(refs)
-        return new_key
+        new_key: int = -len(self.mapping) - 1
+        salted_key = self._salt_key(new_key)
+        self.mapping[value_set] = salted_key
 
-    def export(self) -> tuple[list[int], list[list[int]]]:
-        return self.keys, self.values
+        return salted_key
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 55c4e3ae..c1a28db4 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -303,7 +303,7 @@ def to_clusters(results: ProbabilityResults) -> ClusterResults:
     )
 
     # Get unique probability thresholds, sorted
-    thresholds = edges_df["probability"].unique()
+    thresholds = sorted(edges_df["probability"].unique())
 
     # Process edges grouped by probability threshold
     for prob in thresholds:
@@ -367,7 +367,7 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table:
     graph = rx.PyGraph(node_count_hint=n_nodes, edge_count_hint=n_edges)
     graph.add_nodes_from(range(n_nodes))
 
-    edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=False))
+    edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=True))
     graph.add_edges_from_no_data(edges)
     components = rx.connected_components(graph)
 
@@ -541,7 +541,7 @@ def component_to_hierarchy(
         for row in zip(
             current_probs["left"].to_numpy(),
             current_probs["right"].to_numpy(),
-            strict=False,
+            strict=True,
         ):
             left, right = row
             uf.union(left, right)
@@ -558,7 +558,7 @@ def component_to_hierarchy(
                 parent = im.index(*new_comp)
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
-    parents, children, probs = zip(*hierarchy, strict=False)
+    parents, children, probs = zip(*hierarchy, strict=True)
     hierarchy_results = pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
diff --git a/test/common/test_hash.py b/test/common/test_hash.py
index 2c31d979..7556ef79 100644
--- a/test/common/test_hash.py
+++ b/test/common/test_hash.py
@@ -1,28 +1,48 @@
 from matchbox.common.hash import IntMap
 
 
-def test_core_intmap():
-    im = IntMap()
-    im.index(1, 2)
-    im.index(3, 4)
-    im.index(-1, -2)
+def test_intmap_basic():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
 
-    keys, values = im.export()
-    assert keys == [-1, -2, -3]
-    assert values == [(1, 2), (3, 4), (-1, -2)]
+    assert len({a, b, c}) == 3
+    assert max(a, b, c) < 0
 
 
-def test_salted_intmap():
-    im = IntMap(salt=10)
-    a = im.index(1, 2)
-    b = im.index(3, 4)
-    c = im.index(a, b)
+def test_intmap_same():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
 
-    keys, values = im.export()
+    im2 = IntMap(salt=10)
+    x = im2.index(1, 2)
+    y = im2.index(3, 4)
+    z = im2.index(a, b)
 
-    assert keys == [a, b, c]
-    assert a < 0 and a != -1
-    assert b < 0 and a != -2
-    assert c < 0 and c != -3
+    assert (a, b, c) == (x, y, z)
 
-    assert values == [(1, 2), (3, 4), (a, b)]
+
+def test_intmap_different():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
+
+    im2 = IntMap(salt=11)
+    x = im2.index(1, 2)
+    y = im2.index(3, 4)
+    z = im2.index(a, b)
+
+    for v1, v2 in zip([a, b, c], [x, y, z], strict=True):
+        assert v1 != v2
+
+
+def test_intmap_unordered():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2, 3)
+    b = im1.index(3, 1, 2)
+
+    assert a == b

From 1febf53219a88a156fbf3c0915e65a260f9dacbd Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Wed, 18 Dec 2024 10:26:18 +0000
Subject: [PATCH 4/5] Address PR comments

---
 src/matchbox/common/hash.py    | 34 +++++++++++++++++-----------------
 src/matchbox/common/results.py |  3 +--
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index 8e10e00c..8b6f2f81 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -113,15 +113,15 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
 class IntMap:
     """
-    A data structure taking unordered sets of integers, and mapping them a to a key that
-    1) is a negative integer; 2) does not collide with other keys generated by other
+    A data structure taking unordered sets of integers, and mapping them to an ID that
+    1) is a negative integer; 2) does not collide with other IDs generated by other
     instances of this class, as long as they are initialised with a different salt.
 
-    The fact that keys are always negative means that it's possible to build a hierarchy
-    where keys are themselves parts of keyed sets, and it's easy to distinguish integers
-    mapped to raw data points (which will be non-negative), to integers that are keys to
-    sets (which will be negative). The salt allows to work with a parallel execution
-    model, where each worker maintains their separate key space, as long as each worker
+    The fact that IDs are always negative means that it's possible to build a hierarchy
+    where IDs are themselves parts of other sets, and it's easy to distinguish integers
+    mapped to raw data points (which will be non-negative) from integers that are IDs
+    (which will be negative). The salt allows working with a parallel execution
+    model, where each worker maintains their separate ID space, as long as each worker
     operates on disjoint subsets of positive integers.
     """
 
     def __init__(self, salt: int):
@@ -131,13 +131,13 @@ def __init__(self, salt: int):
             raise ValueError("The salt must be a positive integer")
         self.salt: int = salt
 
-    def _salt_key(self, k: int) -> int:
+    def _salt_id(self, i: int) -> int:
         """
-        Use Cantor pairing function on the salt and a negative int key, and minus it.
+        Cantor-pair the salt with a negative int ID and negate the result.
         """
-        if k >= 0:
-            raise ValueError("Key must be a negative integer")
-        return -int(0.5 * (self.salt - k) * (self.salt - k + 1) - k)
+        if i >= 0:
+            raise ValueError("ID must be a negative integer")
+        return -int(0.5 * (self.salt - i) * (self.salt - i + 1) - i)
 
     def index(self, *values: int) -> int:
@@ -145,14 +145,14 @@ def index(self, *values: int) -> int:
             values: the integers in the set you want to index
 
         Returns:
-            The old or new key corresponding to the set
+            The old or new ID corresponding to the set
         """
         value_set = frozenset(values)
         if value_set in self.mapping:
             return self.mapping[value_set]
 
-        new_key: int = -len(self.mapping) - 1
-        salted_key = self._salt_key(new_key)
-        self.mapping[value_set] = salted_key
+        new_id: int = -len(self.mapping) - 1
+        salted_id = self._salt_id(new_id)
+        self.mapping[value_set] = salted_id
 
-        return salted_key
+        return salted_id
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index c1a28db4..3e853e7b 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -559,14 +559,13 @@ def component_to_hierarchy(
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
     parents, children, probs = zip(*hierarchy, strict=True)
-    hierarchy_results = pa.table(
+    return pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
             "child": pa.array(children, type=dtype()),
             "probability": pa.array(probs, type=pa.uint8()),
         }
     )
-    return hierarchy_results
 
 
 def to_hierarchical_clusters(

From e28937c122b9c451cac691fb7c7aa3ade82e6f96 Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Wed, 18 Dec 2024 10:40:48 +0000
Subject: [PATCH 5/5] Set default salt in component_to_hierarchy

---
 src/matchbox/common/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 3e853e7b..ff97e5ca 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -514,7 +514,7 @@ def component_to_hierarchy(
-    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int | None = None
+    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int = 1
 ) -> pa.Table:
     """
     Convert pairwise probabilities into a hierarchical representation.