From 987b7e8f72de383fa3c1d70cd47b8558ec69350b Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 14:57:29 +0000
Subject: [PATCH 1/5] Start exploring alternative int mapping

---
 src/matchbox/common/hash.py    | 30 ++++++++++++++++++++++++++++++
 src/matchbox/common/results.py | 23 ++++++++++++++---------
 test/common/test_hash.py       | 28 ++++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 9 deletions(-)
 create mode 100644 test/common/test_hash.py

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index aa2a2799..c2242013 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -112,6 +112,36 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
     return Series(hashed_records)
 
 
+class IntMap:
+    def __init__(self, salt: int | None = None):
+        self.keys: list[int] = []
+        self.values: list[tuple[int]] = []
+        if salt and salt < 0:
+            raise ValueError("The salt must be a positive int")
+        self.salt: int | None = salt
+
+    def _add_salt(self, val: int) -> int:
+        """
+        If given a positive int, return as is, otherwise use Cantor pairing function
+        to combine the salt with a negative int key, and minus it.
+        """
+        if val >= 0:
+            return val
+        return -int(0.5 * (self.salt - val) * (self.salt - val + 1) - val)
+
+    def index(self, *refs: int) -> int:
+        new_key: int = -len(self.values) - 1
+        if self.salt:
+            new_key = self._add_salt(new_key)
+
+        self.keys.append(new_key)
+        self.values.append(refs)
+        return new_key
+
+    def export(self) -> tuple[list[int], list[list[int]]]:
+        return self.keys, self.values
+
+
 @lru_cache(maxsize=None)
 def combine_integers(*n: int) -> int:
     """
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 3c8b2e78..0335ed57 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -17,8 +17,8 @@
 from matchbox.common.db import Cluster, Probability
 from matchbox.common.hash import (
+    IntMap,
     columns_to_value_ordered_hash,
-    combine_integers,
     list_to_value_ordered_hash,
 )
 from matchbox.server.base import MatchboxDBAdapter, inject_backend
 
@@ -404,6 +404,7 @@ def find(self, x: T, parent_dict: dict[T, T] | None = None) -> T:
             self._shadow_parent[x] = x
             self._shadow_rank[x] = 0
 
+        # TODO: Instead of being a `while`, could this be an `if`?
         while parent_dict[x] != x:
             parent_dict[x] = parent_dict[parent_dict[x]]
             x = parent_dict[x]
@@ -503,7 +504,9 @@ def diff(self) -> Iterator[tuple[set[T], set[T]]]:
         self._shadow_rank = self.rank.copy()
 
 
-def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa.Table:
+def component_to_hierarchy(
+    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int | None = None
+) -> pa.Table:
     """
     Convert pairwise probabilities into a hierarchical representation.
 
@@ -517,6 +520,7 @@ def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa
     """
     hierarchy: list[tuple[int, int, float]] = []
     uf = UnionFindWithDiff[int]()
+    im = IntMap(salt=salt)
 
     probs = pc.unique(table["probability"])
     for threshold in probs:
@@ -532,27 +536,28 @@ def component_to_hierarchy(table: pa.Table, dtype: pa.DataType = pa.int32) -> pa
         ):
             left, right = row
             uf.union(left, right)
-            parent = combine_integers(left, right)
+            parent = im.index(left, right)
             hierarchy.extend([(parent, left, threshold), (parent, right, threshold)])
 
         # Process union-find diffs
         for old_comp, new_comp in uf.diff():
             if len(old_comp) > 1:
-                parent = combine_integers(*new_comp)
-                child = combine_integers(*old_comp)
+                parent = im.index(*new_comp)
+                child = im.index(*old_comp)
                 hierarchy.extend([(parent, child, threshold)])
             else:
-                parent = combine_integers(*new_comp)
+                parent = im.index(*new_comp)
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
     parents, children, probs = zip(*hierarchy, strict=False)
-    return pa.table(
+    hierarchy_results = pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
             "child": pa.array(children, type=dtype()),
             "probability": pa.array(probs, type=pa.uint8()),
         }
     )
+    return hierarchy_results
 
 
 def to_hierarchical_clusters(
@@ -598,8 +603,8 @@ def to_hierarchical_clusters(
     results = []
     with ProcessPoolExecutor(max_workers=n_cores) as executor:
         futures = [
-            executor.submit(component_to_hierarchy, component_table, dtype)
-            for component_table in component_tables
+            executor.submit(component_to_hierarchy, component_table, dtype, salt)
+            for salt, component_table in enumerate(component_tables)
         ]
 
         for future in futures:
diff --git a/test/common/test_hash.py b/test/common/test_hash.py
new file mode 100644
index 00000000..2c31d979
--- /dev/null
+++ b/test/common/test_hash.py
@@ -0,0 +1,28 @@
+from matchbox.common.hash import IntMap
+
+
+def test_core_intmap():
+    im = IntMap()
+    im.index(1, 2)
+    im.index(3, 4)
+    im.index(-1, -2)
+
+    keys, values = im.export()
+    assert keys == [-1, -2, -3]
+    assert values == [(1, 2), (3, 4), (-1, -2)]
+
+
+def test_salted_intmap():
+    im = IntMap(salt=10)
+    a = im.index(1, 2)
+    b = im.index(3, 4)
+    c = im.index(a, b)
+
+    keys, values = im.export()
+
+    assert keys == [a, b, c]
+    assert a < 0 and a != -1
+    assert b < 0 and a != -2
+    assert c < 0 and c != -3
+
+    assert values == [(1, 2), (3, 4), (a, b)]

From 1bab1f5a6476473b8bbec9817f6c3a247873dd1e Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 16:04:00 +0000
Subject: [PATCH 2/5] Get unit tests to pass

---
 src/matchbox/common/hash.py   | 38 -----------------------------------
 test/client/test_hierarchy.py | 27 ++++++-------------------
 2 files changed, 6 insertions(+), 59 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index c2242013..a0af7a8d 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -1,6 +1,5 @@
 import base64
 import hashlib
-from functools import lru_cache
 from typing import TYPE_CHECKING, Any, TypeVar
 from uuid import UUID
 
@@ -140,40 +139,3 @@ def index(self, *refs: int) -> int:
 
     def export(self) -> tuple[list[int], list[list[int]]]:
         return self.keys, self.values
-
-
-@lru_cache(maxsize=None)
-def combine_integers(*n: int) -> int:
-    """
-    Combine n integers into a single negative integer.
-
-    Used to create a symmetric deterministic hash of two integers that populates the
-    range of integers efficiently and reduces the likelihood of collisions.
-
-    Aims to vectorise amazingly when used in Arrow.
-
-    Does this by:
-
-    * Using a Mersenne prime as a modulus
-    * Making negative integers positive with modulo, sped up with bitwise operations
-    * Combining using symmetric operations with coprime multipliers
-
-    Args:
-        *args: Variable number of integers to combine
-
-    Returns:
-        A negative integer
-    """
-    P = 2147483647
-
-    total = 0
-    product = 1
-
-    for x in sorted(n):
-        x_pos = (x ^ (x >> 31)) - (x >> 31)
-        total = (total + x_pos) % P
-        product = (product * x_pos) % P
-
-    result = (31 * total + 37 * product) % P
-
-    return -result
diff --git a/test/client/test_hierarchy.py b/test/client/test_hierarchy.py
index 6396a7b8..4f73820f 100644
--- a/test/client/test_hierarchy.py
+++ b/test/client/test_hierarchy.py
@@ -9,7 +9,6 @@
 import pyarrow.compute as pc
 import pytest
 
-from matchbox.common.hash import combine_integers
 from matchbox.common.results import (
     attach_components_to_probabilities,
     component_to_hierarchy,
@@ -119,22 +118,6 @@ def test_attach_components_to_probabilities(parameters: dict[str, Any]):
     assert len(pc.unique(with_components["component"])) == parameters["num_components"]
 
 
-@pytest.mark.parametrize(
-    ("integer_list"),
-    [
-        [1, 2, 3],
-        [9, 0],
-        [-4, -5, -6],
-        [7, -8, 9],
-    ],
-    ids=["positive", "pair_only", "negative", "mixed"],
-)
-def test_combine_integers(integer_list: list[int]):
-    res = combine_integers(*integer_list)
-    assert isinstance(res, int)
-    assert res < 0
-
-
 @pytest.mark.parametrize(
     ("probabilities", "hierarchy"),
     [
@@ -195,9 +178,9 @@ def test_component_to_hierarchy(
     probabilities: dict[str, list[str | float]], hierarchy: set[tuple[str, str, int]]
 ):
-    with patch(
-        "matchbox.common.results.combine_integers", side_effect=_combine_strings
-    ):
+    with patch("matchbox.common.results.IntMap") as MockIntMap:
+        instance = MockIntMap.return_value
+        instance.index.side_effect = _combine_strings
         probabilities_table = (
             pa.Table.from_pydict(probabilities)
             .cast(
@@ -340,8 +323,10 @@ def test_hierarchical_clusters(input_data, expected_hierarchy):
             "matchbox.common.results.ProcessPoolExecutor",
             lambda *args, **kwargs: parallel_pool_for_tests(timeout=30),
         ),
-        patch("matchbox.common.results.combine_integers", side_effect=_combine_strings),
+        patch("matchbox.common.results.IntMap") as MockIntMap,
     ):
+        instance = MockIntMap.return_value
+        instance.index.side_effect = _combine_strings
         result = to_hierarchical_clusters(
             probabilities, dtype=pa.string, proc_func=component_to_hierarchy
         )

From 83d5d0c478e6bc16fc405bdf4dc01575ca8740ea Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Tue, 17 Dec 2024 17:32:30 +0000
Subject: [PATCH 3/5] Make order of intmap values not matter

---
 src/matchbox/common/hash.py    | 61 ++++++++++++++++++++++------------
 src/matchbox/common/results.py |  8 ++---
 test/common/test_hash.py       | 58 +++++++++++++++++++++-----------
 3 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index a0af7a8d..8e10e00c 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -112,30 +112,47 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
 
 
 class IntMap:
-    def __init__(self, salt: int | None = None):
-        self.keys: list[int] = []
-        self.values: list[tuple[int]] = []
-        if salt and salt < 0:
-            raise ValueError("The salt must be a positive int")
-        self.salt: int | None = salt
-
-    def _add_salt(self, val: int) -> int:
+    """
+    A data structure taking unordered sets of integers, and mapping them a to a key that
+    1) is a negative integer; 2) does not collide with other keys generated by other
+    instances of this class, as long as they are initialised with a different salt.
+
+    The fact that keys are always negative means that it's possible to build a hierarchy
+    where keys are themselves parts of keyed sets, and it's easy to distinguish integers
+    mapped to raw data points (which will be non-negative), to integers that are keys to
+    sets (which will be negative). The salt allows to work with a parallel execution
+    model, where each worker maintains their separate key space, as long as each worker
+    operates on disjoint subsets of positive integers.
+    """
+
+    def __init__(self, salt: int):
+        self.mapping: dict[frozenset[int], int] = {}
+        if salt < 0:
+            raise ValueError("The salt must be a positive integer")
+        self.salt: int = salt
+
+    def _salt_key(self, k: int) -> int:
         """
-        If given a positive int, return as is, otherwise use Cantor pairing function
-        to combine the salt with a negative int key, and minus it.
+        Use Cantor pairing function on the salt and a negative int key, and minus it.
         """
-        if val >= 0:
-            return val
-        return -int(0.5 * (self.salt - val) * (self.salt - val + 1) - val)
+        if k >= 0:
+            raise ValueError("Key must be a negative integer")
+        return -int(0.5 * (self.salt - k) * (self.salt - k + 1) - k)
 
-    def index(self, *refs: int) -> int:
-        new_key: int = -len(self.values) - 1
-        if self.salt:
-            new_key = self._add_salt(new_key)
+    def index(self, *values: int) -> int:
+        """
+        Args:
+            values: the integers in the set you want to index
+
+        Returns:
+            The old or new key corresponding to the set
+        """
+        value_set = frozenset(values)
+        if value_set in self.mapping:
+            return self.mapping[value_set]
 
-        self.keys.append(new_key)
-        self.values.append(refs)
-        return new_key
+        new_key: int = -len(self.mapping) - 1
+        salted_key = self._salt_key(new_key)
+        self.mapping[value_set] = salted_key
 
-    def export(self) -> tuple[list[int], list[list[int]]]:
-        return self.keys, self.values
+        return salted_key
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 55c4e3ae..c1a28db4 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -303,7 +303,7 @@ def to_clusters(results: ProbabilityResults) -> ClusterResults:
     )
 
     # Get unique probability thresholds, sorted
-    thresholds = edges_df["probability"].unique()
+    thresholds = sorted(edges_df["probability"].unique())
 
     # Process edges grouped by probability threshold
     for prob in thresholds:
@@ -367,7 +367,7 @@ def attach_components_to_probabilities(probabilities: pa.Table) -> pa.Table:
     graph = rx.PyGraph(node_count_hint=n_nodes, edge_count_hint=n_edges)
     graph.add_nodes_from(range(n_nodes))
 
-    edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=False))
+    edges = tuple(zip(left_indices.to_numpy(), right_indices.to_numpy(), strict=True))
     graph.add_edges_from_no_data(edges)
     components = rx.connected_components(graph)
 
@@ -541,7 +541,7 @@ def component_to_hierarchy(
         for row in zip(
             current_probs["left"].to_numpy(),
             current_probs["right"].to_numpy(),
-            strict=False,
+            strict=True,
         ):
             left, right = row
             uf.union(left, right)
@@ -558,7 +558,7 @@ def component_to_hierarchy(
                 parent = im.index(*new_comp)
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
-    parents, children, probs = zip(*hierarchy, strict=False)
+    parents, children, probs = zip(*hierarchy, strict=True)
     hierarchy_results = pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
diff --git a/test/common/test_hash.py b/test/common/test_hash.py
index 2c31d979..7556ef79 100644
--- a/test/common/test_hash.py
+++ b/test/common/test_hash.py
@@ -1,28 +1,48 @@
 from matchbox.common.hash import IntMap
 
 
-def test_core_intmap():
-    im = IntMap()
-    im.index(1, 2)
-    im.index(3, 4)
-    im.index(-1, -2)
+def test_intmap_basic():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
 
-    keys, values = im.export()
-    assert keys == [-1, -2, -3]
-    assert values == [(1, 2), (3, 4), (-1, -2)]
+    assert len({a, b, c}) == 3
+    assert max(a, b, c) < 0
 
 
-def test_salted_intmap():
-    im = IntMap(salt=10)
-    a = im.index(1, 2)
-    b = im.index(3, 4)
-    c = im.index(a, b)
+def test_intmap_same():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
 
-    keys, values = im.export()
+    im2 = IntMap(salt=10)
+    x = im2.index(1, 2)
+    y = im2.index(3, 4)
+    z = im2.index(a, b)
 
-    assert keys == [a, b, c]
-    assert a < 0 and a != -1
-    assert b < 0 and a != -2
-    assert c < 0 and c != -3
+    assert (a, b, c) == (x, y, z)
 
-    assert values == [(1, 2), (3, 4), (a, b)]
+
+def test_intmap_different():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2)
+    b = im1.index(3, 4)
+    c = im1.index(a, b)
+
+    im2 = IntMap(salt=11)
+    x = im2.index(1, 2)
+    y = im2.index(3, 4)
+    z = im2.index(a, b)
+
+    for v1, v2 in zip([a, b, c], [x, y, z], strict=True):
+        assert v1 != v2
+
+
+def test_intmap_unordered():
+    im1 = IntMap(salt=10)
+    a = im1.index(1, 2, 3)
+    b = im1.index(3, 1, 2)
+
+    assert a == b

From 1febf53219a88a156fbf3c0915e65a260f9dacbd Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Wed, 18 Dec 2024 10:26:18 +0000
Subject: [PATCH 4/5] Address PR comments

---
 src/matchbox/common/hash.py    | 34 +++++++++++++++++-----------------
 src/matchbox/common/results.py |  3 +--
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/src/matchbox/common/hash.py b/src/matchbox/common/hash.py
index 8e10e00c..8b6f2f81 100644
--- a/src/matchbox/common/hash.py
+++ b/src/matchbox/common/hash.py
@@ -113,15 +113,15 @@ def columns_to_value_ordered_hash(data: DataFrame, columns: list[str]) -> Series
 class IntMap:
     """
-    A data structure taking unordered sets of integers, and mapping them a to a key that
-    1) is a negative integer; 2) does not collide with other keys generated by other
+    A data structure taking unordered sets of integers, and mapping them to an ID that
+    1) is a negative integer; 2) does not collide with other IDs generated by other
     instances of this class, as long as they are initialised with a different salt.
 
-    The fact that keys are always negative means that it's possible to build a hierarchy
-    where keys are themselves parts of keyed sets, and it's easy to distinguish integers
-    mapped to raw data points (which will be non-negative), to integers that are keys to
-    sets (which will be negative). The salt allows to work with a parallel execution
-    model, where each worker maintains their separate key space, as long as each worker
+    The fact that IDs are always negative means that it's possible to build a hierarchy
+    where IDs are themselves parts of other sets, and it's easy to distinguish integers
+    mapped to raw data points (which will be non-negative) from integers that are IDs
+    (which will be negative). The salt allows working with a parallel execution
+    model, where each worker maintains their separate ID space, as long as each worker
     operates on disjoint subsets of positive integers.
     """
 
     def __init__(self, salt: int):
@@ -131,13 +131,13 @@ def __init__(self, salt: int):
             raise ValueError("The salt must be a positive integer")
         self.salt: int = salt
 
-    def _salt_key(self, k: int) -> int:
+    def _salt_id(self, i: int) -> int:
         """
-        Use Cantor pairing function on the salt and a negative int key, and minus it.
+        Cantor-pair the salt with a negative int ID and negate the result.
         """
-        if k >= 0:
-            raise ValueError("Key must be a negative integer")
-        return -int(0.5 * (self.salt - k) * (self.salt - k + 1) - k)
+        if i >= 0:
+            raise ValueError("ID must be a negative integer")
+        return -int(0.5 * (self.salt - i) * (self.salt - i + 1) - i)
 
     def index(self, *values: int) -> int:
@@ -145,14 +145,14 @@ def index(self, *values: int) -> int:
             values: the integers in the set you want to index
 
         Returns:
-            The old or new key corresponding to the set
+            The old or new ID corresponding to the set
         """
         value_set = frozenset(values)
         if value_set in self.mapping:
             return self.mapping[value_set]
 
-        new_key: int = -len(self.mapping) - 1
-        salted_key = self._salt_key(new_key)
-        self.mapping[value_set] = salted_key
+        new_id: int = -len(self.mapping) - 1
+        salted_id = self._salt_id(new_id)
+        self.mapping[value_set] = salted_id
 
-        return salted_key
+        return salted_id
diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index c1a28db4..3e853e7b 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -559,14 +559,13 @@ def component_to_hierarchy(
                 hierarchy.extend([(parent, old_comp.pop(), threshold)])
 
     parents, children, probs = zip(*hierarchy, strict=True)
-    hierarchy_results = pa.table(
+    return pa.table(
         {
             "parent": pa.array(parents, type=dtype()),
             "child": pa.array(children, type=dtype()),
             "probability": pa.array(probs, type=pa.uint8()),
         }
     )
-    return hierarchy_results
 
 
 def to_hierarchical_clusters(

From e28937c122b9c451cac691fb7c7aa3ade82e6f96 Mon Sep 17 00:00:00 2001
From: Leonardo Mazzone
Date: Wed, 18 Dec 2024 10:40:48 +0000
Subject: [PATCH 5/5] Set default salt in component_to_hierarchy

---
 src/matchbox/common/results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/matchbox/common/results.py b/src/matchbox/common/results.py
index 3e853e7b..ff97e5ca 100644
--- a/src/matchbox/common/results.py
+++ b/src/matchbox/common/results.py
@@ -514,7 +514,7 @@ def component_to_hierarchy(
-    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int | None = None
+    table: pa.Table, dtype: pa.DataType = pa.int32, salt: int = 1
 ) -> pa.Table:
     """
     Convert pairwise probabilities into a hierarchical representation.