From fae47a8749615d514209ec7e80c94adb37a0e388 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 15:35:03 +0200 Subject: [PATCH 01/15] Add hypothesis test for FPS --- tests/utils/test_sampling_algorithms.py | 61 ++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tests/utils/test_sampling_algorithms.py b/tests/utils/test_sampling_algorithms.py index f9aad3bc7..48c06579a 100644 --- a/tests/utils/test_sampling_algorithms.py +++ b/tests/utils/test_sampling_algorithms.py @@ -2,11 +2,19 @@ import math +import hypothesis.extra.numpy as hnp +import hypothesis.strategies as st import numpy as np import pandas as pd import pytest +from hypothesis import example, given +from sklearn.metrics import pairwise_distances -from baybe.utils.sampling_algorithms import DiscreteSamplingMethod, sample_numerical_df +from baybe.utils.sampling_algorithms import ( + DiscreteSamplingMethod, + farthest_point_sampling, + sample_numerical_df, +) @pytest.mark.parametrize("fraction", [0.2, 0.8, 1.0, 1.2, 2.0, 2.4, 3.5]) @@ -31,3 +39,54 @@ def test_discrete_sampling(fraction, method): assert len(sampled) == len( sampled.drop_duplicates() ), "Undersized sampling did not return unique points." + + +@given( + points=hnp.arrays( + dtype=float, + shape=hnp.array_shapes(min_dims=2, max_dims=2, min_side=1), + # Because of the square involved in the Euclidean distance computation, + # we limit the floating point range to avoid overflow problems + elements=st.floats(min_value=-1e100, max_value=1e100, allow_nan=False), + ) +) +# Explicitly test scenario with equidistant points (see comments in test body) +@example(points=np.array([[0, 0], [0, 1], [1, 0], [1, 1]])) +def test_farthest_point_sampling(points: np.ndarray): + """FPS produces the same point sequence regardless of the order in which the + points are provided. Also, each point fulfills the "farthest point" criterion + in its respective iteration. + """ # noqa + # Order the points using FPS + sorting_idxs = farthest_point_sampling(points, len(points)) + target = points[sorting_idxs] + + # For the ordered collection of points, it must hold: + # --------------------------------------------------- + # The minimum distance of the n_th selected point to all previously selected points + # must be larger than the minimum distance of any other remaining candidate point to + # the previously selected points – that's what makes it the "farthest point" in the + # respective iteration. + # + # For the check, we start with the second point (because there are otherwise no + # previous points) and end with the second last point (because there are otherwise + # no alternative candidates left): + dist_mat = pairwise_distances(target) + for i in range(1, len(dist_mat) - 1): + min_dist_selected_to_previous = np.min(dist_mat[i, :i]) + min_dist_remaining_to_previous = np.min(dist_mat[i + 1 :, :i]) + z = min_dist_selected_to_previous >= min_dist_remaining_to_previous + assert z + + # Also, for the algorithm to be fully deterministic, the obtained result should not + # depend on the particular (random) order in which the points are provided. That is, + # running the algorithm on a permutation should still produce the same sequence of + # points. Note: We establish the check on the point coordinates and not the + # selection index, because the latter can still differ in case of duplicated points. + # + # Examples where this can make a difference is three points forming an equilateral + # triangle or four points spanning a unit cube. Here, tie-breaking operations such + # as `np.max` can lead to different results depending on the order. + permutation_idxs = np.random.permutation(len(points)) + sorting_idxs = farthest_point_sampling(points[permutation_idxs], len(points)) + assert np.array_equal(target, points[permutation_idxs][sorting_idxs]) From 2774db41ff3ad417af0a3eafc0e744f0e457b28b Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 15:38:17 +0200 Subject: [PATCH 02/15] Refactor input validation --- baybe/utils/sampling_algorithms.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 04052f483..9fa57f9fb 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -35,6 +35,13 @@ def farthest_point_sampling( Raises: ValueError: If an unknown initialization recommender is used. """ + if np.ndim(points) != 2: + raise ValueError("The provided array must be two-dimensional.") + if len(points) == 0: + raise ValueError("The provided array must contain at least one row.") + if points.shape[-1] == 0: + raise ValueError("The provided input space must be at least one-dimensional.") + # Compute the pairwise distances between all points dist_matrix = pairwise_distances(points) @@ -48,11 +55,6 @@ def farthest_point_sampling( ) if n_samples == 1: return np.random.choice(selected_point_indices, 1).tolist() - elif n_samples < 1: - raise ValueError( - f"Farthest point sampling must be done with >= 1 samples, but " - f"{n_samples=} was given." - ) else: raise ValueError(f"unknown initialization recommender: '{initialization}'") From 702a820bc6e4b08605787dda85b09171959d36c6 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 16:39:59 +0200 Subject: [PATCH 03/15] Catch edge case of completely duplicated points --- baybe/utils/sampling_algorithms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 9fa57f9fb..23be46be5 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -45,6 +45,9 @@ def farthest_point_sampling( # Compute the pairwise distances between all points dist_matrix = pairwise_distances(points) + # Avoid wrong behavior pathological situation where all points are duplicates + np.fill_diagonal(dist_matrix, -np.inf) + # Initialize the point selection subset if initialization == "random": selected_point_indices = [np.random.randint(0, len(points))] From 79da6f3879680ab747d20bc2842d578bfc354b93 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 16:42:08 +0200 Subject: [PATCH 04/15] Make algorithm agnostic to point order --- baybe/utils/sampling_algorithms.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 23be46be5..2e3c10f9d 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -42,7 +42,11 @@ def farthest_point_sampling( if points.shape[-1] == 0: raise ValueError("The provided input space must be at least one-dimensional.") - # Compute the pairwise distances between all points + # Sort the points to produce the same result regardless of the input order + sort_idx = np.lexsort(tuple(points.T)) + points = points[sort_idx] + + # Pre-compute the pairwise distances between all points dist_matrix = pairwise_distances(points) # Avoid wrong behavior pathological situation where all points are duplicates @@ -53,11 +57,9 @@ def farthest_point_sampling( selected_point_indices = [np.random.randint(0, len(points))] elif initialization == "farthest": idx_1d = np.argmax(dist_matrix) - selected_point_indices = list( - map(int, np.unravel_index(idx_1d, dist_matrix.shape)) - ) + selected_point_indices = list(np.unravel_index(idx_1d, dist_matrix.shape)) if n_samples == 1: - return np.random.choice(selected_point_indices, 1).tolist() + return [selected_point_indices[0]] else: raise ValueError(f"unknown initialization recommender: '{initialization}'") @@ -81,7 +83,8 @@ def farthest_point_sampling( selected_point_indices.append(selected_point_index) remaining_point_indices.remove(selected_point_index) - return selected_point_indices + # Undo the initial point reordering + return sort_idx[selected_point_indices] class DiscreteSamplingMethod(Enum): From 7ca80a0df2942a6d01ea97b5e2bbfd0df301266b Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 17:11:23 +0200 Subject: [PATCH 05/15] Improve docstrings --- baybe/utils/sampling_algorithms.py | 31 +++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 2e3c10f9d..ba93c18d8 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -13,27 +13,32 @@ def farthest_point_sampling( n_samples: int = 1, initialization: Literal["farthest", "random"] = "farthest", ) -> list[int]: - """Sample points according to a farthest point heuristic. + """Select a subset of points using farthest point sampling. - Creates a subset of a collection of points by successively adding points with the - largest Euclidean distance to intermediate point selections encountered during - the algorithmic process. + Creates a subset of a given collection of points by successively adding points with + the largest Euclidean distance to intermediate point selections encountered during + the algorithmic process. The mechanism used for the initial point selection is + configurable. Args: - points: The points that are available for selection, represented as a 2D array - whose first dimension corresponds to the point index. + points: The points that are available for selection, represented as a 2-D array + of shape (n, k), where n is the number of points and k is the dimensionality + of the points. n_samples: The total number of points to be selected. - initialization: Determines how the first points are selected. When - ``"farthest"`` is chosen, the first two selected points are those with the - largest distance. If only a single point is requested, it is selected - randomly from these two. When ``"random"`` is chosen, the first point is - selected uniformly at random. + initialization: Determines how the first points are selected: + * ``"farthest"``: The first two selected points are those with the + largest distance. If only a single point is requested, a deterministic + choice is made based on the point coordinates. + * ``"random"``: The first point is selected uniformly at random. Returns: A list containing the positional indices of the selected points. Raises: - ValueError: If an unknown initialization recommender is used. + ValueError: If the provided array is not two-dimensional. + ValueError: If the array contains no points. + ValueError: If the input space has no dimensions. + ValueError: If an unknown method for initialization is specified. """ if np.ndim(points) != 2: raise ValueError("The provided array must be two-dimensional.") @@ -52,7 +57,7 @@ def farthest_point_sampling( # Avoid wrong behavior pathological situation where all points are duplicates np.fill_diagonal(dist_matrix, -np.inf) - # Initialize the point selection subset + # Initialize the point selection if initialization == "random": selected_point_indices = [np.random.randint(0, len(points))] elif initialization == "farthest": From 7b008aef6c31c9c0a3724624af755ff36987e087 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 17:12:18 +0200 Subject: [PATCH 06/15] Use walrus to extract number of points --- baybe/utils/sampling_algorithms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index ba93c18d8..a9f7f82eb 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -42,7 +42,7 @@ def farthest_point_sampling( """ if np.ndim(points) != 2: raise ValueError("The provided array must be two-dimensional.") - if len(points) == 0: + if (n_points := len(points)) == 0: raise ValueError("The provided array must contain at least one row.") if points.shape[-1] == 0: raise ValueError("The provided input space must be at least one-dimensional.") @@ -59,7 +59,7 @@ def farthest_point_sampling( # Initialize the point selection if initialization == "random": - selected_point_indices = [np.random.randint(0, len(points))] + selected_point_indices = [np.random.randint(0, n_points)] elif initialization == "farthest": idx_1d = np.argmax(dist_matrix) selected_point_indices = list(np.unravel_index(idx_1d, dist_matrix.shape)) @@ -69,7 +69,7 @@ def farthest_point_sampling( raise ValueError(f"unknown initialization recommender: '{initialization}'") # Initialize the list of remaining points - remaining_point_indices = list(range(len(points))) + remaining_point_indices = list(range(n_points)) for idx in selected_point_indices: remaining_point_indices.remove(idx) From c28703e110f99c6e62ebeaa2dfee3346b1caaa3c Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Tue, 6 Aug 2024 17:15:21 +0200 Subject: [PATCH 07/15] Update CHANGELOG.md --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 075ab5378..f85c11b08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single string input to categories/tasks +- `farthest_point_sampling` no longer depends on the provided point order ## [0.10.0] - 2024-08-02 ### Breaking Changes From fb3748b311adce2e1f5497366cb38e8233aa7d8c Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 7 Aug 2024 12:37:29 +0200 Subject: [PATCH 08/15] Add test case for one single FPS recommendation --- tests/utils/test_sampling_algorithms.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/utils/test_sampling_algorithms.py b/tests/utils/test_sampling_algorithms.py index 48c06579a..27da3e4b4 100644 --- a/tests/utils/test_sampling_algorithms.py +++ b/tests/utils/test_sampling_algorithms.py @@ -90,3 +90,8 @@ def test_farthest_point_sampling(points: np.ndarray): permutation_idxs = np.random.permutation(len(points)) sorting_idxs = farthest_point_sampling(points[permutation_idxs], len(points)) assert np.array_equal(target, points[permutation_idxs][sorting_idxs]) + + # Because requesting a single point needs special treatment in FPS, + # we test this as additional case + sorting_idxs = farthest_point_sampling(points[permutation_idxs], 1) + assert np.array_equal(target[[0]], points[permutation_idxs][sorting_idxs]) From 6f6c33131cad7b50c04ef6f34b3ec1dff80e5a08 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 7 Aug 2024 12:38:10 +0200 Subject: [PATCH 09/15] Fix index bug in case of one requested point --- baybe/utils/sampling_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index a9f7f82eb..396400b2f 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -64,7 +64,7 @@ def farthest_point_sampling( idx_1d = np.argmax(dist_matrix) selected_point_indices = list(np.unravel_index(idx_1d, dist_matrix.shape)) if n_samples == 1: - return [selected_point_indices[0]] + return [sort_idx[selected_point_indices[0]]] else: raise ValueError(f"unknown initialization recommender: '{initialization}'") From ddfeb8098c93ee52c1c25ec86d4c5df4479a236e Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Wed, 7 Aug 2024 13:41:34 +0200 Subject: [PATCH 10/15] Fix mypy error --- baybe/utils/sampling_algorithms.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 396400b2f..6e83dec0c 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -62,7 +62,9 @@ def farthest_point_sampling( selected_point_indices = [np.random.randint(0, n_points)] elif initialization == "farthest": idx_1d = np.argmax(dist_matrix) - selected_point_indices = list(np.unravel_index(idx_1d, dist_matrix.shape)) + selected_point_indices = list( + map(int, np.unravel_index(idx_1d, dist_matrix.shape)) + ) if n_samples == 1: return [sort_idx[selected_point_indices[0]]] else: From c719e2244263f98d5c07f0097133a95071a1758f Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 9 Aug 2024 14:02:12 +0200 Subject: [PATCH 11/15] Fix type of returned value --- baybe/utils/sampling_algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 6e83dec0c..d98984804 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -91,7 +91,7 @@ def farthest_point_sampling( remaining_point_indices.remove(selected_point_index) # Undo the initial point reordering - return sort_idx[selected_point_indices] + return sort_idx[selected_point_indices].tolist() class DiscreteSamplingMethod(Enum): From 926fb123fdd35fb30bebdc714bd3ca24e9f6ee74 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 9 Aug 2024 14:19:21 +0200 Subject: [PATCH 12/15] Provide details on incorrect user input in error message --- baybe/utils/sampling_algorithms.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index d98984804..7b95edf1e 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -40,8 +40,11 @@ def farthest_point_sampling( ValueError: If the input space has no dimensions. ValueError: If an unknown method for initialization is specified. """ - if np.ndim(points) != 2: - raise ValueError("The provided array must be two-dimensional.") + if (n_dims := np.ndim(points)) != 2: + raise ValueError( + f"The provided array must be two-dimensional but the given input had " + f"{n_dims} dimensions." + ) if (n_points := len(points)) == 0: raise ValueError("The provided array must contain at least one row.") if points.shape[-1] == 0: From a60513b5756a3a48c4b7800eb02f213748958b1a Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 9 Aug 2024 14:20:16 +0200 Subject: [PATCH 13/15] Add missing validation for number of requested samples --- baybe/utils/sampling_algorithms.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 7b95edf1e..67b836755 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -49,6 +49,11 @@ def farthest_point_sampling( raise ValueError("The provided array must contain at least one row.") if points.shape[-1] == 0: raise ValueError("The provided input space must be at least one-dimensional.") + if n_samples > n_points: + raise ValueError( + f"The number of requested samples ({n_samples}) cannot be larger than the " + f"total number of points provided ({n_points})." + ) # Sort the points to produce the same result regardless of the input order sort_idx = np.lexsort(tuple(points.T)) From f0c07e85a3b161bba55a3b937c31580fce5e39af Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 9 Aug 2024 14:22:30 +0200 Subject: [PATCH 14/15] Raise warning in pathological case --- baybe/utils/sampling_algorithms.py | 8 +++++++- tests/utils/test_sampling_algorithms.py | 8 ++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 67b836755..4ea2ae929 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -1,5 +1,6 @@ """A collection of point sampling algorithms.""" +import warnings from enum import Enum from typing import Literal @@ -55,6 +56,11 @@ def farthest_point_sampling( f"total number of points provided ({n_points})." ) + # Catch the pathological case upfront + if len(np.unique(points, axis=0)) == 1: + warnings.warn("All points are identical.", UserWarning) + return list(range(n_samples)) + # Sort the points to produce the same result regardless of the input order sort_idx = np.lexsort(tuple(points.T)) points = points[sort_idx] @@ -62,7 +68,7 @@ def farthest_point_sampling( # Pre-compute the pairwise distances between all points dist_matrix = pairwise_distances(points) - # Avoid wrong behavior pathological situation where all points are duplicates + # Avoid wrong behavior situations where all (remaining) points are duplicates np.fill_diagonal(dist_matrix, -np.inf) # Initialize the point selection diff --git a/tests/utils/test_sampling_algorithms.py b/tests/utils/test_sampling_algorithms.py index 27da3e4b4..3a46b96cd 100644 --- a/tests/utils/test_sampling_algorithms.py +++ b/tests/utils/test_sampling_algorithms.py @@ -95,3 +95,11 @@ def test_farthest_point_sampling(points: np.ndarray): # we test this as additional case sorting_idxs = farthest_point_sampling(points[permutation_idxs], 1) assert np.array_equal(target[[0]], points[permutation_idxs][sorting_idxs]) + + +def test_farthest_point_sampling_pathological_case(): + """FPS executed on a degenerate point cloud raises a warning.""" + points = np.ones((3, 3)) + with pytest.warns(UserWarning, match="identical"): + selection = farthest_point_sampling(points, 2) + assert selection == [0, 1] From ed960a468c91b6d92421f91ed5f8d416e60f8eb9 Mon Sep 17 00:00:00 2001 From: AdrianSosic Date: Fri, 9 Aug 2024 14:37:19 +0200 Subject: [PATCH 15/15] Add backticks to docstring --- baybe/utils/sampling_algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/baybe/utils/sampling_algorithms.py b/baybe/utils/sampling_algorithms.py index 4ea2ae929..6adf5da23 100644 --- a/baybe/utils/sampling_algorithms.py +++ b/baybe/utils/sampling_algorithms.py @@ -23,8 +23,8 @@ def farthest_point_sampling( Args: points: The points that are available for selection, represented as a 2-D array - of shape (n, k), where n is the number of points and k is the dimensionality - of the points. + of shape ``(n, k)``, where ``n`` is the number of points and ``k`` is the + dimensionality of the points. n_samples: The total number of points to be selected. initialization: Determines how the first points are selected: * ``"farthest"``: The first two selected points are those with the