Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve FPS #336

Merged
merged 15 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single
string input to categories/tasks
- `farthest_point_sampling` no longer depends on the provided point order

## [0.10.0] - 2024-08-02
### Breaking Changes
Expand Down
75 changes: 52 additions & 23 deletions baybe/utils/sampling_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""A collection of point sampling algorithms."""

import warnings
from enum import Enum
from typing import Literal

Expand All @@ -13,51 +14,78 @@ def farthest_point_sampling(
n_samples: int = 1,
initialization: Literal["farthest", "random"] = "farthest",
) -> list[int]:
"""Sample points according to a farthest point heuristic.
"""Select a subset of points using farthest point sampling.

Creates a subset of a collection of points by successively adding points with the
largest Euclidean distance to intermediate point selections encountered during
the algorithmic process.
Creates a subset of a given collection of points by successively adding points with
the largest Euclidean distance to intermediate point selections encountered during
the algorithmic process. The mechanism used for the initial point selection is
configurable.

Args:
points: The points that are available for selection, represented as a 2D array
whose first dimension corresponds to the point index.
points: The points that are available for selection, represented as a 2-D array
of shape ``(n, k)``, where ``n`` is the number of points and ``k`` is the
dimensionality of the points.
n_samples: The total number of points to be selected.
initialization: Determines how the first points are selected. When
``"farthest"`` is chosen, the first two selected points are those with the
largest distance. If only a single point is requested, it is selected
randomly from these two. When ``"random"`` is chosen, the first point is
selected uniformly at random.
initialization: Determines how the first points are selected:
* ``"farthest"``: The first two selected points are those with the
largest distance. If only a single point is requested, a deterministic
choice is made based on the point coordinates.
* ``"random"``: The first point is selected uniformly at random.

Returns:
A list containing the positional indices of the selected points.

Raises:
ValueError: If an unknown initialization recommender is used.
ValueError: If the provided array is not two-dimensional.
ValueError: If the array contains no points.
ValueError: If the input space has no dimensions.
ValueError: If an unknown method for initialization is specified.
"""
# Compute the pairwise distances between all points
if (n_dims := np.ndim(points)) != 2:
raise ValueError(
f"The provided array must be two-dimensional but the given input had "
f"{n_dims} dimensions."
)
if (n_points := len(points)) == 0:
raise ValueError("The provided array must contain at least one row.")
if points.shape[-1] == 0:
raise ValueError("The provided input space must be at least one-dimensional.")
if n_samples > n_points:
raise ValueError(
f"The number of requested samples ({n_samples}) cannot be larger than the "
f"total number of points provided ({n_points})."
)

# Catch the pathological case upfront
if len(np.unique(points, axis=0)) == 1:
warnings.warn("All points are identical.", UserWarning)
return list(range(n_samples))

# Sort the points to produce the same result regardless of the input order
sort_idx = np.lexsort(tuple(points.T))
points = points[sort_idx]
AVHopp marked this conversation as resolved.
Show resolved Hide resolved

# Pre-compute the pairwise distances between all points
dist_matrix = pairwise_distances(points)

# Initialize the point selection subset
# Avoid wrong behavior in situations where all (remaining) points are duplicates
np.fill_diagonal(dist_matrix, -np.inf)
AVHopp marked this conversation as resolved.
Show resolved Hide resolved

# Initialize the point selection
if initialization == "random":
selected_point_indices = [np.random.randint(0, len(points))]
selected_point_indices = [np.random.randint(0, n_points)]
elif initialization == "farthest":
idx_1d = np.argmax(dist_matrix)
selected_point_indices = list(
map(int, np.unravel_index(idx_1d, dist_matrix.shape))
)
if n_samples == 1:
return np.random.choice(selected_point_indices, 1).tolist()
elif n_samples < 1:
raise ValueError(
f"Farthest point sampling must be done with >= 1 samples, but "
f"{n_samples=} was given."
)
return [sort_idx[selected_point_indices[0]]]
else:
raise ValueError(f"unknown initialization recommender: '{initialization}'")

# Initialize the list of remaining points
remaining_point_indices = list(range(len(points)))
remaining_point_indices = list(range(n_points))
for idx in selected_point_indices:
remaining_point_indices.remove(idx)

Expand All @@ -76,7 +104,8 @@ def farthest_point_sampling(
selected_point_indices.append(selected_point_index)
remaining_point_indices.remove(selected_point_index)

return selected_point_indices
# Undo the initial point reordering
return sort_idx[selected_point_indices].tolist()


class DiscreteSamplingMethod(Enum):
Expand Down
74 changes: 73 additions & 1 deletion tests/utils/test_sampling_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@

import math

import hypothesis.extra.numpy as hnp
import hypothesis.strategies as st
import numpy as np
import pandas as pd
import pytest
from hypothesis import example, given
from sklearn.metrics import pairwise_distances

from baybe.utils.sampling_algorithms import DiscreteSamplingMethod, sample_numerical_df
from baybe.utils.sampling_algorithms import (
DiscreteSamplingMethod,
farthest_point_sampling,
sample_numerical_df,
)


@pytest.mark.parametrize("fraction", [0.2, 0.8, 1.0, 1.2, 2.0, 2.4, 3.5])
Expand All @@ -31,3 +39,67 @@ def test_discrete_sampling(fraction, method):
assert len(sampled) == len(
sampled.drop_duplicates()
), "Undersized sampling did not return unique points."


@given(
    points=hnp.arrays(
        dtype=float,
        shape=hnp.array_shapes(min_dims=2, max_dims=2, min_side=1),
        # The Euclidean distance computation squares the coordinates, so the
        # admissible float range is capped to keep the squares from overflowing
        elements=st.floats(min_value=-1e100, max_value=1e100, allow_nan=False),
    )
)
# Hard-coded corner case with equidistant points (see in-test comments for why)
@example(points=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
def test_farthest_point_sampling(points: np.ndarray):
    """FPS yields an order-independent point sequence in which each selected
    point satisfies the "farthest point" criterion of its iteration.
    """  # noqa
    # Arrange all points into a selection sequence via FPS
    fps_order = farthest_point_sampling(points, len(points))
    ordered = points[fps_order]

    # Farthest-point criterion for the ordered collection:
    # ----------------------------------------------------
    # In every iteration, the chosen point must lie at least as far from the
    # already-selected set (in terms of minimum distance) as any remaining
    # candidate point – that is precisely what makes it the "farthest point"
    # of that iteration.
    #
    # The first point has no predecessors to measure against and the last one
    # has no competing candidates, hence the check covers indices 1..len-2:
    distances = pairwise_distances(ordered)
    for step in range(1, len(distances) - 1):
        chosen_gap = distances[step, :step].min()
        candidate_gap = distances[step + 1 :, :step].min()
        assert chosen_gap >= candidate_gap

    # Determinism: shuffling the input must not change the produced sequence
    # of points. The comparison is made on coordinates rather than selection
    # indices, since indices may legitimately differ when the input contains
    # duplicated points.
    #
    # Tie-breaking operations such as `np.max` could otherwise flip results
    # for inputs like an equilateral triangle or the corners of a unit cube.
    shuffle = np.random.permutation(len(points))
    shuffled = points[shuffle]
    fps_order = farthest_point_sampling(shuffled, len(points))
    assert np.array_equal(ordered, shuffled[fps_order])

    # Requesting a single point takes a dedicated code path inside FPS, so it
    # is exercised separately here.
    fps_order = farthest_point_sampling(shuffled, 1)
    assert np.array_equal(ordered[[0]], shuffled[fps_order])

def test_farthest_point_sampling_pathological_case():
    """A degenerate point cloud (all points identical) triggers a warning."""
    degenerate = np.full((3, 3), 1.0)
    with pytest.warns(UserWarning, match="identical"):
        chosen = farthest_point_sampling(degenerate, 2)
    assert chosen == [0, 1]
Loading