Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve FPS #336

Merged
merged 15 commits into from
Aug 21, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Fixed
- `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single
string input to categories/tasks
- `farthest_point_sampling` no longer depends on the provided point order

## [0.10.0] - 2024-08-02
### Breaking Changes
Expand Down
75 changes: 52 additions & 23 deletions baybe/utils/sampling_algorithms.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""A collection of point sampling algorithms."""

import warnings
from enum import Enum
from typing import Literal

Expand All @@ -13,51 +14,78 @@ def farthest_point_sampling(
n_samples: int = 1,
initialization: Literal["farthest", "random"] = "farthest",
) -> list[int]:
"""Sample points according to a farthest point heuristic.
"""Select a subset of points using farthest point sampling.

Creates a subset of a collection of points by successively adding points with the
largest Euclidean distance to intermediate point selections encountered during
the algorithmic process.
Creates a subset of a given collection of points by successively adding points with
the largest Euclidean distance to intermediate point selections encountered during
the algorithmic process. The mechanism used for the initial point selection is
configurable.

Args:
points: The points that are available for selection, represented as a 2D array
whose first dimension corresponds to the point index.
points: The points that are available for selection, represented as a 2-D array
of shape ``(n, k)``, where ``n`` is the number of points and ``k`` is the
dimensionality of the points.
n_samples: The total number of points to be selected.
initialization: Determines how the first points are selected. When
``"farthest"`` is chosen, the first two selected points are those with the
largest distance. If only a single point is requested, it is selected
randomly from these two. When ``"random"`` is chosen, the first point is
selected uniformly at random.
initialization: Determines how the first points are selected:
* ``"farthest"``: The first two selected points are those with the
largest distance. If only a single point is requested, a deterministic
choice is made based on the point coordinates.
* ``"random"``: The first point is selected uniformly at random.

Returns:
A list containing the positional indices of the selected points.

Raises:
ValueError: If an unknown initialization recommender is used.
ValueError: If the provided array is not two-dimensional.
ValueError: If the array contains no points.
ValueError: If the input space has no dimensions.
ValueError: If an unknown method for initialization is specified.
"""
# Compute the pairwise distances between all points
if (n_dims := np.ndim(points)) != 2:
raise ValueError(
f"The provided array must be two-dimensional but the given input had "
f"{n_dims} dimensions."
)
if (n_points := len(points)) == 0:
raise ValueError("The provided array must contain at least one row.")
if points.shape[-1] == 0:
raise ValueError("The provided input space must be at least one-dimensional.")
if n_samples > n_points:
raise ValueError(
f"The number of requested samples ({n_samples}) cannot be larger than the "
f"total number of points provided ({n_points})."
)

# Catch the pathological case upfront
if len(np.unique(points, axis=0)) == 1:
warnings.warn("All points are identical.", UserWarning)
return list(range(n_samples))

# Sort the points to produce the same result regardless of the input order
sort_idx = np.lexsort(tuple(points.T))
points = points[sort_idx]
AVHopp marked this conversation as resolved.
Show resolved Hide resolved

# Pre-compute the pairwise distances between all points
dist_matrix = pairwise_distances(points)

# Initialize the point selection subset
# Avoid wrong behavior in situations where all (remaining) points are duplicates
np.fill_diagonal(dist_matrix, -np.inf)
AVHopp marked this conversation as resolved.
Show resolved Hide resolved

# Initialize the point selection
if initialization == "random":
selected_point_indices = [np.random.randint(0, len(points))]
selected_point_indices = [np.random.randint(0, n_points)]
elif initialization == "farthest":
idx_1d = np.argmax(dist_matrix)
selected_point_indices = list(
map(int, np.unravel_index(idx_1d, dist_matrix.shape))
)
if n_samples == 1:
return np.random.choice(selected_point_indices, 1).tolist()
elif n_samples < 1:
raise ValueError(
f"Farthest point sampling must be done with >= 1 samples, but "
f"{n_samples=} was given."
)
return [sort_idx[selected_point_indices[0]]]
else:
raise ValueError(f"unknown initialization recommender: '{initialization}'")

# Initialize the list of remaining points
remaining_point_indices = list(range(len(points)))
remaining_point_indices = list(range(n_points))
for idx in selected_point_indices:
remaining_point_indices.remove(idx)

Expand All @@ -76,7 +104,8 @@ def farthest_point_sampling(
selected_point_indices.append(selected_point_index)
remaining_point_indices.remove(selected_point_index)

return selected_point_indices
# Undo the initial point reordering
return sort_idx[selected_point_indices].tolist()


class DiscreteSamplingMethod(Enum):
Expand Down
74 changes: 73 additions & 1 deletion tests/utils/test_sampling_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,19 @@

import math

import hypothesis.extra.numpy as hnp
import hypothesis.strategies as st
import numpy as np
import pandas as pd
import pytest
from hypothesis import example, given
from sklearn.metrics import pairwise_distances

from baybe.utils.sampling_algorithms import DiscreteSamplingMethod, sample_numerical_df
from baybe.utils.sampling_algorithms import (
DiscreteSamplingMethod,
farthest_point_sampling,
sample_numerical_df,
)


@pytest.mark.parametrize("fraction", [0.2, 0.8, 1.0, 1.2, 2.0, 2.4, 3.5])
Expand All @@ -31,3 +39,67 @@ def test_discrete_sampling(fraction, method):
assert len(sampled) == len(
sampled.drop_duplicates()
), "Undersized sampling did not return unique points."


@given(
    points=hnp.arrays(
        dtype=float,
        shape=hnp.array_shapes(min_dims=2, max_dims=2, min_side=1),
        # The Euclidean distance computation squares the coordinates, so the
        # admissible float range is capped to keep the squares from overflowing
        elements=st.floats(min_value=-1e100, max_value=1e100, allow_nan=False),
    )
)
# Hard-coded corner case with equidistant points (see in-test comments for why)
@example(points=np.array([[0, 0], [0, 1], [1, 0], [1, 1]]))
def test_farthest_point_sampling(points: np.ndarray):
    """FPS yields an order-independent point sequence in which each selected
    point satisfies the "farthest point" criterion of its iteration.
    """  # noqa
    # Arrange all points into a selection sequence via FPS
    fps_order = farthest_point_sampling(points, len(points))
    ordered = points[fps_order]

    # Farthest-point criterion for the ordered collection:
    # ----------------------------------------------------
    # In every iteration, the chosen point must lie at least as far from the
    # already-selected set (in terms of minimum distance) as any remaining
    # candidate point – that is precisely what makes it the "farthest point"
    # of that iteration.
    #
    # The first point has no predecessors to measure against and the last one
    # has no competing candidates, hence the check covers indices 1..len-2:
    distances = pairwise_distances(ordered)
    for step in range(1, len(distances) - 1):
        chosen_gap = distances[step, :step].min()
        candidate_gap = distances[step + 1 :, :step].min()
        assert chosen_gap >= candidate_gap

    # Determinism: shuffling the input must not change the produced sequence
    # of points. The comparison is made on coordinates rather than selection
    # indices, since indices may legitimately differ when the input contains
    # duplicated points.
    #
    # Tie-breaking operations such as `np.max` could otherwise flip results
    # for inputs like an equilateral triangle or the corners of a unit cube.
    shuffle = np.random.permutation(len(points))
    shuffled = points[shuffle]
    fps_order = farthest_point_sampling(shuffled, len(points))
    assert np.array_equal(ordered, shuffled[fps_order])

    # Requesting a single point takes a dedicated code path inside FPS, so it
    # is exercised separately here.
    fps_order = farthest_point_sampling(shuffled, 1)
    assert np.array_equal(ordered[[0]], shuffled[fps_order])

def test_farthest_point_sampling_pathological_case():
    """A degenerate point cloud (all points identical) triggers a warning."""
    degenerate = np.full((3, 3), 1.0)
    with pytest.warns(UserWarning, match="identical"):
        chosen = farthest_point_sampling(degenerate, 2)
    assert chosen == [0, 1]
Loading