From 9baf493e2a964986f97f7a1f8e7f31569843c103 Mon Sep 17 00:00:00 2001 From: Thomas Nickerson <64759920+ThomasNickerson@users.noreply.github.com> Date: Thu, 21 Oct 2021 14:39:15 -0300 Subject: [PATCH 1/4] Fix umap.update by adding compressed=False to NNDescent args --- umap/umap_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/umap/umap_.py b/umap/umap_.py index 26af4f78..862c0edd 100644 --- a/umap/umap_.py +++ b/umap/umap_.py @@ -337,6 +337,7 @@ def nearest_neighbors( low_memory=low_memory, n_jobs=n_jobs, verbose=verbose, + compressed=False ) knn_indices, knn_dists = knn_search_index.neighbor_graph From b9e85f19ee438ad21dc2b5424cb0ef2751d9d708 Mon Sep 17 00:00:00 2001 From: Thomas Nickerson <64759920+ThomasNickerson@users.noreply.github.com> Date: Thu, 21 Oct 2021 15:50:20 -0300 Subject: [PATCH 2/4] Add test to cover n>4096 path in update, run black --- umap/tests/conftest.py | 10 ++++++++++ umap/tests/test_umap_on_iris.py | 4 +++- umap/tests/test_umap_ops.py | 18 ++++++++++++++++++ umap/umap_.py | 6 +++--- 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/umap/tests/conftest.py b/umap/tests/conftest.py index e5647ca2..d5ca6429 100644 --- a/umap/tests/conftest.py +++ b/umap/tests/conftest.py @@ -165,6 +165,16 @@ def iris_subset_model(iris, iris_selection): ) +@pytest.fixture(scope="session") +def iris_subset_model_large(iris, iris_selection): + return UMAP( + n_neighbors=10, + min_dist=0.01, + random_state=42, + force_approximation_algorithm=True, + ).fit(iris.data[iris_selection]) + + @pytest.fixture(scope="session") def supervised_iris_model(iris): return UMAP(n_neighbors=10, min_dist=0.01, n_epochs=200, random_state=42).fit( diff --git a/umap/tests/test_umap_on_iris.py b/umap/tests/test_umap_on_iris.py index 15318181..4c929b29 100644 --- a/umap/tests/test_umap_on_iris.py +++ b/umap/tests/test_umap_on_iris.py @@ -66,7 +66,9 @@ def test_umap_trustworthiness_on_sphere_iris( r * np.cos(embedding[:, 0]), ] ).T - trust = trustworthiness(iris.data, projected_embedding, n_neighbors=10, metric="cosine") + trust = trustworthiness( + iris.data, projected_embedding, n_neighbors=10, metric="cosine" + ) assert ( trust >= 0.80 ), "Insufficiently trustworthy spherical embedding for iris dataset: {}".format( diff --git a/umap/tests/test_umap_ops.py b/umap/tests/test_umap_ops.py index ca815c76..4af35a94 100644 --- a/umap/tests/test_umap_ops.py +++ b/umap/tests/test_umap_ops.py @@ -239,6 +239,24 @@ def test_umap_update(iris, iris_subset_model, iris_selection, iris_model): assert error < 1.0 +def test_umap_update_large(iris, iris_subset_model_large, iris_selection, iris_model): + + new_data = iris.data[~iris_selection] + new_model = iris_subset_model_large + new_model.update(new_data) + + comparison_graph = scipy.sparse.vstack( + [iris_model.graph_[iris_selection], iris_model.graph_[~iris_selection]] + ) + comparison_graph = scipy.sparse.hstack( + [comparison_graph[:, iris_selection], comparison_graph[:, ~iris_selection]] + ) + + error = np.sum(np.abs((new_model.graph_ - comparison_graph).data)) + + assert error < 1.0 + + # ----------------- # UMAP Graph output # ----------------- diff --git a/umap/umap_.py b/umap/umap_.py index 862c0edd..09e8d4fb 100644 --- a/umap/umap_.py +++ b/umap/umap_.py @@ -337,7 +337,7 @@ def nearest_neighbors( low_memory=low_memory, n_jobs=n_jobs, verbose=verbose, - compressed=False + compressed=False, ) knn_indices, knn_dists = knn_search_index.neighbor_graph @@ -1528,8 +1528,8 @@ class UMAP(BaseEstimator): target_weight: float (optional, default 0.5) weighting factor between data topology and target topology. A value of - 0.0 weights predominantly on data, a value of 1.0 places a strong emphasis on - target. The default of 0.5 balances the weighting equally between data and + 0.0 weights predominantly on data, a value of 1.0 places a strong emphasis on + target. The default of 0.5 balances the weighting equally between data and target. transform_seed: int (optional, default 42) From 0df2d7071e64ca803be8d2f074f81becdf81dc44 Mon Sep 17 00:00:00 2001 From: Thomas Nickerson <64759920+ThomasNickerson@users.noreply.github.com> Date: Thu, 21 Oct 2021 16:41:25 -0300 Subject: [PATCH 3/4] Fix new test --- umap/tests/conftest.py | 10 ++++++++++ umap/tests/test_umap_ops.py | 11 ++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/umap/tests/conftest.py b/umap/tests/conftest.py index d5ca6429..1ad9ab43 100644 --- a/umap/tests/conftest.py +++ b/umap/tests/conftest.py @@ -158,6 +158,16 @@ def iris_model(iris): return UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit(iris.data) +@pytest.fixture(scope="session") +def iris_model_large(iris): + return UMAP( + n_neighbors=10, + min_dist=0.01, + random_state=42, + force_approximation_algorithm=True, + ).fit(iris.data) + + @pytest.fixture(scope="session") def iris_subset_model(iris, iris_selection): return UMAP(n_neighbors=10, min_dist=0.01, random_state=42).fit( diff --git a/umap/tests/test_umap_ops.py b/umap/tests/test_umap_ops.py index 4af35a94..ceafbfad 100644 --- a/umap/tests/test_umap_ops.py +++ b/umap/tests/test_umap_ops.py @@ -239,14 +239,19 @@ def test_umap_update(iris, iris_subset_model, iris_selection, iris_model): assert error < 1.0 -def test_umap_update_large(iris, iris_subset_model_large, iris_selection, iris_model): +def test_umap_update_large( + iris, iris_subset_model_large, iris_selection, iris_model_large +): new_data = iris.data[~iris_selection] new_model = iris_subset_model_large new_model.update(new_data) comparison_graph = scipy.sparse.vstack( - [iris_model.graph_[iris_selection], iris_model.graph_[~iris_selection]] + [ + iris_model_large.graph_[iris_selection], + iris_model_large.graph_[~iris_selection], + ] ) comparison_graph = scipy.sparse.hstack( [comparison_graph[:, iris_selection], comparison_graph[:, ~iris_selection]] @@ -254,7 +259,7 @@ def test_umap_update_large(iris, iris_subset_model_large, iris_selection, iris_m error = np.sum(np.abs((new_model.graph_ - comparison_graph).data)) - assert error < 1.0 + assert error < 1.5 # ----------------- From 9342d9717dd4d7f66997eb2e200eca94166129cc Mon Sep 17 00:00:00 2001 From: Thomas Nickerson <64759920+ThomasNickerson@users.noreply.github.com> Date: Fri, 22 Oct 2021 17:12:23 -0300 Subject: [PATCH 4/4] Prepare knn search index before updating --- umap/umap_.py | 1 + 1 file changed, 1 insertion(+) diff --git a/umap/umap_.py b/umap/umap_.py index 09e8d4fb..465cb785 100644 --- a/umap/umap_.py +++ b/umap/umap_.py @@ -3311,6 +3311,7 @@ def update(self, X): ) else: + self._knn_search_index.prepare() self._knn_search_index.update(X) self._raw_data = self._knn_search_index._raw_data (