[ENH] Implement Proximity Forest classifier #1729

Merged 34 commits, Aug 2, 2024

Commits (34)
dc8bd1d  Proximity Forest draft (itsdivya1309, Jun 26, 2024)
d4ce3e3  Merge branch 'aeon-toolkit:main' into proximityForest (itsdivya1309, Jun 27, 2024)
7406dff  Update init (itsdivya1309, Jun 28, 2024)
7ed886f  Merge branch 'aeon-toolkit:main' into proximityForest (itsdivya1309, Jul 1, 2024)
d063b99  Tests for forest (itsdivya1309, Jul 2, 2024)
81f86ee  Docstring (itsdivya1309, Jul 2, 2024)
1ab94aa  Fix initialization error (itsdivya1309, Jul 2, 2024)
3359fe3  Merge branch 'main' into proximityForest (itsdivya1309, Jul 2, 2024)
80f1ca8  Update tags (itsdivya1309, Jul 2, 2024)
db136c6  Fix tests (itsdivya1309, Jul 3, 2024)
4631e8d  Review comments resolved (itsdivya1309, Jul 5, 2024)
b7d0461  Review comments resolved (itsdivya1309, Jul 5, 2024)
59a8175  Merge branch 'main' into proximityForest (itsdivya1309, Jul 5, 2024)
8905461  Parallelization using joblib (itsdivya1309, Jul 5, 2024)
0294311  Merge branch 'aeon-toolkit:main' into proximityForest (itsdivya1309, Jul 8, 2024)
2d74d4d  pickling objects (itsdivya1309, Jul 8, 2024)
7953c00  Merge branch 'aeon-toolkit:main' into proximityForest (itsdivya1309, Jul 11, 2024)
b7505ad  Parallel threading (itsdivya1309, Jul 11, 2024)
c853e55  Using unit test dataset (itsdivya1309, Jul 11, 2024)
8959efa  Merge branch 'main' into proximityForest (itsdivya1309, Jul 12, 2024)
1cb74a4  Merge branch 'main' into proximityForest (MatthewMiddlehurst, Jul 13, 2024)
e5a095f  parallel_backend parameter (itsdivya1309, Jul 15, 2024)
3d449e8  classes_ (itsdivya1309, Jul 15, 2024)
28bbad5  First check pure nodes (itsdivya1309, Jul 15, 2024)
7790ba9  No overwriting of base class attributes (itsdivya1309, Jul 16, 2024)
cb9b361  Remove n_jobs for tree (itsdivya1309, Jul 16, 2024)
9db1645  Majority Voting (itsdivya1309, Jul 21, 2024)
ff4c83a  Merge branch 'main' into proximityForest (itsdivya1309, Jul 21, 2024)
f3c54f8  Merge branch 'main' into proximityForest (itsdivya1309, Jul 22, 2024)
0a998bb  threading (itsdivya1309, Jul 25, 2024)
98cf52b  More randomness (itsdivya1309, Jul 26, 2024)
a6a9821  Revert "More randomness" (itsdivya1309, Jul 26, 2024)
966c6dd  Revert "threading" (itsdivya1309, Jul 26, 2024)
57b3e2d  Merge branch 'aeon-toolkit:main' into proximityForest (itsdivya1309, Jul 29, 2024)

Changes from all commits

8 changes: 7 additions & 1 deletion aeon/classification/distance_based/__init__.py
@@ -1,8 +1,14 @@
"""Distance based time series classifiers."""

__all__ = ["ElasticEnsemble", "KNeighborsTimeSeriesClassifier", "ProximityTree"]
__all__ = [
"ElasticEnsemble",
"KNeighborsTimeSeriesClassifier",
"ProximityTree",
"ProximityForest",
]

from aeon.classification.distance_based._elastic_ensemble import ElasticEnsemble
from aeon.classification.distance_based._proximity_forest import ProximityForest
from aeon.classification.distance_based._proximity_tree import ProximityTree
from aeon.classification.distance_based._time_series_neighbors import (
KNeighborsTimeSeriesClassifier,
156 changes: 156 additions & 0 deletions aeon/classification/distance_based/_proximity_forest.py
@@ -0,0 +1,156 @@
"""Proximity Forest Classifier.

The Proximity Forest is an ensemble of Proximity Trees.
"""

__all__ = ["ProximityForest"]

from typing import Type, Union

import numpy as np
from joblib import Parallel, delayed
from sklearn.utils import check_random_state

from aeon.classification.base import BaseClassifier
from aeon.classification.distance_based._proximity_tree import ProximityTree


class ProximityForest(BaseClassifier):
"""Proximity Forest Classifier.

The Proximity Forest is a distance-based classifier that creates an
ensemble of decision trees, where the splits are based on the
similarity between time series measured using various parameterised
distance measures.

Parameters
----------
n_trees: int, default = 100
The number of trees in the ensemble.
n_splitters: int, default = 5
The number of candidate splitters to be evaluated at each node.
max_depth: int, default = None
The maximum depth of each tree. If None, nodes are expanded until all
leaves are pure or until all leaves contain fewer than min_samples_split samples.
min_samples_split: int, default = 2
The minimum number of samples required to split an internal node.
random_state : int, RandomState instance or None, default=None
If `int`, random_state is the seed used by the random number generator;
If `RandomState` instance, random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.
n_jobs : int, default = 1
The number of parallel jobs to run when fitting the trees and when
computing predictions. ``None`` means 1 unless in a
:obj:`joblib.parallel_backend` context. ``-1`` means using all processors.
See :term:`Glossary <n_jobs>` for more details.
parallel_backend : str, ParallelBackendBase instance or None, default=None
Specify the parallelisation backend implementation in joblib; if None, a
'prefer' value of "threads" is used by default.
Valid options are "loky", "multiprocessing", "threading" or a custom backend.
See the joblib Parallel documentation for more details.

Notes
-----
For the Java version, see
`ProximityForest
<https://github.com/fpetitjean/ProximityForest>`_.

References
----------
.. [1] Lucas, B., Shifaz, A., Pelletier, C., O’Neill, L., Zaidi, N., Goethals, B.,
Petitjean, F. and Webb, G.I., 2019. Proximity forest: an effective and scalable
distance-based classifier for time series. Data Mining and Knowledge Discovery,
33(3), pp.607-635.

Examples
--------
>>> from aeon.datasets import load_unit_test
>>> from aeon.classification.distance_based import ProximityForest
>>> X_train, y_train = load_unit_test(split="train")
>>> X_test, y_test = load_unit_test(split="test")
>>> classifier = ProximityForest(n_trees=10, n_splitters=3)
>>> classifier.fit(X_train, y_train)
ProximityForest(...)
>>> y_pred = classifier.predict(X_test)
"""

_tags = {
"capability:multivariate": False,
"capability:unequal_length": False,
"capability:multithreading": True,
"algorithm_type": "distance",
"X_inner_type": ["numpy2D"],
}

def __init__(
self,
n_trees=100,
n_splitters: int = 5,
max_depth: int = None,
min_samples_split: int = 2,
random_state: Union[int, Type[np.random.RandomState], None] = None,
n_jobs: int = 1,
parallel_backend=None,
):
self.n_trees = n_trees
self.n_splitters = n_splitters
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.random_state = random_state
self.n_jobs = n_jobs
self.parallel_backend = parallel_backend
super().__init__()

def _fit(self, X, y):
rng = check_random_state(self.random_state)
self.trees_ = Parallel(
n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads"
)(
delayed(_fit_tree)(
X,
y,
self.n_splitters,
self.max_depth,
self.min_samples_split,
check_random_state(rng.randint(np.iinfo(np.int32).max)),
)
for _ in range(self.n_trees)
)

def _predict_proba(self, X):
classes = list(self.classes_)
preds = Parallel(
n_jobs=self._n_jobs, backend=self.parallel_backend, prefer="threads"
)(delayed(_predict_tree)(tree, X) for tree in self.trees_)
n_cases = X.shape[0]
votes = np.zeros((n_cases, self.n_classes_))
for i in range(len(preds)):
predictions = np.array(
[classes.index(class_label) for class_label in preds[i]]
)
for j in range(n_cases):
votes[j, predictions[j]] += 1
output_probas = votes / self.n_trees
return output_probas

def _predict(self, X):
probas = self._predict_proba(X)
idx = np.argmax(probas, axis=1)
preds = np.asarray([self.classes_[x] for x in idx])
return preds


def _fit_tree(X, y, n_splitters, max_depth, min_samples_split, random_state):
clf = ProximityTree(
n_splitters=n_splitters,
max_depth=max_depth,
min_samples_split=min_samples_split,
random_state=random_state,
)
clf.fit(X, y)
return clf


def _predict_tree(tree, X):
return tree.predict(X)
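
For anyone trying the branch locally, a minimal usage sketch of the new classifier, following the docstring example above (the parameter values here are illustrative, not the defaults):

from aeon.classification.distance_based import ProximityForest
from aeon.datasets import load_unit_test

# Small univariate dataset used throughout this PR's examples and tests.
X_train, y_train = load_unit_test(split="train")
X_test, y_test = load_unit_test(split="test")

# Illustrative settings: a small ensemble with few candidate splitters per node.
clf = ProximityForest(n_trees=10, n_splitters=3, random_state=0)
clf.fit(X_train, y_train)

# Probabilities are vote fractions: each tree casts one vote per case and
# the vote counts are divided by n_trees in _predict_proba.
proba = clf.predict_proba(X_test)
y_pred = clf.predict(X_test)
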
39 changes: 8 additions & 31 deletions aeon/classification/distance_based/_proximity_tree.py
@@ -82,11 +82,6 @@ class ProximityTree(BaseClassifier):
If `RandomState` instance, random_state is the random number generator;
If `None`, the random number generator is the `RandomState` instance used
by `np.random`.
n_jobs : int, default = 1
The number of parallel jobs to run for neighbors search.
``None`` means 1 unless in a :obj:`joblib.parallel_backend` context.
``-1`` means using all processors. See :term:`Glossary <n_jobs>`
for more details. Parameter for compatibility purposes, still unimplemented.

Notes
-----
@@ -117,7 +112,7 @@ class ProximityTree(BaseClassifier):
"capability:multivariate": False,
"capability:unequal_length": False,
"algorithm_type": "distance",
"X_inner_type": ["numpy2D", "numpy3D"],
"X_inner_type": ["numpy2D"],
}

def __init__(
@@ -126,13 +121,11 @@ def __init__(
max_depth: int = None,
min_samples_split: int = 2,
random_state: Union[int, Type[np.random.RandomState], None] = None,
n_jobs: int = 1,
) -> None:
self.n_splitters = n_splitters
self.max_depth = max_depth
self.min_samples_split = min_samples_split
self.random_state = random_state
self.n_jobs = n_jobs
super().__init__()

def _get_parameter_value(self, X):
@@ -276,8 +269,8 @@ def _build_tree(self, X, y, depth, node_id, parent_target_value=None):
for label, count in zip(*np.unique(y, return_counts=True))
}

# If min sample splits is reached
if self.min_samples_split >= len(X):
# Pure node
if len(np.unique(y)) == 1:
leaf_label = target_value
leaf = _Node(
node_id=node_id,
@@ -287,8 +280,8 @@ def _build_tree(self, X, y, depth, node_id, parent_target_value=None):
)
return leaf

# If max depth is reached
if (self.max_depth is not None) and (depth >= self.max_depth):
# If min sample splits is reached
if self.min_samples_split >= len(X):
leaf_label = target_value
leaf = _Node(
node_id=node_id,
@@ -298,8 +291,8 @@ def _build_tree(self, X, y, depth, node_id, parent_target_value=None):
)
return leaf

# Pure node
if len(np.unique(y)) == 1:
# If max depth is reached
if (self.max_depth is not None) and (depth >= self.max_depth):
leaf_label = target_value
leaf = _Node(
node_id=node_id,
@@ -371,16 +364,6 @@ def _find_target_value(y):
return mode_value

def _fit(self, X, y):
# Check dimension of X
if X.ndim == 3:
if X.shape[1] == 1:
X = np.squeeze(X, axis=1)
else:
raise ValueError("X should be univariate.")

# Set the unique class labels
self.classes_ = list(np.unique(y))

self.root = self._build_tree(
X, y, depth=0, node_id="0", parent_target_value=None
)
@@ -391,14 +374,8 @@ def _predict(self, X):
return np.array([self.classes_[pred] for pred in predictions])

def _predict_proba(self, X):
# Check dimension of X
if X.ndim == 3:
if X.shape[1] == 1:
X = np.squeeze(X, axis=1)
else:
raise ValueError("X should be univariate.")
# Get the unique class labels
classes = self.classes_
classes = list(self.classes_)
class_count = len(classes)
probas = []

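
The reordered stopping conditions in _build_tree above (commit "First check pure nodes") test node purity before the minimum-split size and the depth limit. A rough sketch of that control flow, written only to summarise the order of the checks shown in the diff (this is not the actual aeon helper):

import numpy as np


def should_make_leaf(y, n_cases, depth, min_samples_split, max_depth):
    # 1. Pure node: all labels identical, so no split can improve it.
    if len(np.unique(y)) == 1:
        return True
    # 2. Too few cases left to split.
    if min_samples_split >= n_cases:
        return True
    # 3. Depth limit reached (max_depth=None means unbounded).
    if max_depth is not None and depth >= max_depth:
        return True
    return False
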
27 changes: 27 additions & 0 deletions aeon/classification/distance_based/tests/test_proximity_forest.py
@@ -0,0 +1,27 @@
"""Test for Proximity Forest."""

import pytest
from sklearn.metrics import accuracy_score

from aeon.classification.distance_based import ProximityForest
from aeon.datasets import load_unit_test


def test_univariate():
"""Test that the function gives appropriate error message."""
X, y = load_unit_test()
X_multivariate = X.reshape((-1, 2, 12))
clf = ProximityForest(n_trees=5, random_state=42, n_jobs=-1)
with pytest.raises(ValueError):
clf.fit(X_multivariate, y)


def test_proximity_forest():
"""Test the fit method of ProximityTree."""
X_train, y_train = load_unit_test()
X_test, y_test = load_unit_test(split="test")
clf = ProximityForest(n_trees=5, n_splitters=3, max_depth=4)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
assert score >= 0.9
26 changes: 8 additions & 18 deletions aeon/classification/distance_based/tests/test_proximity_tree.py
@@ -6,17 +6,7 @@

from aeon.classification.distance_based import ProximityTree
from aeon.classification.distance_based._proximity_tree import gini, gini_gain
from aeon.testing.data_generation import make_example_3d_numpy


@pytest.fixture
def time_series_dataset():
"""Generate time series dataset for testing."""
n_samples = 100 # Total number of samples (should be even)
n_timepoints = 24 # Length of each time series
n_channels = 1
data, labels = make_example_3d_numpy(n_samples, n_channels, n_timepoints)
return data, labels
from aeon.datasets import load_unit_test


def test_gini():
@@ -110,9 +100,9 @@ def test_get_parameter_value():
assert measure_params["c"] in [10**i for i in range(-2, 3)]


def test_get_cadidate_splitter(time_series_dataset):
def test_get_cadidate_splitter():
"""Test the method to generate candidate splitters."""
X, y = time_series_dataset
X, y = load_unit_test()
clf = ProximityTree()
splitter = clf._get_candidate_splitter(X, y)
assert len(splitter) == 2
@@ -132,9 +122,9 @@ def test_get_cadidate_splitter(time_series_dataset):
assert measure in expected_measures


def test_get_best_splitter(time_series_dataset):
def test_get_best_splitter():
"""Test the method to get optimum splitter of a node."""
X, y = time_series_dataset
X, y = load_unit_test()
clf = ProximityTree(n_splitters=3)

splitter = clf._get_best_splitter(X, y)
@@ -146,12 +136,12 @@ def test_get_best_splitter(time_series_dataset):
assert len(splitter) == 2


def test_proximity_tree(time_series_dataset):
def test_proximity_tree():
"""Test the fit method of ProximityTree."""
X, y = time_series_dataset
X, y = load_unit_test()
clf = ProximityTree(n_splitters=3, max_depth=4)
clf.fit(X, y)
X_test, y_test = time_series_dataset
X_test, y_test = load_unit_test(split="train")
y_pred = clf.predict(X_test)
score = accuracy_score(y_test, y_pred)
assert score >= 0.9
1 change: 1 addition & 0 deletions docs/api_reference/classification.rst
@@ -76,6 +76,7 @@ Distance-based

ElasticEnsemble
KNeighborsTimeSeriesClassifier
ProximityForest
ProximityTree

Feature-based