diff --git a/docs/source/sections/model_selection/searching.rst b/docs/source/sections/model_selection/searching.rst
index f3bbb58..e5b6635 100644
--- a/docs/source/sections/model_selection/searching.rst
+++ b/docs/source/sections/model_selection/searching.rst
@@ -15,11 +15,12 @@ versions of these methods to support sequence data.
 API reference
 -------------
 
-Classes
-^^^^^^^
+Classes/Functions
+^^^^^^^^^^^^^^^^^
 
 .. autosummary::
 
+   ~sequentia.model_selection.param_grid
    ~sequentia.model_selection.GridSearchCV
    ~sequentia.model_selection.RandomizedSearchCV
    ~sequentia.model_selection.HalvingGridSearchCV
@@ -81,6 +82,8 @@ cross-validate a :class:`.KNNClassifier` training pipeline. ::
 Definitions
 ^^^^^^^^^^^
 
+.. autofunction:: sequentia.model_selection.param_grid
+
 .. autoclass:: sequentia.model_selection.GridSearchCV
    :members: __init__
    :exclude-members: __new__
diff --git a/sequentia/model_selection/__init__.py b/sequentia/model_selection/__init__.py
index b1d7cd2..52dec87 100644
--- a/sequentia/model_selection/__init__.py
+++ b/sequentia/model_selection/__init__.py
@@ -5,7 +5,11 @@
 
 """Hyper-parameter search and dataset splitting utilities."""
 
-from sequentia.model_selection._search import GridSearchCV, RandomizedSearchCV
+from sequentia.model_selection._search import (
+    GridSearchCV,
+    RandomizedSearchCV,
+    param_grid,
+)
 from sequentia.model_selection._search_successive_halving import (
     HalvingGridSearchCV,
     HalvingRandomSearchCV,
@@ -30,4 +34,5 @@
     "RandomizedSearchCV",
     "HalvingGridSearchCV",
     "HalvingRandomSearchCV",
+    "param_grid",
 ]
diff --git a/sequentia/model_selection/_search.py b/sequentia/model_selection/_search.py
index 49f5b2b..e8be5ff 100644
--- a/sequentia/model_selection/_search.py
+++ b/sequentia/model_selection/_search.py
@@ -50,6 +50,7 @@
 # License: BSD 3 clause
 
 import time
+import typing as t
 from collections import defaultdict
 from itertools import product
 
@@ -66,7 +67,61 @@
 from sequentia.model_selection._validation import _fit_and_score
 
-__all__ = ["BaseSearchCV", "GridSearchCV", "RandomizedSearchCV"]
+__all__ = ["BaseSearchCV", "GridSearchCV", "RandomizedSearchCV", "param_grid"]
+
+
+def param_grid(**kwargs: list[t.Any]) -> list[dict[str, t.Any]]:
+    """Generates a hyper-parameter grid for a nested object.
+
+    Examples
+    --------
+    Using :func:`.param_grid` in a grid search to cross-validate over
+    settings for :class:`.GaussianMixtureHMM`, which is a nested model
+    specified in the constructor of a :class:`.HMMClassifier`. ::
+
+        from sklearn.pipeline import Pipeline
+        from sklearn.preprocessing import minmax_scale
+
+        from sequentia.enums import PriorMode, CovarianceMode, TopologyMode
+        from sequentia.models import HMMClassifier, GaussianMixtureHMM
+        from sequentia.preprocessing import IndependentFunctionTransformer
+        from sequentia.model_selection import GridSearchCV, StratifiedKFold
+
+        GridSearchCV(
+            estimator=Pipeline(
+                [
+                    ("scale", IndependentFunctionTransformer(minmax_scale)),
+                    ("clf", HMMClassifier(variant=GaussianMixtureHMM)),
+                ]
+            ),
+            param_grid={
+                "clf__prior": [PriorMode.UNIFORM, PriorMode.FREQUENCY],
+                "clf__model_kwargs": param_grid(
+                    n_states=[3, 5, 7],
+                    n_components=[2, 3, 4],
+                    covariance=[
+                        CovarianceMode.DIAGONAL, CovarianceMode.SPHERICAL
+                    ],
+                    topology=[
+                        TopologyMode.LEFT_RIGHT, TopologyMode.LINEAR
+                    ],
+                ),
+            },
+            cv=StratifiedKFold(),
+        )
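+
+    The grid is expanded as the Cartesian product of the provided values,
+    e.g.::
+
+        param_grid(n_states=[3, 5], n_components=[2, 3])
+        # [{"n_states": 3, "n_components": 2},
+        #  {"n_states": 3, "n_components": 3},
+        #  {"n_states": 5, "n_components": 2},
+        #  {"n_states": 5, "n_components": 3}]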
+ """ + return [ + dict(zip(kwargs.keys(), values)) + for values in product(*kwargs.values()) + ] class BaseSearchCV(_search.BaseSearchCV): diff --git a/sequentia/models/hmm/classifier.py b/sequentia/models/hmm/classifier.py index 1bb887e..0ccf032 100644 --- a/sequentia/models/hmm/classifier.py +++ b/sequentia/models/hmm/classifier.py @@ -22,7 +22,7 @@ from sequentia.datasets.base import SequentialDataset from sequentia.enums import PriorMode from sequentia.models.base import ClassifierMixin -from sequentia.models.hmm.variants.base import BaseHMM +from sequentia.models.hmm import variants class HMMClassifier(ClassifierMixin): @@ -35,8 +35,9 @@ class HMMClassifier(ClassifierMixin): Examples -------- - Using a :class:`.HMMClassifier` (with :class:`.GaussianMixtureHMM` - models) to classify spoken digits. :: + Using a :class:`.HMMClassifier` with :class:`.GaussianMixtureHMM` + models for each class (all with identical settings), + to classify spoken digits. :: import numpy as np from sequentia.datasets import load_digits @@ -47,7 +48,29 @@ class HMMClassifier(ClassifierMixin): # Fetch MFCCs of spoken digits data = load_digits() - train_data, test_data = data.split(test_size=0.2, random_state=random_state) + train_data, test_data = data.split( + test_size=0.2, random_state=random_state + ) + + # Create a HMMClassifier using: + # - a separate GaussianMixtureHMM for each class (with 3 states) + # - a class frequency prior + clf = HMMClassifier( + variant=GaussianMixtureHMM, + model_kwargs=dict(n_states=3, random_state=random_state) + prior='frequency', + ) + + # Fit the HMMs by providing observation sequences for all classes + clf.fit(train_data.X, train_data.y, lengths=train_data.lengths) + + # Predict classes for the test observation sequences + y_pred = clf.predict(test_data.X, lengths=test_data.lengths) + + For more complex problems, it might be necessary to specify different + hyper-parameters for each individual class HMM. This can be done by + using :func:`add_model` or :func:`add_models` to add HMM objects + after the :class:`HMMClassifier` has been initialized. :: # Create a HMMClassifier using a class frequency prior clf = HMMClassifier(prior='frequency') @@ -57,24 +80,18 @@ class HMMClassifier(ClassifierMixin): model = GaussianMixtureHMM(random_state=random_state) clf.add_model(model, label=label) - # Fit the HMMs by providing training observation sequences for all classes + # Fit the HMMs by providing observation sequences for all classes clf.fit(train_data.X, train_data.y, lengths=train_data.lengths) - # Predict classes for the test observation sequences - y_pred = clf.predict(test_data.X, lengths=test_data.lengths) - - As done in the above example, we can provide unfitted HMMs using - :func:`add_model` or :func:`add_models`, then provide training - observation sequences for all classes to :func:`fit`, which will - automatically train each HMM on the appropriate subset of data. - - Alternatively, we may provide pre-fitted HMMs and call :func:`fit` with - no arguments. :: + Alternatively, we might want to pre-fit the HMMs individually, + then add these fitted HMMs to the :class:`.HMMClassifier`. In this case, + :func:`fit` on the :class:`.HMMClassifier` is called without providing any + data as arguments, since the HMMs are already fitted. 
+
+    For more complex problems, it might be necessary to specify different
+    hyper-parameters for each individual class HMM. This can be done by
+    using :func:`add_model` or :func:`add_models` to add HMM objects
+    after the :class:`.HMMClassifier` has been initialized. ::
 
         # Create a HMMClassifier using a class frequency prior
         clf = HMMClassifier(prior='frequency')
@@ -57,24 +80,18 @@
             model = GaussianMixtureHMM(random_state=random_state)
             clf.add_model(model, label=label)
 
-        # Fit the HMMs by providing training observation sequences for all classes
+        # Fit the HMMs by providing observation sequences for all classes
         clf.fit(train_data.X, train_data.y, lengths=train_data.lengths)
 
-        # Predict classes for the test observation sequences
-        y_pred = clf.predict(test_data.X, lengths=test_data.lengths)
-
-    As done in the above example, we can provide unfitted HMMs using
-    :func:`add_model` or :func:`add_models`, then provide training
-    observation sequences for all classes to :func:`fit`, which will
-    automatically train each HMM on the appropriate subset of data.
-
-    Alternatively, we may provide pre-fitted HMMs and call :func:`fit` with
-    no arguments. ::
+    Alternatively, we might want to pre-fit the HMMs individually, then
+    add these fitted HMMs to the :class:`.HMMClassifier`. In this case,
+    :func:`fit` on the :class:`.HMMClassifier` is called without providing
+    any data as arguments, since the HMMs are already fitted. ::
 
         # Create a HMMClassifier using a class frequency prior
         clf = HMMClassifier(prior='frequency')
 
-        # Manually fit each HMM on its own subset of data
+        # Manually fit each HMM on its own subset of data
         for X_train, lengths_train, label in train_data.iter_by_class():
             model = GaussianMixtureHMM(random_state=random_state)
             model.fit(X_train, lengths=lengths_train)
             clf.add_model(model, label=label)
@@ -82,12 +99,16 @@
 
         # Fit the classifier
         clf.fit()
-    """  # noqa: E501
+    """
 
     @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
     def __init__(
         self: pyd.SkipValidation,
         *,
+        variant: type[variants.CategoricalHMM]
+        | type[variants.GaussianMixtureHMM]
+        | None = None,
+        model_kwargs: dict[str, t.Any] | None = None,
         prior: (
             PriorMode | dict[int, pyd.confloat(ge=0, le=1)]
         ) = PriorMode.UNIFORM,  # placeholder
@@ -100,10 +121,21 @@
         ----------
         self: HMMClassifier
 
+        variant:
+            Variant of HMM to use for modelling each class. If not
+            specified, models must instead be added using the
+            :func:`add_model` or :func:`add_models` methods after the
+            :class:`.HMMClassifier` has been initialized.
+
+        model_kwargs:
+            If ``variant`` is specified, these parameters are used to
+            initialize the created HMM object(s). Note that all HMMs
+            will be created with identical settings.
+
         prior:
             Type of prior probability to assign to each HMM.
 
-            - If ``None``, a uniform prior will be used, making each HMM
+            - If ``"uniform"``, a uniform prior will be used, making each HMM
               equally likely.
             - If ``"frequency"``, the prior probability of each HMM is equal
               to the fraction of total observation sequences that the HMM was
@@ -134,6 +166,14 @@ class labels provided here.
         -------
         HMMClassifier
         """
+        #: Type of HMM to use for each class.
+        self.variant: (
+            type[variants.CategoricalHMM]
+            | type[variants.GaussianMixtureHMM]
+            | None
+        ) = variant
+        #: Model parameters for initializing HMMs.
+        self.model_kwargs: dict[str, t.Any] | None = model_kwargs
         #: Type of prior probability to assign to each HMM.
         self.prior: PriorMode | dict[int, pyd.confloat(ge=0, le=1)] = prior
         #: Set of possible class labels.
@@ -141,7 +181,7 @@
         #: Maximum number of concurrently running workers.
         self.n_jobs: pyd.PositiveInt | pyd.NegativeInt = n_jobs
         #: HMMs constituting the :class:`.HMMClassifier`.
-        self.models: dict[int, BaseHMM] = {}
+        self.models: dict[int, variants.BaseHMM] = {}
 
         # Allow metadata routing for lengths
         if _sklearn.routing_enabled():
@@ -158,7 +198,7 @@
     @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
     def add_model(
         self: pyd.SkipValidation,
-        model: BaseHMM,
+        model: variants.BaseHMM,
         /,
         *,
         label: int,
@@ -200,7 +240,7 @@
     @pyd.validate_call(config=dict(arbitrary_types_allowed=True))
     def add_models(
         self: pyd.SkipValidation,
-        models: dict[int, BaseHMM],
+        models: dict[int, variants.BaseHMM],
         /,
     ) -> pyd.SkipValidation:
         """Add HMMs to the classifier.
@@ -239,8 +279,9 @@ def fit(
         - If fitted models were provided with :func:`add_model` or
           :func:`add_models`, no arguments should be passed to :func:`fit`.
         - If unfitted models were provided with :func:`add_model` or
-          :func:`add_models`, training data ``X``, ``y`` and ``lengths``
-          must be provided to :func:`fit`.
+          :func:`add_models`, or a ``variant`` was specified in
+          :func:`HMMClassifier.__init__`, training data ``X``, ``y`` and
+          ``lengths`` must be provided to :func:`fit`.
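+
+        For example, if a ``variant`` was specified in the constructor::
+
+            clf = HMMClassifier(variant=GaussianMixtureHMM)
+            clf.fit(X, y, lengths=lengths)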
 
         Parameters
         ----------
@@ -291,6 +332,13 @@
             y = _validation.check_y(y, lengths=lengths, dtype=np.int8)
             self.classes_ = _validation.check_classes(y, classes=self.classes)
 
+        # Initialize models based on constructor spec if provided
+        if self.variant:
+            model_kwargs = self.model_kwargs or {}
+            self.models = {
+                label: self.variant(**model_kwargs) for label in self.classes_
+            }
+
         # Check that each label has a HMM (and vice versa)
         if set(self.models.keys()) != set(self.classes_):
             msg = (
@@ -312,7 +360,7 @@
             self.models[c].fit(X_c, lengths=lengths_c)
 
         # Set class priors
-        models: t.Iterator[int, BaseHMM] = self.models.items()
+        models: t.Iterable[tuple[int, variants.BaseHMM]] = self.models.items()
         if self.prior == PriorMode.UNIFORM:
             self.prior_ = {c: 1 / len(self.classes_) for c, _ in models}
         elif self.prior == PriorMode.FREQUENCY:
@@ -464,7 +512,7 @@ def predict_scores(
         -----
         This method requires a trained classifier — see :func:`fit`.
         """
-        model: BaseHMM = next(iter(self.models.values()))
+        model: variants.BaseHMM = next(iter(self.models.values()))
         X, lengths = _validation.check_X_lengths(
             X,
             lengths=lengths,
diff --git a/sequentia/models/hmm/variants/__init__.py b/sequentia/models/hmm/variants/__init__.py
index 68bc4b7..ea339bb 100644
--- a/sequentia/models/hmm/variants/__init__.py
+++ b/sequentia/models/hmm/variants/__init__.py
@@ -5,7 +5,8 @@
 
 """Supported hidden Markov Model variants."""
 
+from sequentia.models.hmm.variants.base import BaseHMM
 from sequentia.models.hmm.variants.categorical import CategoricalHMM
 from sequentia.models.hmm.variants.gaussian_mixture import GaussianMixtureHMM
 
-__all__ = ["CategoricalHMM", "GaussianMixtureHMM"]
+__all__ = ["BaseHMM", "CategoricalHMM", "GaussianMixtureHMM"]
diff --git a/tests/unit/test_model_selection.py b/tests/unit/test_model_selection.py
index 9c416b2..1b88d1f 100644
--- a/tests/unit/test_model_selection.py
+++ b/tests/unit/test_model_selection.py
@@ -16,6 +16,7 @@
 from sklearn.preprocessing import minmax_scale
 
 from sequentia.datasets import SequentialDataset, load_digits
+from sequentia.enums import CovarianceMode, PriorMode, TopologyMode
 from sequentia.model_selection import (
     GridSearchCV,
     HalvingGridSearchCV,
@@ -26,9 +27,15 @@
     ShuffleSplit,
     StratifiedKFold,
     StratifiedShuffleSplit,
+    param_grid,
 )
 from sequentia.model_selection._search import BaseSearchCV
-from sequentia.models import KNNClassifier, KNNRegressor
+from sequentia.models import (
+    GaussianMixtureHMM,
+    HMMClassifier,
+    KNNClassifier,
+    KNNRegressor,
+)
 from sequentia.preprocessing import IndependentFunctionTransformer
 
 EPS: np.float32 = np.finfo(np.float32).eps
@@ -70,7 +77,7 @@ def data() -> SequentialDataset:
 @pytest.mark.parametrize(
     "search", [GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV]
 )
-def test_classifier(
+def test_knn_classifier(
     data: SequentialDataset,
     search: type[BaseSearchCV],
     cv: type[BaseCrossValidator] | type[BaseShuffleSplit],
@@ -134,7 +141,7 @@
 @pytest.mark.parametrize(
     "search", [GridSearchCV, RandomizedSearchCV, HalvingGridSearchCV]
 )
-def test_regressor(
+def test_knn_regressor(
     data: SequentialDataset,
     search: type[BaseSearchCV],
     cv: type[BaseCrossValidator] | type[BaseShuffleSplit],
@@ -175,3 +182,47 @@
     # Calculate R^2
     r2 = model.score(data.X, y, lengths=data.lengths)
     assert r2 > 0.8
+
+
+def test_hmm_classifier(data: SequentialDataset) -> None:
+    # Initialize search, splitter and parameter grid
+    optimizer = GridSearchCV(
+        estimator=Pipeline(
+            [
+                ("scale", IndependentFunctionTransformer(minmax_scale)),
+                ("clf", HMMClassifier(variant=GaussianMixtureHMM, n_jobs=-1)),
+            ]
+        ),
+        param_grid={
+            "clf__prior": [PriorMode.UNIFORM, PriorMode.FREQUENCY],
+            "clf__model_kwargs": param_grid(
+                n_states=[3, 4, 5],
+                n_components=[2, 3, 4],
+                covariance=[CovarianceMode.DIAGONAL, CovarianceMode.SPHERICAL],
+                topology=[TopologyMode.LEFT_RIGHT, TopologyMode.LINEAR],
+            ),
+        },
+        cv=StratifiedKFold(),
+        n_jobs=-1,
+    )
+
+    # Perform the hyper-parameter search and retrieve the best model
+    optimizer.fit(data.X, data.y, lengths=data.lengths)
+    assert optimizer.best_score_ > 0.8
+    clf = optimizer.best_estimator_
+
+    # Predict labels
+    y_pred = clf.predict(data.X, lengths=data.lengths)
+    assert np.isin(y_pred, (0, 1)).all()
+
+    # Predict probabilities
+    y_probs = clf.predict_proba(data.X, lengths=data.lengths)
+    assert ((y_probs >= 0) & (y_probs <= 1)).all()
+    npt.assert_almost_equal(y_probs.sum(axis=1), 1.0)
+
+    # Predict log probabilities
+    clf.predict_log_proba(data.X, lengths=data.lengths)
+
+    # Calculate accuracy
+    acc = clf.score(data.X, data.y, lengths=data.lengths)
+    assert acc > 0.8
diff --git a/tests/unit/test_models/hmm/test_classifier.py b/tests/unit/test_models/hmm/test_classifier.py
index 6bcd2d3..31dbedc 100644
--- a/tests/unit/test_models/hmm/test_classifier.py
+++ b/tests/unit/test_models/hmm/test_classifier.py
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 import copy
+import enum
 import os
 import tempfile
 import typing as t
@@ -37,6 +38,12 @@
 n_classes = 7
 
 
+class FitMode(enum.StrEnum):
+    PREFIT = "prefit"
+    POSTFIT_IDENTICAL = "postfit_identical"
+    POSTFIT_FLEXIBLE = "postfit_flexible"
+
+
 @pytest.fixture(scope="module")
 def random_state(request: SubRequest) -> np.random.RandomState:
     return np.random.RandomState(1)
@@ -113,16 +120,15 @@ def assert_fit(clf: BaseHMM):
         },
     ],
 )
-@pytest.mark.parametrize("prefit", [True, False])
+@pytest.mark.parametrize("fit_mode", list(FitMode))
 def test_classifier_e2e(
     request: SubRequest,
     helpers: t.Any,
     model: BaseHMM,
     dataset: SequentialDataset,
     prior: enums.PriorMode | dict[int, float],
+    fit_mode: FitMode,
     random_state: np.random.RandomState,
-    *,
-    prefit: bool,
 ) -> None:
     clf = HMMClassifier(prior=prior)
     clf.add_models({i: copy.deepcopy(model) for i in range(n_classes)})
@@ -139,12 +145,19 @@
         test_size=0.2, random_state=random_state, stratify=True
     )
 
-    if prefit:
+    if fit_mode == FitMode.PREFIT:
         for X, lengths, c in train.iter_by_class():
             clf.models[c].fit(X, lengths=lengths)
         assert_fit(clf.fit())
-    else:
+    elif fit_mode == FitMode.POSTFIT_FLEXIBLE:
         assert_fit(clf.fit(**train.X_y_lengths))
+    elif fit_mode == FitMode.POSTFIT_IDENTICAL:
+        # Re-create the classifier from the variant spec, giving every
+        # class HMM identical settings
+        clf = HMMClassifier(
+            variant=type(model),
+            model_kwargs=model.get_params(),
+            prior=prior,
+        )
+        assert_fit(clf.fit(**train.X_y_lengths))
 
     scores_pred = clf.predict_scores(**test.X_lengths)
     assert scores_pred.shape == (len(test), n_classes)