From 51466ad4ae08221b1d6f7449522dc3e396d31141 Mon Sep 17 00:00:00 2001 From: Mirko Bunse Date: Mon, 12 Jun 2023 10:36:40 +0200 Subject: [PATCH] Complete the documentation --- README.md | 2 +- docs/source/api.md | 37 ++++++++++++++++++++++++++++++++-- docs/source/developer-guide.md | 2 +- docs/source/index.md | 23 ++++++++++++++++++--- qunfold/losses.py | 6 +++--- qunfold/methods.py | 20 ++++++++++++------ qunfold/quapy.py | 12 +++++++++++ qunfold/sklearn.py | 5 +++++ qunfold/transformers.py | 2 +- 9 files changed, 92 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 57562c2..8b1fea9 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ # qunfold | Quantification & Unfolding -This Python package implements composable methods for quantification and unfolding. +This Python package implements our unified framework of algorithms for quantification and unfolding. It is designed for enabling the composition of novel methods from existing and easily customized loss functions and data representations. Moreover, this package leverages a powerful optimization back-end to yield state-of-the-art performances for all compositions. ## Installation diff --git a/docs/source/api.md b/docs/source/api.md index d51fb65..4154b12 100644 --- a/docs/source/api.md +++ b/docs/source/api.md @@ -2,7 +2,7 @@ The `GenericMethod` defines the interface for many common quantification and unfolding algorithms. Most importantly, this interface consists of their `fit` and `predict` methods. -Instances of [](#popular-algorithms) for quantification and unfolding are created through specialized constructors. However, you can also define your own quantification algorithm as a `GenericMethod` that combines an arbitrary choice of [](#losses), [](#regularizers) and [](#feature-transformations). +Instances of [](#popular-algorithms) for quantification and unfolding are created through the corresponding constructors. 
However, you can also define your own quantification methods as a `GenericMethod` that combines an arbitrary choice of [](#losses), [](#regularizers) and [](#feature-transformations). ```{eval-rst} .. autoclass:: qunfold.GenericMethod @@ -27,6 +27,10 @@ We categorize existing, well-known quantification and unfolding algorithms into ### Distribution matching ```{eval-rst} +.. autoclass:: qunfold.EDx + +.. autoclass:: qunfold.EDy + .. autoclass:: qunfold.HDx .. autoclass:: qunfold.HDy @@ -45,10 +49,12 @@ We categorize existing, well-known quantification and unfolding algorithms into ```{eval-rst} .. autoclass:: qunfold.LeastSquaresLoss -.. autoclass:: qunfold.BlobelLoss +.. autoclass:: qunfold.EnergyLoss .. autoclass:: qunfold.HellingerSurrogateLoss +.. autoclass:: qunfold.BlobelLoss + .. autoclass:: qunfold.CombinedLoss ``` @@ -71,5 +77,32 @@ You can use the `CombinedLoss` to create arbitrary, weighted sums of losses and ```{eval-rst} .. autoclass:: qunfold.ClassTransformer +.. autoclass:: qunfold.DistanceTransformer + .. autoclass:: qunfold.HistogramTransformer ``` + + +## Utilities + +The following classes provide functionalities that go beyond the composition of quantification methods. + +### QuaPy + +The `qunfold.quapy` module allows you to wrap any quantification method for being used in [QuaPy](https://github.com/HLT-ISTI/QuaPy). + +```{eval-rst} +.. autoclass:: qunfold.quapy.QuaPyWrapper +``` + +### Cross-validated training + +The `qunfold.sklearn` module allows you to train classification-based quantification methods through cross-validation. Importing this module requires [scikit-learn](https://scikit-learn.org/stable/) to be installed. + +```{eval-rst} +.. autoclass:: qunfold.sklearn.CVClassifier +``` + +```{hint} +If you use a bagging classifier (like random forests) with `oob_score=True`, you do not need to use cross-validation. Instead, the quantification method is then trained on the out-of-bag predictions of the bagging classifier. 
+``` diff --git a/docs/source/developer-guide.md b/docs/source/developer-guide.md index 63161e7..31f2cf1 100644 --- a/docs/source/developer-guide.md +++ b/docs/source/developer-guide.md @@ -1,6 +1,6 @@ # Developer guide -We provide best practices regarding the implementation [](#workflow) before going into detail about how to take out [](#custom-implementations). +In the following, we introduce best practices regarding the implementation [workflow](#workflow) before going into detail about how to carry out [custom implementations](#custom-implementations). ## Workflow diff --git a/docs/source/index.md b/docs/source/index.md index 20392df..be34a1d 100644 --- a/docs/source/index.md +++ b/docs/source/index.md @@ -8,7 +8,7 @@ developer-guide # Quickstart -This Python package implements our unified framework of algorithms for quantification and unfolding. +The Python package [qunfold](https://github.com/mirkobunse/qunfold) implements our unified framework of algorithms for quantification and unfolding. It is designed for enabling the composition of novel methods from existing and easily customized loss functions and data representations. Moreover, this package leverages a powerful optimization back-end to yield state-of-the-art performances for all compositions. ## Installation @@ -23,13 +23,17 @@ Moreover, you will need a [JAX](https://jax.readthedocs.io/) backend. Typically, pip install "jax[cpu]" ``` -**Updating:** To update an existing installation of `qunfold`, run +### Upgrading + +To upgrade an existing installation of `qunfold`, run ``` pip install --force-reinstall --no-deps 'qunfold @ git+https://github.com/mirkobunse/qunfold@main' ``` -**Troubleshooting:** Starting from `pip 23.1.2`, you have to install `setuptools` and `wheel` explicitly. If you receive a "NameError: name 'setuptools' is not defined", you need to execute the following command before installing `qunfold`. 
+### Troubleshooting + +Starting from `pip 23.1.2`, you have to install `setuptools` and `wheel` explicitly. If you receive a "NameError: name 'setuptools' is not defined", you need to execute the following command before installing `qunfold`. ``` pip install --upgrade pip setuptools wheel @@ -50,3 +54,16 @@ acc = ACC( # use OOB predictions for training the quantifier acc.fit(X_trn, y_trn) # fit to training data p_hat = acc.predict(X_tst) # estimate a prevalence vector ``` + +You can easily compose new quantification methods from existing loss functions and feature transformations. In the following example, we compose the ordinal variant of ACC and prepare it for being used in [QuaPy](https://github.com/HLT-ISTI/QuaPy). + +```python +# the ACC loss, regularized with strength 0.01 for ordinal quantification +loss = TikhonovRegularized(LeastSquaresLoss(), 0.01) + +# the original data representation of ACC with 10-fold cross-validation +transformer = ClassTransformer(CVClassifier(LogisticRegression(), 10)) + +# the ordinal variant of ACC, ready for being used in QuaPy +ordinal_acc = QuaPyWrapper(GenericMethod(loss, transformer)) +``` diff --git a/qunfold/losses.py b/qunfold/losses.py index d11c3cb..31ad565 100644 --- a/qunfold/losses.py +++ b/qunfold/losses.py @@ -103,7 +103,7 @@ def _instantiate(self, q, M, N=None): return lambda p: self.loss_function(p, q, M, N) class LeastSquaresLoss(FunctionLoss): - """The loss function of ACC, PACC, and ReadMe. + """The loss function of ACC (Forman, 2008), PACC (Bella et al., 2010), and ReadMe (Hopkins & King, 2010). This loss function computes the sum of squares of element-wise errors between `q` and `M*p`. """ @@ -132,7 +132,7 @@ def _hellinger_surrogate(p, q, M, indices): return jnp.sum(jnp.array([ jnp.sum(v[i]) for i in indices ])) class HellingerSurrogateLoss(AbstractLoss): - """The loss function of HDx and HDy. + """The loss function of HDx and HDy (González-Castro et al., 2013). 
This loss function computes the average of the squared Hellinger distances between feature-wise (or class-wise) histograms. Note that the original HDx and HDy by González-Castro et al (2013) do not use the squared but the regular Hellinger distance. This approach is problematic because the regular distance is not always twice differentiable and, hence, complicates numerical optimizations. @@ -204,7 +204,7 @@ def _instantiate(self, q, M, N): # the inspection that the QuaPyWrapper takes out. def TikhonovRegularized(loss, tau=0.): - """Add TikhonovRegularization to any loss. + """Add TikhonovRegularization (Blobel, 1985) to any loss. Calling this function is equivalent to calling diff --git a/qunfold/methods.py b/qunfold/methods.py index b265907..7d00a06 100644 --- a/qunfold/methods.py +++ b/qunfold/methods.py @@ -65,6 +65,14 @@ class GenericMethod: solver (optional): The `method` argument in `scipy.optimize.minimize`. Defaults to `"trust-ncg"`. solver_options (optional): The `options` argument in `scipy.optimize.minimize`. Defaults to `{"gtol": 1e-8, "maxiter": 1000}`. seed (optional): A random number generator seed from which a numpy RandomState is created. Defaults to `None`. + + Examples: + Here, we create the ordinal variant of ACC (Bunse et al., 2023). This variant consists of the original feature transformation of ACC and of the original loss of ACC, the latter of which is regularized towards smooth solutions. + + >>> GenericMethod( + >>> TikhonovRegularized(LeastSquaresLoss(), 0.01), + >>> ClassTransformer(RandomForestClassifier(oob_score=True)) + >>> ) """ def __init__(self, loss, transformer, solver = "trust-ncg", @@ -136,7 +144,7 @@ def solve(self, q, M, N=None): # TODO add argument p_trn return Result(_np_softmax(opt.x), opt.nit, opt.message) class ACC(GenericMethod): - """Adjusted Classify & Count. + """Adjusted Classify & Count by Forman (2008). This subclass of `GenericMethod` is instantiated with a `LeastSquaresLoss` and a `ClassTransformer`. 
@@ -157,7 +165,7 @@ def __init__(self, classifier, fit_classifier=True, **kwargs): ) class PACC(GenericMethod): - """Probabilistic Adjusted Classify & Count. + """Probabilistic Adjusted Classify & Count by Bella et al. (2010). This subclass of `GenericMethod` is instantiated with a `LeastSquaresLoss` and a `ClassTransformer`. @@ -199,10 +207,10 @@ def __init__(self, transformer, *, tau=0., **kwargs): class EDx(GenericMethod): """The energy distance-based EDx method by Kawakubo et al. (2016). - This subclass of `GenericMethod` is instantiated with a `EnergyLoss` and a `DistanceTransformer`. + This subclass of `GenericMethod` is instantiated with an `EnergyLoss` and a `DistanceTransformer`. Args: - metric (optional): The metric with which the distance between data items is measured. Defaults to `"euclidean"`. + metric (optional): The metric with which the distance between data items is measured. Can take any value that is accepted by `scipy.spatial.distance.cdist`. Defaults to `"euclidean"`. **kwargs: Keyword arguments accepted by `GenericMethod`. """ def __init__(self, metric="euclidean", **kwargs): @@ -216,11 +224,11 @@ def __init__(self, metric="euclidean", **kwargs): class EDy(GenericMethod): """The energy distance-based EDy method by Castaño et al. (2022). - This subclass of `GenericMethod` is instantiated with a `EnergyLoss` and a `DistanceTransformer`, the latter of which uses a `ClassTransformer` as a preprocessor. + This subclass of `GenericMethod` is instantiated with an `EnergyLoss` and a `DistanceTransformer`, the latter of which uses a `ClassTransformer` as a preprocessor. Args: classifier: A classifier that implements the API of scikit-learn. - metric (optional): The metric with which the distance between data items is measured. Defaults to `"euclidean"`. + metric (optional): The metric with which the distance between data items is measured. Can take any value that is accepted by `scipy.spatial.distance.cdist`. Defaults to `"euclidean"`. 
fit_classifier (optional): Whether to fit the `classifier` when this quantifier is fitted. Defaults to `True`. **kwargs: Keyword arguments accepted by `GenericMethod`. """ def __init__(self, classifier, metric="euclidean", **kwargs): diff --git a/qunfold/quapy.py b/qunfold/quapy.py index 95d8481..112661d 100644 --- a/qunfold/quapy.py +++ b/qunfold/quapy.py @@ -63,6 +63,18 @@ class QuaPyWrapper(BaseQuantifier): Args: generic_method: A GenericMethod method to wrap. + + Examples: + Here, we wrap an instance of ACC to perform a grid search with QuaPy. + + >>> qunfold_method = QuaPyWrapper(ACC(RandomForestClassifier(oob_score=True))) + >>> quapy.model_selection.GridSearchQ( + >>> model = qunfold_method, + >>> param_grid = { # try both splitting criteria + >>> "transformer__classifier__estimator__criterion": ["gini", "entropy"], + >>> }, + >>> # ... + >>> ) """ def __init__(self, generic_method): self.generic_method = generic_method diff --git a/qunfold/sklearn.py b/qunfold/sklearn.py index 9248973..4d87f2e 100644 --- a/qunfold/sklearn.py +++ b/qunfold/sklearn.py @@ -11,6 +11,11 @@ class CVClassifier(BaseEstimator, ClassifierMixin): Args: estimator: A classifier that implements the API of scikit-learn. n_estimators: The number of stratified cross-validation folds. + + Examples: + Here, we create an instance of ACC that trains a logistic regression classifier with 10 cross-validation folds. + + >>> ACC(CVClassifier(LogisticRegression(), 10)) """ def __init__(self, estimator, n_estimators, random_state=None): self.estimator = estimator diff --git a/qunfold/transformers.py b/qunfold/transformers.py index 27fd4a2..94aa568 100644 --- a/qunfold/transformers.py +++ b/qunfold/transformers.py @@ -79,7 +79,7 @@ class DistanceTransformer(AbstractTransformer): """A distance-based feature transformation, as it is used in `EDx` and `EDy`. Args: - metric (optional): The metric with which the distance between data items is measured. Defaults to `"euclidean"`. + metric (optional): The metric with which the distance between data items is measured. 
Can take any value that is accepted by `scipy.spatial.distance.cdist`. Defaults to `"euclidean"`. preprocessor (optional): Another `AbstractTransformer` that is called before this transformer. Defaults to `None`. """ def __init__(self, metric="euclidean", preprocessor=None):