diff --git a/CHANGELOG.md b/CHANGELOG.md index e3c8900..6575144 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## 0.3.1, Fall 2023 + +- All of the `sklearn` components can now be instantiated with `warn=False` in order to trigger a `ValueError` instead of a warning. This allows you to build pipelines that will break if a detector is triggered. +- You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example. +- Added `MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class. +- Removed `RegressionMultimodalDetector`. Use `MultimodalityDetector` instead. + + ## 0.3.0, 21 September 2023 - Added some accessors to give access to `redflag` functions directly from `pandas.Series` objects, via an 'accessor'. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()`, `dummy_scores()` (see below). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions. **This is an experimental feature and subject to change.** diff --git a/src/redflag/distributions.py b/src/redflag/distributions.py index 075c921..5a50ba5 100644 --- a/src/redflag/distributions.py +++ b/src/redflag/distributions.py @@ -20,7 +20,7 @@ """ from __future__ import annotations -from typing import Optional, NamedTuple, Callable +from typing import Optional, NamedTuple, Callable, Union from collections import namedtuple from itertools import combinations import warnings @@ -399,7 +399,7 @@ def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple ... 
ValueError: Data must be 1D. """ - a = np.asarray(a) + a = np.squeeze(a) if a.ndim >= 2: raise ValueError("Data must be 1D.") if not is_standard_normal(a): @@ -500,27 +500,40 @@ def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[ return find_large_peaks(*get_kde(a, method), threshold=threshold) -def is_multimodal(a: ArrayLike, method: str='scott', threshold: float=0.1) -> bool: +def is_multimodal(a: ArrayLike, + groups:Optional[ArrayLike]=None, + method: str='scott', + threshold: float=0.1) -> Union[bool, np.ndarray]: """ Test if the data is multimodal. Args: a (array): The data. + groups (array): Group labels, if the data is to be partitioned before + testing. method (str): The rule of thumb for bandwidth estimation. Must be one of 'silverman', 'scott', or 'cv'. Default 'scott'. threshold (float): The threshold for peak amplitude. Default 0.1. Returns: - bool: True if the data is multimodal. + bool or np.ndarray: True if the data appear to be multimodal. If groups + were passed, an array with one result per group is returned. 
Examples: >>> rng = np.random.default_rng(42) - >>> data = rng.normal(size=100) - >>> is_multimodal(data) + >>> a = rng.normal(size=200) + >>> is_multimodal(a) False - >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) - >>> is_multimodal(data) + >>> b = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) + >>> is_multimodal(b) True + >>> c = np.concatenate([a, b]) + >>> is_multimodal(c, groups=[0]*200 + [1]*200) + array([False, True]) """ - x, y = kde_peaks(a, method=method, threshold=threshold) - return len(x) > 1 + a = np.asarray(a) + result = [] + for group in iter_groups(groups): + x, y = kde_peaks(a[group], method=method, threshold=threshold) + result.append(len(x) > 1) + return result[0] if len(result) == 1 else np.array(result) diff --git a/src/redflag/sklearn.py b/src/redflag/sklearn.py index f755afe..a6cc727 100644 --- a/src/redflag/sklearn.py +++ b/src/redflag/sklearn.py @@ -58,6 +58,12 @@ def __init__(self, func, message, warn=True, **kwargs): self.warn = warn def fit(self, X, y=None): + return self + + def fit_transform(self, X, y=None): + return self.transform(X, y) + + def transform(self, X, y=None): X = check_array(X) positive = [i for i, feature in enumerate(X.T) if self.func(feature)] @@ -81,13 +87,6 @@ def fit(self, X, y=None): else: raise ValueError(message) - return self - - def transform(self, X, y=None): - """ - Can check X here, but y is not passed into here by `fit`. - """ - return X @@ -133,17 +132,6 @@ def __init__(self): super().__init__(is_correlated, "may be correlated") -class RegressionMultimodalDetector(BaseRedflagDetector): - """ - Transformer that detects features with non-unimodal distributions. In a - regression task, it considers the univariate distributions of the features - and the target. Do not use this detector for classification tasks, use - `MultimodalDetector` instead. 
- """ - def __init__(self): - super().__init__(is_multimodal, "may be multimodally distributed") - - class UnivariateOutlierDetector(BaseRedflagDetector): """ Transformer that detects if there are more than the expected number of @@ -222,6 +210,9 @@ def __init__(self, p=0.99, threshold=None, factor=1, warn=True): def fit(self, X, y=None): return self + def fit_transform(self, X, y=None): + return self.transform(X, y) + def transform(self, X, y=None): """ Checks X (and y, if it is continuous data) for outlier values. @@ -243,12 +234,10 @@ def transform(self, X, y=None): if (y is not None) and is_continuous(y): if np.asarray(y).ndim == 1: - y_ = y.reshape(-1, 1) kind = 'univariate' else: - y_ = y kind = 'multivariate' - if has_outliers(y_, p=self.p, threshold=self.threshold, factor=self.factor): + if has_outliers(y, p=self.p, threshold=self.threshold, factor=self.factor): message = f"🚩 Target has more {kind} outlier samples than expected." if self.warn: warnings.warn(message) @@ -494,6 +483,89 @@ def fit_transform(self, X, y=None): return X +class MultimodalityDetector(BaseEstimator, TransformerMixin): + + def __init__(self, task='auto', method='scott', threshold=0.1, warn=True): + """ + Constructor for the class. + + Args: + task (str): The task to use for multimodality detection. If 'auto', + then the detector will try to guess the task based on whether `y` + is continuous or not. Must be one of 'auto', 'classification', + 'regression'. Default: 'auto'. + method (str): The rule of thumb for bandwidth estimation. Must be one + of 'silverman', 'scott', or 'cv'. Default 'scott'. + threshold (float): The threshold for peak amplitude. Default 0.1. + warn (bool): Whether to raise a warning or raise an error. + """ + self.task = task + self.method = method + self.threshold = threshold + self.warn = warn + + def fit(self, X, y=None): + """ + Checks for multimodality in the features of X. Each feature is checked + separately. 
+ + If `y` is categorical, the features are checked for multimodality + separately for each class. + + Args: + X (np.ndarray): The data to compare to the training data. Not used + by this transformer. + y (np.ndarray): The labels for the data. + + Returns: + self. + """ + X = check_array(X) + + if (self.task == 'auto' and is_continuous(y)) or (self.task == 'regression'): + groups = None + else: + groups = y + + positive = [] + for i, feature in enumerate(X.T): + multi = is_multimodal(feature, groups=groups, method=self.method, threshold=self.threshold) + # This unpleasantness is a consequence of is_multimodal returning + # a list of booleans if groups is not None, and a single boolean + # if groups is None. + try: + if any(multi): + positive.append(i) + except TypeError: + if multi: + positive.append(i) + + if n := len(positive): + pos = ', '.join(str(i) for i in positive) + message = f"{'' if n > 1 else 'a'} multimodal distribution{'s' if n > 1 else ''}" + message = f"🚩 Feature{'' if n == 1 else 's'} {pos} {'has' if n == 1 else 'have'} {message}." + if self.warn: + warnings.warn(message) + else: + raise ValueError(message) + + return self + + def transform(self, X, y=None): + """ + This detector does nothing during 'transform', only during 'fit'. + + Args: + X (np.ndarray): The data to compare to the training data. Not used + by this transformer. + y (np.ndarray): The labels for the data. + + Returns: + X. + """ + return check_array(X) + + class ImbalanceDetector(BaseEstimator, TransformerMixin): def __init__(self, method='id', threshold=0.4, classes=None, warn=True): @@ -512,6 +584,7 @@ def __init__(self, method='id', threshold=0.4, classes=None, warn=True): minority class). classes (list): The names of the classes present in the data, even if they are not present in the array `y`. + warn (bool): Whether to raise a warning or raise an error. 
""" if method not in ['id', 'ir']: raise ValueError(f"Method must be 'id' or 'ir' but was {method}") diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 6f6408a..27e7b32 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -21,6 +21,14 @@ def test_clip_detector(): with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."): pipe.fit_transform(X) + # Warns about y, but only on continuous data. + rng = np.random.default_rng(0) + X = rng.normal(size=(100, 2)) + y = rng.normal(size=100) + y[:3] = y.max() + with pytest.warns(UserWarning, match="Target 0 has samples that may be clipped."): + pipe.fit_transform(X, y) + # Does not warn: X = np.array([[2, 1], [3, 2], [4, 3], [5, 4]]) pipe.fit_transform(X) @@ -37,18 +45,22 @@ def test_correlation_detector(): pipe.fit_transform(X) -def test_simple_multimodal_detector(): +def test_multimodality_detector(): """ Checks for features with a multimodal distribution, considered across the - entire dataset (i.e. not per class). + entire dataset. """ - pipe = make_pipeline(rf.RegressionMultimodalDetector()) + pipe = make_pipeline(rf.MultimodalityDetector()) rng = np.random.default_rng(0) X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T X = np.vstack([X1, X2]) - with pytest.warns(UserWarning, match="Feature 1 has samples that may be multimodally distributed."): + with pytest.warns(UserWarning, match="Feature 1 has a multimodal distribution."): pipe.fit_transform(X) + y = np.hstack([np.zeros(80), np.ones(80)]) + + # Does not warn. + pipe.fit(X, y) def test_custom_detector(): @@ -115,10 +127,22 @@ def test_multivariate_outlier_detector(): with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."): pipe.fit_transform(X) + # Warns for y too. 
+ pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5, p=0.8)) + X = rng.uniform(size=(1_000, 2)) + y = rng.normal(size=1_000) + # y[:100] = 10 + with pytest.warns(UserWarning, match="Target has more univariate outlier samples than expected."): + pipe.fit_transform(X, y) + # Does not warn with factor of 2.5: pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=2.5)) pipe.fit_transform(X) + # Does not warn for y. + y = rng.normal(size=1_000) + pipe.fit(X, y) + def test_outlier_detector(): # Use a factor of 0.5 to almost guarantee that this will throw a warning. @@ -202,6 +226,8 @@ def test_imbalance_comparator(): y = rng.normal(size=100) with pytest.warns(UserWarning, match="Target y seems continuous"): pipe.fit_transform(X, y) + with pytest.warns(UserWarning, match="Target y seems continuous"): + pipe.transform(X, y) # No warning if y is None, just skips: pipe.fit_transform(X)