Pass more tests

scienxlab · Sep 25, 2023 · 6fa4662 · 6fa4662
1 parent 48efa8e
commit 6fa4662
Show file tree

Hide file tree

Showing 4 changed files with 155 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,13 @@
 # Changelog
 
+## 0.3.1, Fall 2023
+
+- All of the `sklearn` components can now be instantiated with `warn=False` in order to raise a `ValueError` instead of issuing a warning. This allows you to build pipelines that will break if a detector is triggered.
+- You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example.
+- Added `MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class.
+- Removed `RegressionMultimodalDetector`. Use `MultimodalityDetector` instead.
+
+
 ## 0.3.0, 21 September 2023
 
 - Added some accessors to give access to `redflag` functions directly from `pandas.Series` objects, via an 'accessor'. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()`, `dummy_scores()` (see below). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions. **This is an experimental feature and subject to change.**

diff --git a/src/redflag/distributions.py b/src/redflag/distributions.py
@@ -20,7 +20,7 @@
 """
 from __future__ import annotations
 
-from typing import Optional, NamedTuple, Callable
+from typing import Optional, NamedTuple, Callable, Union
 from collections import namedtuple
 from itertools import combinations
 import warnings
@@ -399,7 +399,7 @@ def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple
           ...
         ValueError: Data must be 1D.
     """
-    a = np.asarray(a)
+    a = np.squeeze(a)
     if a.ndim >= 2:
         raise ValueError("Data must be 1D.")
     if not is_standard_normal(a):
@@ -500,27 +500,40 @@ def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[
     return find_large_peaks(*get_kde(a, method), threshold=threshold)
 
 
-def is_multimodal(a: ArrayLike, method: str='scott', threshold: float=0.1) -> bool:
+def is_multimodal(a: ArrayLike,
+                  groups:Optional[ArrayLike]=None,
+                  method: str='scott',
+                  threshold: float=0.1) -> Union[bool, np.ndarray]:
     """
     Test if the data is multimodal.
 
     Args:
         a (array): The data.
+        groups (array): Group labels, if the data is to be partitioned before
+            testing.
         method (str): The rule of thumb for bandwidth estimation. Must be one
             of 'silverman', 'scott', or 'cv'. Default 'scott'.
         threshold (float): The threshold for peak amplitude. Default 0.1.
 
     Returns:
-        bool: True if the data is multimodal.
+        bool or np.ndarray: True if the data appear to be multimodal. If groups
+        were passed, an array with one result per group is returned.
 
     Examples:
         >>> rng = np.random.default_rng(42)
-        >>> data = rng.normal(size=100)
-        >>> is_multimodal(data)
+        >>> a = rng.normal(size=200)
+        >>> is_multimodal(a)
         False
-        >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
-        >>> is_multimodal(data)
+        >>> b = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
+        >>> is_multimodal(b)
         True
+        >>> c = np.concatenate([a, b])
+        >>> is_multimodal(c, groups=[0]*200 + [1]*200)
+        array([False,  True])
     """
-    x, y = kde_peaks(a, method=method, threshold=threshold)
-    return len(x) > 1
+    a = np.asarray(a)
+    result = []
+    for group in iter_groups(groups):
+        x, y = kde_peaks(a[group], method=method, threshold=threshold)
+        result.append(len(x) > 1)
+    return result[0] if len(result) == 1 else np.array(result)
diff --git a/src/redflag/sklearn.py b/src/redflag/sklearn.py
@@ -58,6 +58,12 @@ def __init__(self, func, message, warn=True, **kwargs):
         self.warn = warn
 
     def fit(self, X, y=None):
+        return self
+
+    def fit_transform(self, X, y=None):
+        return self.transform(X, y)
+
+    def transform(self, X, y=None):
         X = check_array(X)
 
         positive = [i for i, feature in enumerate(X.T) if self.func(feature)]
@@ -81,13 +87,6 @@ def fit(self, X, y=None):
                     else:
                         raise ValueError(message)
 
-        return self
-
-    def transform(self, X, y=None):
-        """
-        Can check X here, but y is not passed into here by `fit`.
-        """
-
         return X
 
 
@@ -133,17 +132,6 @@ def __init__(self):
         super().__init__(is_correlated, "may be correlated")
 
 
-class RegressionMultimodalDetector(BaseRedflagDetector):
-    """
-    Transformer that detects features with non-unimodal distributions. In a
-    regression task, it considers the univariate distributions of the features
-    and the target. Do not use this detector for classification tasks, use
-    `MultimodalDetector` instead.
-    """
-    def __init__(self):
-        super().__init__(is_multimodal, "may be multimodally distributed")
-
-
 class UnivariateOutlierDetector(BaseRedflagDetector):
     """
     Transformer that detects if there are more than the expected number of
@@ -222,6 +210,9 @@ def __init__(self, p=0.99, threshold=None, factor=1, warn=True):
     def fit(self, X, y=None):
         return self
 
+    def fit_transform(self, X, y=None):
+        return self.transform(X, y)
+
     def transform(self, X, y=None):
         """
         Checks X (and y, if it is continuous data) for outlier values.
@@ -243,12 +234,10 @@ def transform(self, X, y=None):
 
         if (y is not None) and is_continuous(y):
             if np.asarray(y).ndim == 1:
-                y_ = y.reshape(-1, 1)
                 kind = 'univariate'
             else:
-                y_ = y
                 kind = 'multivariate'
-            if has_outliers(y_, p=self.p, threshold=self.threshold, factor=self.factor):
+            if has_outliers(y, p=self.p, threshold=self.threshold, factor=self.factor):
                 message = f"🚩 Target has more {kind} outlier samples than expected."
                 if self.warn:
                     warnings.warn(message)
@@ -494,6 +483,89 @@ def fit_transform(self, X, y=None):
         return X
 
 
+class MultimodalityDetector(BaseEstimator, TransformerMixin):
+
+    def __init__(self, task='auto', method='scott', threshold=0.1, warn=True):
+        """
+        Constructor for the class.
+
+        Args:
+            task (str): The task to use for multimodality detection. If 'auto',
+                then the detector will try to guess the task based on whether `y`
+                is continuous or not. Must be one of 'auto', 'classification',
+                'regression'. Default: 'auto'.
+            method (str): The rule of thumb for bandwidth estimation. Must be one
+                of 'silverman', 'scott', or 'cv'. Default 'scott'.
+            threshold (float): The threshold for peak amplitude. Default 0.1.
+            warn (bool): If True, issue a warning when a feature is flagged; if
+                False, raise a ValueError instead. Default: True.
+        """
+        self.task = task
+        self.method = method
+        self.threshold = threshold
+        self.warn = warn
+
+    def fit(self, X, y=None):
+        """
+        Checks for multimodality in the features of X. Each feature is checked
+        separately.
+
+        If `y` is categorical, the features are checked for multimodality
+        separately for each class.
+
+        Args:
+            X (np.ndarray): The data. Each column is treated as one feature
+                and checked for multimodality.
+            y (np.ndarray): The target. Used as the group labels unless the
+                task is (or is guessed to be) regression.
+
+        Returns:
+            self.
+
+        Raises:
+            ValueError: If a feature is flagged and `warn` is False.
+        """
+        X = check_array(X)
+
+        if (self.task == 'auto' and is_continuous(y)) or (self.task == 'regression'):
+            groups = None
+        else:
+            groups = y
+
+        positive = []
+        for i, feature in enumerate(X.T):
+            multi = is_multimodal(feature, groups=groups, method=self.method, threshold=self.threshold)
+            # is_multimodal returns a bool for a single group and a Boolean
+            # array otherwise; np.any() reduces either form to a single flag.
+            if np.any(multi):
+                positive.append(i)
+
+        if n := len(positive):
+            pos = ', '.join(str(i) for i in positive)
+            noun = "a multimodal distribution" if n == 1 else "multimodal distributions"
+            message = f"🚩 Feature{'' if n == 1 else 's'} {pos} {'has' if n == 1 else 'have'} {noun}."
+            if self.warn:
+                warnings.warn(message)
+            else:
+                raise ValueError(message)
+
+        return self
+
+    def transform(self, X, y=None):
+        """
+        No detection happens during 'transform'; that is done during 'fit'.
+
+        Args:
+            X (np.ndarray): The data, which is passed through `check_array`
+                and returned.
+            y (np.ndarray): The labels for the data. Not used.
+
+        Returns:
+            The result of `check_array(X)`.
+        """
+        return check_array(X)
+
+
 class ImbalanceDetector(BaseEstimator, TransformerMixin):
 
     def __init__(self, method='id', threshold=0.4, classes=None, warn=True):
@@ -512,6 +584,7 @@ def __init__(self, method='id', threshold=0.4, classes=None, warn=True):
                 minority class).
             classes (list): The names of the classes present in the data, even
                 if they are not present in the array `y`.
+            warn (bool): If True, warn when triggered; if False, raise an error.
         """
         if method not in ['id', 'ir']:
             raise ValueError(f"Method must be 'id' or 'ir' but was {method}")

diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py
@@ -21,6 +21,14 @@ def test_clip_detector():
     with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."):
         pipe.fit_transform(X)
 
+    # Warns about y, but only on continuous data.
+    rng = np.random.default_rng(0)
+    X = rng.normal(size=(100, 2))
+    y = rng.normal(size=100)
+    y[:3] = y.max()
+    with pytest.warns(UserWarning, match="Target 0 has samples that may be clipped."):
+        pipe.fit_transform(X, y)
+
     # Does not warn:
     X = np.array([[2, 1], [3, 2], [4, 3], [5, 4]])
     pipe.fit_transform(X)
@@ -37,18 +45,22 @@ def test_correlation_detector():
         pipe.fit_transform(X)
 
 
-def test_simple_multimodal_detector():
+def test_multimodality_detector():
     """
     Checks for features with a multimodal distribution, considered across the
-    entire dataset (i.e. not per class).
+    entire dataset.
     """
-    pipe = make_pipeline(rf.RegressionMultimodalDetector())
+    pipe = make_pipeline(rf.MultimodalityDetector())
     rng = np.random.default_rng(0)
     X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T
     X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T
     X = np.vstack([X1, X2])
-    with pytest.warns(UserWarning, match="Feature 1 has samples that may be multimodally distributed."):
+    with pytest.warns(UserWarning, match="Feature 1 has a multimodal distribution."):
         pipe.fit_transform(X)
+    y = np.hstack([np.zeros(80), np.ones(80)])
+
+    # Does not warn.
+    pipe.fit(X, y)
 
 
 def test_custom_detector():
@@ -115,10 +127,22 @@ def test_multivariate_outlier_detector():
     with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."):
         pipe.fit_transform(X)
 
+    # Warns for y too.
+    pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5, p=0.8))
+    X = rng.uniform(size=(1_000, 2))
+    y = rng.normal(size=1_000)
+    # y[:100] = 10
+    with pytest.warns(UserWarning, match="Target has more univariate outlier samples than expected."):
+        pipe.fit_transform(X, y)
+
     # Does not warn with factor of 2.5:
     pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=2.5))
     pipe.fit_transform(X)
 
+    # Does not warn for y.
+    y = rng.normal(size=1_000)
+    pipe.fit(X, y)
+
 
 def test_outlier_detector():
     # Use a factor of 0.5 to almost guarantee that this will throw a warning.
@@ -202,6 +226,8 @@ def test_imbalance_comparator():
     y = rng.normal(size=100)
     with pytest.warns(UserWarning, match="Target y seems continuous"):
         pipe.fit_transform(X, y)
+    with pytest.warns(UserWarning, match="Target y seems continuous"):
+        pipe.transform(X, y)
 
     # No warning if y is None, just skips:
     pipe.fit_transform(X)