Skip to content

Commit

Permalink
Pass more tests
Browse files Browse the repository at this point in the history
  • Loading branch information
kwinkunks committed Sep 25, 2023
1 parent 48efa8e commit 6fa4662
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 35 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## 0.3.1, Fall 2023

- All of the `sklearn` components can now be instantiated with `warn=False` in order to raise a `ValueError` instead of issuing a warning. This allows you to build pipelines that will break if a detector is triggered.
- You can now pass `groups` to `redflag.distributions.is_multimodal()`. If present, the modality will be checked for each group, returning a Boolean array of values (one for each group). This allows you to check a feature partitioned by target class, for example.
- Added `MultimodalityDetector` to provide a way to check for multimodal features. If `y` is passed and is categorical, it will be used to partition the data and modality will be checked for each class.
- Removed `RegressionMultimodalDetector`. Use `MultimodalityDetector` instead.


## 0.3.0, 21 September 2023

- Added some accessors to give access to `redflag` functions directly from `pandas.Series` objects, via an 'accessor'. For example, for a Series `s`, one can call `minority_classes = s.redflag.minority_classes()` instead of `redflag.minority_classes(s)`. Other functions include `imbalance_degree()`, `dummy_scores()` (see below). Probably not very useful yet, but future releases will add some reporting functions that wrap multiple Redflag functions. **This is an experimental feature and subject to change.**
Expand Down
33 changes: 23 additions & 10 deletions src/redflag/distributions.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
"""
from __future__ import annotations

from typing import Optional, NamedTuple, Callable
from typing import Optional, NamedTuple, Callable, Union
from collections import namedtuple
from itertools import combinations
import warnings
Expand Down Expand Up @@ -399,7 +399,7 @@ def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple
...
ValueError: Data must be 1D.
"""
a = np.asarray(a)
a = np.squeeze(a)
if a.ndim >= 2:
raise ValueError("Data must be 1D.")
if not is_standard_normal(a):
Expand Down Expand Up @@ -500,27 +500,40 @@ def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[
return find_large_peaks(*get_kde(a, method), threshold=threshold)


def is_multimodal(a: ArrayLike, method: str='scott', threshold: float=0.1) -> bool:
def is_multimodal(a: ArrayLike,
groups:Optional[ArrayLike]=None,
method: str='scott',
threshold: float=0.1) -> Union[bool, np.ndarray]:
"""
Test if the data is multimodal.
Args:
a (array): The data.
groups (array): Group labels, if the data is to be partitioned before
testing.
method (str): The rule of thumb for bandwidth estimation. Must be one
of 'silverman', 'scott', or 'cv'. Default 'scott'.
threshold (float): The threshold for peak amplitude. Default 0.1.
Returns:
bool: True if the data is multimodal.
bool or np.ndarray: True if the data appear to be multimodal. If groups
were passed, an array with one result per group is returned.
Examples:
>>> rng = np.random.default_rng(42)
>>> data = rng.normal(size=100)
>>> is_multimodal(data)
>>> a = rng.normal(size=200)
>>> is_multimodal(a)
False
>>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
>>> is_multimodal(data)
>>> b = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2])
>>> is_multimodal(b)
True
>>> c = np.concatenate([a, b])
>>> is_multimodal(c, groups=[0]*200 + [1]*200)
array([False, True])
"""
x, y = kde_peaks(a, method=method, threshold=threshold)
return len(x) > 1
a = np.asarray(a)
result = []
for group in iter_groups(groups):
x, y = kde_peaks(a[group], method=method, threshold=threshold)
result.append(len(x) > 1)
return result[0] if len(result) == 1 else np.array(result)
115 changes: 94 additions & 21 deletions src/redflag/sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ def __init__(self, func, message, warn=True, **kwargs):
self.warn = warn

def fit(self, X, y=None):
return self

def fit_transform(self, X, y=None):
return self.transform(X, y)

def transform(self, X, y=None):
X = check_array(X)

positive = [i for i, feature in enumerate(X.T) if self.func(feature)]
Expand All @@ -81,13 +87,6 @@ def fit(self, X, y=None):
else:
raise ValueError(message)

return self

def transform(self, X, y=None):
"""
Can check X here, but y is not passed into here by `fit`.
"""

return X


Expand Down Expand Up @@ -133,17 +132,6 @@ def __init__(self):
super().__init__(is_correlated, "may be correlated")


class RegressionMultimodalDetector(BaseRedflagDetector):
"""
Transformer that detects features with non-unimodal distributions. In a
regression task, it considers the univariate distributions of the features
and the target. Do not use this detector for classification tasks, use
`MultimodalDetector` instead.
"""
def __init__(self):
super().__init__(is_multimodal, "may be multimodally distributed")


class UnivariateOutlierDetector(BaseRedflagDetector):
"""
Transformer that detects if there are more than the expected number of
Expand Down Expand Up @@ -222,6 +210,9 @@ def __init__(self, p=0.99, threshold=None, factor=1, warn=True):
def fit(self, X, y=None):
return self

def fit_transform(self, X, y=None):
return self.transform(X, y)

def transform(self, X, y=None):
"""
Checks X (and y, if it is continuous data) for outlier values.
Expand All @@ -243,12 +234,10 @@ def transform(self, X, y=None):

if (y is not None) and is_continuous(y):
if np.asarray(y).ndim == 1:
y_ = y.reshape(-1, 1)
kind = 'univariate'
else:
y_ = y
kind = 'multivariate'
if has_outliers(y_, p=self.p, threshold=self.threshold, factor=self.factor):
if has_outliers(y, p=self.p, threshold=self.threshold, factor=self.factor):
message = f"🚩 Target has more {kind} outlier samples than expected."
if self.warn:
warnings.warn(message)
Expand Down Expand Up @@ -494,6 +483,89 @@ def fit_transform(self, X, y=None):
return X


class MultimodalityDetector(BaseEstimator, TransformerMixin):
    """
    Transformer that detects features with multimodal distributions.

    Each feature (column of `X`) is checked separately during `fit`. When the
    task is treated as classification, `y` is passed to `is_multimodal` as the
    group labels, so each feature is checked within each class. The data are
    passed through `transform` without being changed.
    """

    def __init__(self, task='auto', method='scott', threshold=0.1, warn=True):
        """
        Constructor for the class.

        Args:
            task (str): The task to use for multimodality detection. If 'auto',
                then the detector will try to guess the task based on whether
                `y` is continuous or not. Must be one of 'auto',
                'classification', 'regression'. Default: 'auto'.
            method (str): The rule of thumb for bandwidth estimation. Must be
                one of 'silverman', 'scott', or 'cv'. Default 'scott'.
            threshold (float): The threshold for peak amplitude. Default 0.1.
            warn (bool): If True, a detection issues a warning; if False, a
                detection raises a ValueError. Default True.
        """
        self.task = task
        self.method = method
        self.threshold = threshold
        self.warn = warn

    def fit(self, X, y=None):
        """
        Checks for multimodality in the features of X. Each feature is checked
        separately.

        If the task is 'regression', or it is 'auto' and `y` is continuous,
        each feature is checked as a whole. Otherwise `y` is used as the group
        labels and each feature is checked separately for each class.

        Args:
            X (np.ndarray): The data to check, one feature per column.
            y (np.ndarray): The labels for the data.

        Returns:
            self.

        Raises:
            ValueError: If a multimodal feature is found and `warn` is False.
        """
        X = check_array(X)

        if (self.task == 'auto' and is_continuous(y)) or (self.task == 'regression'):
            groups = None
        else:
            groups = y

        # is_multimodal returns a single boolean when there are no groups and
        # an array with one result per group otherwise; np.any() copes with
        # both, so no special-casing of the return type is needed.
        positive = [
            i for i, feature in enumerate(X.T)
            if np.any(is_multimodal(feature,
                                    groups=groups,
                                    method=self.method,
                                    threshold=self.threshold))
        ]

        if n := len(positive):
            pos = ', '.join(str(i) for i in positive)
            if n == 1:
                message = f"🚩 Feature {pos} has a multimodal distribution."
            else:
                message = f"🚩 Features {pos} have multimodal distributions."
            if self.warn:
                warnings.warn(message)
            else:
                raise ValueError(message)

        return self

    def transform(self, X, y=None):
        """
        This detector does nothing during 'transform', only during 'fit'.

        Args:
            X (np.ndarray): The data; it is validated with `check_array` and
                returned.
            y (np.ndarray): The labels for the data. Not used.

        Returns:
            X.
        """
        return check_array(X)


class ImbalanceDetector(BaseEstimator, TransformerMixin):

def __init__(self, method='id', threshold=0.4, classes=None, warn=True):
Expand All @@ -512,6 +584,7 @@ def __init__(self, method='id', threshold=0.4, classes=None, warn=True):
minority class).
classes (list): The names of the classes present in the data, even
if they are not present in the array `y`.
warn (bool): Whether to raise a warning or raise an error.
"""
if method not in ['id', 'ir']:
raise ValueError(f"Method must be 'id' or 'ir' but was {method}")
Expand Down
34 changes: 30 additions & 4 deletions tests/test_sklearn.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,14 @@ def test_clip_detector():
with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."):
pipe.fit_transform(X)

# Warns about y, but only on continuous data.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 2))
y = rng.normal(size=100)
y[:3] = y.max()
with pytest.warns(UserWarning, match="Target 0 has samples that may be clipped."):
pipe.fit_transform(X, y)

# Does not warn:
X = np.array([[2, 1], [3, 2], [4, 3], [5, 4]])
pipe.fit_transform(X)
Expand All @@ -37,18 +45,22 @@ def test_correlation_detector():
pipe.fit_transform(X)


def test_simple_multimodal_detector():
def test_multimodality_detector():
"""
Checks for features with a multimodal distribution, considered across the
entire dataset (i.e. not per class).
entire dataset.
"""
pipe = make_pipeline(rf.RegressionMultimodalDetector())
pipe = make_pipeline(rf.MultimodalityDetector())
rng = np.random.default_rng(0)
X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T
X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T
X = np.vstack([X1, X2])
with pytest.warns(UserWarning, match="Feature 1 has samples that may be multimodally distributed."):
with pytest.warns(UserWarning, match="Feature 1 has a multimodal distribution."):
pipe.fit_transform(X)
y = np.hstack([np.zeros(80), np.ones(80)])

# Does not warn.
pipe.fit(X, y)


def test_custom_detector():
Expand Down Expand Up @@ -115,10 +127,22 @@ def test_multivariate_outlier_detector():
with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."):
pipe.fit_transform(X)

# Warns for y too.
pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5, p=0.8))
X = rng.uniform(size=(1_000, 2))
y = rng.normal(size=1_000)
# y[:100] = 10
with pytest.warns(UserWarning, match="Target has more univariate outlier samples than expected."):
pipe.fit_transform(X, y)

# Does not warn with factor of 2.5:
pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=2.5))
pipe.fit_transform(X)

# Does not warn for y.
y = rng.normal(size=1_000)
pipe.fit(X, y)


def test_outlier_detector():
# Use a factor of 0.5 to almost guarantee that this will throw a warning.
Expand Down Expand Up @@ -202,6 +226,8 @@ def test_imbalance_comparator():
y = rng.normal(size=100)
with pytest.warns(UserWarning, match="Target y seems continuous"):
pipe.fit_transform(X, y)
with pytest.warns(UserWarning, match="Target y seems continuous"):
pipe.transform(X, y)

# No warning if y is None, just skips:
pipe.fit_transform(X)
Expand Down

0 comments on commit 6fa4662

Please sign in to comment.