diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 8276100..781c3b8 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -14,16 +14,17 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10", "3.11"] + python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + allow-prereleases: true - name: Install dependencies run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index f3e29be..c0538e8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,19 @@ # Changelog -## 0.1.11, in development - -- Coming soon... +## 0.2.0, 3 September 2023 + +- Moved to something more closely resembling semantic versioning, which is the main reason this is version 0.2.0. +- Builds and tests on Python 3.11 have been successful, so now supporting this version. Started testing on Python 3.12, which is not supported for the time being. +- Added custom 'alarm' `Detector`, which can be instantiated with a function and a warning to emit when the function returns True for a 1D array. You can easily write your own detectors with this class. +- Added `make_detector_pipeline()` which can take sequences of functions and warnings (or a mapping of functions to warnings) and returns a `sklearn.pipeline.Pipeline` containing a `Detector` for each function. +- Added `RegressionMultimodalDetector` to allow detection of non-unimodal distributions in features, when considered across the entire dataset. (Coming soon, a similar detector for classification tasks that will partition the data by class.) +- Redefined `is_standardized` (deprecated) as `is_standard_normal`, which implements the Kolmogorov–Smirnov test. 
It seems more reliable than assuming the data will have a mean of almost exactly 0 and standard deviation of exactly 1, when all we really care about is that the feature is roughly normal. +- Changed the wording slightly in the existing detector warning messages. +- No longer warning if `y` is `None` in, eg, `ImportanceDetector`, since you most likely know this. +- Some changes to `ImportanceDetector`. It now uses KNN estimators instead of SVMs as the third measure of importance; the SVMs were too unstable, causing numerical issues. It also now requires that the number of important features is less than the total number of features to be triggered. So if you have 2 features and both are important, it does not trigger. +- Improved `is_continuous()` which was erroneously classifying integer arrays with many consecutive values as non-continuous. +- Added a `Tutorial.ipynb` notebook to the docs. +- Added a **Copy** button to code blocks in the docs. ## 0.1.10, 21 November 2022 diff --git a/README.md b/README.md index d4ec102..9ad96a7 100644 --- a/README.md +++ b/README.md @@ -8,8 +8,6 @@ 🚩 `redflag` aims to be an automatic safety net for machine learning datasets. The vision is to accept input of a Pandas `DataFrame` or NumPy `ndarray` (one for each of the input `X` and target `y` in a machine learning task). `redflag` will provide an analysis of each feature, and of the target, including aspects such as class imbalance, leakage, outliers, anomalous data patterns, threats to the IID assumption, and so on. The goal is to complement other projects like `pandas-profiling` and `greatexpectations`. -⚠️ **This project is very rough and does not do much yet. The API will very likely change without warning. Please consider contributing!** - ## Installation diff --git a/docs/conf.py b/docs/conf.py index d7482db..68b39b6 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,11 +48,12 @@ def setup(app): # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom # ones. 
extensions = [ - 'sphinx.ext.githubpages', 'sphinxcontrib.apidoc', + 'sphinx.ext.githubpages', 'sphinx.ext.napoleon', - 'myst_nb', 'sphinx.ext.coverage', + 'sphinx_copybutton', + 'myst_nb', ] myst_enable_extensions = ["dollarmath", "amsmath"] diff --git a/docs/index.rst b/docs/index.rst index 5669fc3..7703273 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -41,6 +41,7 @@ User guide installation _notebooks/Basic_usage.ipynb _notebooks/Using_redflag_with_sklearn.ipynb + _notebooks/Tutorial.ipynb API reference @@ -82,5 +83,5 @@ Indices and tables PyPI releases Code in GitHub Issue tracker - Community guidelines - Scienxlab + Community guidelines + Scienxlab diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 153be5e..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/notebooks/Tutorial.ipynb b/docs/notebooks/Tutorial.ipynb index 5fa283a..8830a0b 100644 --- a/docs/notebooks/Tutorial.ipynb +++ b/docs/notebooks/Tutorial.ipynb @@ -80,7 +80,7 @@ "X_scaled = scaler.transform(X)\n", "\n", "clf.fit(X_scaled, y)\n", - "clf.predict(X)" + "clf.predict(X) # <-- Oops, we predicted on unscaled data." 
] }, { @@ -100,7 +100,7 @@ { "data": { "text/plain": [ - "array(['ms', 'ss'], dtype='" + "" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -533,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -542,7 +542,7 @@ "True" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -555,12 +555,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This is order-dependent. That is, shuffling the data removes the correlation, but does not mean the records are independent — the only way around this issue is to split the data differently." + "This is order-dependent. That is, shuffling the data removes the correlation:" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -569,7 +569,7 @@ "False" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -582,6 +582,13 @@ "rf.is_correlated(gr)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But this does not mean the records are independent — the only way around this issue is to split the data differently." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -593,16 +600,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "array([0.42261124, 0.19923465, 0.31613598, 0.06121184])" + "array([0.42028113, 0.2001267 , 0.3180724 , 0.06151976])" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -631,14 +638,16 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "To make things as easy as possible, it would be nice to have some alarms in the pipeline. This won't be able to catch everything, for example if the data are shuffled and/or randomly sampled in a split, it might be very hard to spot self-correlation. 
I'm not sure how to alret the user to that kind of error, other than by potentially providing a wrapped version of `train_test_split()`.\n", + "To make things as easy as possible, it would be nice to have some smoke alarms in the pipeline. Redflag has some prebuilt smoke alarms, and you can also make your own.\n", + "\n", + "Redflag's smoke alarms won't be able to catch everything, however. For example if the data are shuffled and/or randomly sampled in a split, it might be very hard to spot self-correlation. I'm not sure how to alert the user to that kind of error, other than by potentially providing a wrapped version of `train_test_split()`.\n", "\n", "Anyway, let's split our data in a sensible way: by well." ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -655,40 +664,34 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n",
+       "
Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n",
        "                ('rf.clip', ClipDetector()),\n",
        "                ('rf.correlation', CorrelationDetector()),\n",
-       "                ('rf.outlier',\n",
-       "                 OutlierDetector(p=0.9899999999999985,\n",
-       "                                 threshold=3.3682141715600706)),\n",
+       "                ('rf.outlier', OutlierDetector()),\n",
        "                ('rf.distributions', DistributionComparator()),\n",
-       "                ('rf.importance', ImportanceDetector())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ImbalanceDetector()
ClipDetector()
CorrelationDetector()
OutlierDetector()
DistributionComparator()
ImportanceDetector()
" ], "text/plain": [ "Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n", " ('rf.clip', ClipDetector()),\n", " ('rf.correlation', CorrelationDetector()),\n", - " ('rf.outlier',\n", - " OutlierDetector(p=0.9899999999999985,\n", - " threshold=3.3682141715600706)),\n", + " ('rf.outlier', OutlierDetector()),\n", " ('rf.distributions', DistributionComparator()),\n", " ('rf.importance', ImportanceDetector())])" ] }, - "execution_count": 38, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -706,40 +709,34 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
Pipeline(steps=[('standardscaler', StandardScaler()),\n",
+       "
Pipeline(steps=[('standardscaler', StandardScaler()),\n",
        "                ('pipeline',\n",
        "                 Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n",
        "                                 ('rf.clip', ClipDetector()),\n",
        "                                 ('rf.correlation', CorrelationDetector()),\n",
-       "                                 ('rf.outlier',\n",
-       "                                  OutlierDetector(p=0.9899999999999985,\n",
-       "                                                  threshold=3.3682141715600706)),\n",
+       "                                 ('rf.outlier', OutlierDetector()),\n",
        "                                 ('rf.distributions', DistributionComparator()),\n",
        "                                 ('rf.importance', ImportanceDetector())])),\n",
-       "                ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ImbalanceDetector()
ClipDetector()
CorrelationDetector()
OutlierDetector()
DistributionComparator()
ImportanceDetector()
SVC()
" ], "text/plain": [ "Pipeline(steps=[('standardscaler', StandardScaler()),\n", @@ -747,15 +744,13 @@ " Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n", " ('rf.clip', ClipDetector()),\n", " ('rf.correlation', CorrelationDetector()),\n", - " ('rf.outlier',\n", - " OutlierDetector(p=0.9899999999999985,\n", - " threshold=3.3682141715600706)),\n", + " ('rf.outlier', OutlierDetector()),\n", " ('rf.distributions', DistributionComparator()),\n", " ('rf.importance', ImportanceDetector())])),\n", " ('svc', SVC())])" ] }, - "execution_count": 39, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -769,7 +764,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -777,43 +772,39 @@ "output_type": "stream", "text": [ "🚩 The labels are imbalanced by more than the threshold (0.420 > 0.400). See self.minority_classes_ for the minority classes.\n", - "🚩 Features 0, 1 may have clipped values.\n", - "🚩 Features 0, 1, 2 may have correlated values.\n", - "🚩 There are more outliers than expected in the training data (390 vs 72).\n", + "🚩 Features 0, 1 have samples that may be clipped.\n", + "🚩 Features 0, 1, 2 have samples that may be correlated.\n", + "🚩 There are more outliers than expected in the training data (316 vs 31).\n", "🚩 Feature 3 has low importance; check for relevance.\n" ] }, { "data": { "text/html": [ - "
Pipeline(steps=[('standardscaler', StandardScaler()),\n",
+       "
Pipeline(steps=[('standardscaler', StandardScaler()),\n",
        "                ('pipeline',\n",
        "                 Pipeline(steps=[('rf.imbalance', ImbalanceDetector()),\n",
        "                                 ('rf.clip', ClipDetector()),\n",
        "                                 ('rf.correlation', CorrelationDetector()),\n",
        "                                 ('rf.outlier',\n",
-       "                                  OutlierDetector(p=0.977050261730397,\n",
-       "                                                  threshold=3.3682141715600706)),\n",
+       "                                  OutlierDetector(threshold=3.643721188696941)),\n",
        "                                 ('rf.distributions', DistributionComparator()),\n",
        "                                 ('rf.importance', ImportanceDetector())])),\n",
-       "                ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
ImbalanceDetector()
ClipDetector()
CorrelationDetector()
OutlierDetector(threshold=3.643721188696941)
DistributionComparator()
ImportanceDetector()
SVC()
" ], "text/plain": [ "Pipeline(steps=[('standardscaler', StandardScaler()),\n", @@ -822,14 +813,13 @@ " ('rf.clip', ClipDetector()),\n", " ('rf.correlation', CorrelationDetector()),\n", " ('rf.outlier',\n", - " OutlierDetector(p=0.977050261730397,\n", - " threshold=3.3682141715600706)),\n", + " OutlierDetector(threshold=3.643721188696941)),\n", " ('rf.distributions', DistributionComparator()),\n", " ('rf.importance', ImportanceDetector())])),\n", " ('svc', SVC())])" ] }, - "execution_count": 40, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -840,16 +830,16 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "🚩 Feature 0 may have clipped values.\n", - "🚩 Features 0, 1, 2 may have correlated values.\n", - "🚩 There are more outliers than expected in the data (41 vs 18).\n", + "🚩 Feature 0 has samples that may be clipped.\n", + "🚩 Features 0, 1, 2 have samples that may be correlated.\n", + "🚩 There are more outliers than expected in the data (26 vs 8).\n", "🚩 Feature 2 has a distribution that is different from training.\n" ] }, @@ -1025,7 +1015,7 @@ " 'siltstone', 'siltstone', 'siltstone', 'siltstone'], dtype=object)" ] }, - "execution_count": 41, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -1034,12 +1024,70 @@ "pipe.predict(X_test)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making your own tests" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🚩 Feature 3 has samples that are negative.\n" + ] + }, + { + "data": { + "text/html": [ + "
Pipeline(steps=[('detector',\n",
+       "                 Detector(func=<function BaseRedflagDetector.__init__.<locals>.<lambda> at 0x7f5de3dbeca0>,\n",
+       "                          warning='are negative')),\n",
+       "                ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('detector',\n", + " Detector(func=. at 0x7f5de3dbeca0>,\n", + " warning='are negative')),\n", + " ('svc', SVC())])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from redflag import Detector\n", + "\n", + "def has_negative(x) -> bool:\n", + " \"\"\"Returns True, i.e. triggers, if any samples are negative.\"\"\"\n", + " return any(x < 0)\n", + "\n", + "negative_detector = Detector(has_negative, \"are negative\")\n", + "\n", + "pipe = make_pipeline(negative_detector, SVC()) # NB, no standardization.\n", + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The noise feature we added has negative values; the others are all positive, which is what we expect for these data.\n", + "\n", + "(Careful! All standardized features will have negative values.)" + ] } ], "metadata": { @@ -1058,7 +1106,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.8" + "version": "3.9.12" }, "vscode": { "interpreter": { diff --git a/docs/notebooks/Using_redflag_with_sklearn.ipynb b/docs/notebooks/Using_redflag_with_sklearn.ipynb index 9586c52..06a0420 100644 --- a/docs/notebooks/Using_redflag_with_sklearn.ipynb +++ b/docs/notebooks/Using_redflag_with_sklearn.ipynb @@ -269,7 +269,7 @@ }, { "data": { - "image/png": "\n", + "image/png": "\n", "text/plain": [ "
" ] @@ -478,8 +478,8 @@ "output_type": "stream", "text": [ "🚩 The labels are imbalanced by more than the threshold (0.420 > 0.400). See self.minority_classes_ for the minority classes.\n", - "🚩 Features 0, 1 may have clipped values.\n", - "🚩 Features 0, 1, 2 may have correlated values.\n", + "🚩 Features 0, 1 have samples that may be clipped.\n", + "🚩 Features 0, 1, 2 have samples that may be correlated.\n", "🚩 There are more outliers than expected in the training data (349 vs 31).\n" ] }, @@ -552,8 +552,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "🚩 Feature 0 may have clipped values.\n", - "🚩 Features 0, 1, 2 may have correlated values.\n", + "🚩 Feature 0 has samples that may be clipped.\n", + "🚩 Features 0, 1, 2 have samples that may be correlated.\n", "🚩 There are more outliers than expected in the data (30 vs 8).\n", "🚩 Feature 2 has a distribution that is different from training.\n" ] @@ -658,7 +658,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "🚩 Feature 1 may have clipped values.\n", + "🚩 Feature 1 has samples that may be clipped.\n", "🚩 There are more outliers than expected in the training data (839 vs 626).\n" ] }, @@ -782,7 +782,7 @@ "output_type": "stream", "text": [ "🚩 There is a different number of minority classes (2) compared to the training data (4).\n", - "🚩 The minority classes (sandstone, dolomite) are different from those in the training data (dolomite, sandstone, mudstone, wackestone).\n" + "🚩 The minority classes (dolomite, sandstone) are different from those in the training data (dolomite, wackestone, mudstone, sandstone).\n" ] }, { @@ -806,6 +806,142 @@ "pipe.transform(X_test, y_test)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Making your own smoke detector" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can pass a detection function to a generic `Detector`, along with a warning to emit when it is triggered:" + ] + }, + { + "cell_type": "code", + 
"execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Pipeline(steps=[('detector',\n",
+       "                 Detector(func=<function BaseRedflagDetector.__init__.<locals>.<lambda> at 0x7fc60c4dd3a0>,\n",
+       "                          warning='are NaNs')),\n",
+       "                ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('detector',\n", + " Detector(func=. at 0x7fc60c4dd3a0>,\n", + " warning='are NaNs')),\n", + " ('svc', SVC())])" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from redflag import Detector\n", + "import numpy as np\n", + "\n", + "def has_nans(x) -> bool:\n", + " \"\"\"Returns True, i.e. triggers, if any samples are NaN.\"\"\"\n", + " return any(np.isnan(x))\n", + "\n", + "negative_detector = Detector(has_nans, \"are NaNs\")\n", + "\n", + "pipe = make_pipeline(negative_detector, SVC())\n", + "pipe.fit(X_train, y_train)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are no NaNs.\n", + "\n", + "You can use `make_detector_pipeline` to combine several tests into a single pipeline." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "🚩 Features 0, 2 have samples that fail has_outliers().\n" + ] + }, + { + "data": { + "text/html": [ + "
Pipeline(steps=[('standardscaler', StandardScaler()),\n",
+       "                ('pipeline',\n",
+       "                 Pipeline(steps=[('detector-1',\n",
+       "                                  Detector(func=<function BaseRedflagDetector.__init__.<locals>.<lambda> at 0x7fc60c4ddf70>,\n",
+       "                                           warning='fail has_nans()')),\n",
+       "                                 ('detector-2',\n",
+       "                                  Detector(func=<function BaseRedflagDetector.__init__.<locals>.<lambda> at 0x7fc60c4ddca0>,\n",
+       "                                           warning='fail has_outliers()'))])),\n",
+       "                ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "Pipeline(steps=[('standardscaler', StandardScaler()),\n", + " ('pipeline',\n", + " Pipeline(steps=[('detector-1',\n", + " Detector(func=. at 0x7fc60c4ddf70>,\n", + " warning='fail has_nans()')),\n", + " ('detector-2',\n", + " Detector(func=. at 0x7fc60c4ddca0>,\n", + " warning='fail has_outliers()'))])),\n", + " ('svc', SVC())])" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from redflag import make_detector_pipeline\n", + "\n", + "def has_outliers(x):\n", + " \"\"\"Returns True, i.e. triggers, if any samples are negative.\"\"\"\n", + " return any(abs(x) > 5)\n", + "\n", + "detectors = make_detector_pipeline([has_nans, has_outliers])\n", + "\n", + "pipe = make_pipeline(StandardScaler(), detectors, SVC())\n", + "pipe.fit(X_train, y_train)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -881,9 +1017,9 @@ ], "metadata": { "kernelspec": { - "display_name": "py39", + "display_name": "redflag", "language": "python", - "name": "py39" + "name": "redflag" }, "language_info": { "codemirror_mode": { @@ -895,7 +1031,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.9.12" } }, "nbformat": 4, diff --git a/docs/post_process_html.py b/docs/post_process_html.py index 7f1d454..a88d19f 100644 --- a/docs/post_process_html.py +++ b/docs/post_process_html.py @@ -26,7 +26,7 @@ def add_analytics(html): """ s = r'' pattern = re.compile(s) - new_s = '' + new_s = '' html = pattern.sub(new_s, html) return html diff --git a/pyproject.toml b/pyproject.toml index a1263b8..9584082 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ classifiers = [ ] dependencies = [ + "numpy<2.0", # NumPy 2 will likely break some things. "scipy!=1.10.0", # Bug in stats.powerlaw. 
"scikit-learn", ] @@ -46,7 +47,7 @@ dev = [ ] [project.urls] -"documentation" = "https://scienxlab.github.io/redflag" +"documentation" = "https://scienxlab.org/redflag" "repository" = "https://github.com/scienxlab/redflag" [tool.setuptools_scm] diff --git a/src/redflag/__init__.py b/src/redflag/__init__.py index c482b84..1b9cf06 100644 --- a/src/redflag/__init__.py +++ b/src/redflag/__init__.py @@ -11,17 +11,11 @@ from .importance import * from .outliers import * +# From https://github.com/pypa/setuptools_scm +from importlib.metadata import version, PackageNotFoundError -from pkg_resources import get_distribution, DistributionNotFound try: - VERSION = get_distribution(__name__).version -except DistributionNotFound: - try: - from ._version import version as VERSION - except ImportError: - raise ImportError( - "Failed to find (autogenerated) _version.py. " - "This might be because you are installing from GitHub's tarballs, " - "use the PyPI ones." - ) -__version__ = VERSION + __version__ = version("package-name") +except PackageNotFoundError: + # package is not installed + pass diff --git a/src/redflag/distributions.py b/src/redflag/distributions.py index f7eea5d..f406304 100644 --- a/src/redflag/distributions.py +++ b/src/redflag/distributions.py @@ -1,7 +1,7 @@ """ Functions related to understanding distributions. -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors @@ -34,7 +34,7 @@ from sklearn.neighbors import KernelDensity from sklearn.model_selection import GridSearchCV -from .utils import is_standardized +from .utils import is_standard_normal from .utils import iter_groups @@ -256,9 +256,9 @@ def wasserstein(X: ArrayLike, except AttributeError: # It's probably a 1D array or list. 
pass - + if stacked: - if not is_standardized(first): + if not is_standard_normal(first.flat): warnings.warn('First group does not appear to be standardized.', stacklevel=2) groups = np.hstack([len(dataset)*[i] for i, dataset in enumerate(X)]) X = np.vstack(X) @@ -267,7 +267,7 @@ def wasserstein(X: ArrayLike, X = np.asarray(X) if X.ndim != 2: raise ValueError("X must be a 2D array-like.") - + if groups is None: raise ValueError("Must provide a 1D array of group labels if X is a 2D array.") n_groups = np.unique(groups).size @@ -303,9 +303,13 @@ def bw_silverman(a: ArrayLike) -> float: """ Calculate the Silverman bandwidth. + Silverman, BW (1981), "Using kernel density estimates to investigate + multimodality", Journal of the Royal Statistical Society. Series B Vol. 43, + No. 1 (1981), pp. 97-99. + Args: a (array): The data. - + Returns: float: The Silverman bandwidth. @@ -321,7 +325,7 @@ def bw_silverman(a: ArrayLike) -> float: def bw_scott(a: ArrayLike) -> float: """ Calculate the Scott bandwidth. - + Args: a (array): The data. @@ -350,12 +354,20 @@ def cv_kde(a: ArrayLike, n_bandwidths: int=20, cv: int=10) -> float: Returns: float. The optimal bandwidth. - Examples: - >>> data = [1, 1, 1, 2, 2, 1, 1, 2, 2, 3, 2, 2, 2, 3, 3] - >>> abs(cv_kde(data, n_bandwidths=3, cv=3) - 0.290905379576344) < 1e-9 - True + Example: + >>> rng = np.random.default_rng(42) + >>> data = rng.normal(size=100) + >>> cv_kde(data, n_bandwidths=3, cv=3) + 0.5212113989811242 """ - a = np.asarray(a).reshape(-1, 1) + a = np.asarray(a) + if not is_standard_normal(a): + warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2) + if a.ndim == 1: + a = a.reshape(-1, 1) + elif a.ndim >= 2: + raise ValueError("Data must be 1D.") + silverman = bw_silverman(a) scott = bw_scott(a) start = min(silverman, scott)/2 @@ -378,22 +390,30 @@ def fit_kde(a: ArrayLike, bandwidth: float=1.0, kernel: str='gaussian') -> tuple Returns: tuple: (x, kde). 
- Examples: - >>> data = [-3, 1, -2, -2, -2, -2, 1, 2, 2, 1, 1, 2, 0, 0, 2, 2, 3, 3] + Example: + >>> rng = np.random.default_rng(42) + >>> data = rng.normal(size=100) >>> x, kde = fit_kde(data) - >>> x[0] - -4.5 - >>> abs(kde[0] - 0.011092399847113) < 1e-9 + >>> x[0] + 3.2124714013056916 < 1e-9 + True + >>> kde[0] - 0.014367259502733645 < 1e-9 True >>> len(kde) 200 """ a = np.asarray(a) + if not is_standard_normal(a): + warnings.warn('Data does not appear to be standardized, the KDE may be a poor fit.', stacklevel=2) + if a.ndim == 1: + a = a.reshape(-1, 1) + elif a.ndim >= 2: + raise ValueError("Data must be 1D.") model = KernelDensity(kernel=kernel, bandwidth=bandwidth) - model.fit(a.reshape(-1, 1)) - mima = 1.5 * np.abs(a).max() + model.fit(a) + mima = 1.5 * bandwidth * np.abs(a).max() x = np.linspace(-mima, mima, 200).reshape(-1, 1) log_density = model.score_samples(x) + return np.squeeze(x), np.exp(log_density) @@ -403,18 +423,19 @@ def get_kde(a: ArrayLike, method: str='scott') -> tuple[np.ndarray, np.ndarray]: Args: a (array): The data. - method (str): The rule of thumb for bandwidth estimation. - Default 'scott'. + method (str): The rule of thumb for bandwidth estimation. Must be one + of 'silverman', 'scott', or 'cv'. Default 'scott'. Returns: tuple: (x, kde). Examples: - >>> data = [-3, 1, -2, -2, -2, -2, 1, 2, 2, 1, 1, 2, 0, 0, 2, 2, 3, 3] + >>> rng = np.random.default_rng(42) + >>> data = rng.normal(size=100) >>> x, kde = get_kde(data) - >>> x[0] - -4.5 - >>> abs(kde[0] - 0.0015627693633590066) < 1e-09 + >>> x[0] + 1.354649738246933 < 1e-9 + True + >>> kde[0] - 0.162332012191087 < 1e-9 True >>> len(kde) 200 @@ -462,8 +483,8 @@ def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[ Args: a (array): The data. - method (str): The rule of thumb for bandwidth estimation. - Default 'scott'. + method (str): The rule of thumb for bandwidth estimation. Must be one + of 'silverman', 'scott', or 'cv'. Default 'scott'. 
threshold (float): The threshold for peak amplitude. Default 0.1. Returns: @@ -471,11 +492,38 @@ def kde_peaks(a: ArrayLike, method: str='scott', threshold: float=0.1) -> tuple[ the peaks. Examples: - >>> data = [-3, 1, -2, -2, -2, -2, 1, 2, 2, 1, 1, 2, 0, 0, 2, 2, 3, 3] + >>> rng = np.random.default_rng(42) + >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) >>> x_peaks, y_peaks = kde_peaks(data) >>> x_peaks - array([-2.05778894, 1.74120603]) + array([-1.67243035, 1.88998226]) >>> y_peaks - array([0.15929031, 0.24708215]) + array([0.22014721, 0.19729456]) """ return find_large_peaks(*get_kde(a, method), threshold=threshold) + + +def is_multimodal(a: ArrayLike, method: str='scott', threshold: float=0.1) -> bool: + """ + Test if the data is multimodal. + + Args: + a (array): The data. + method (str): The rule of thumb for bandwidth estimation. Must be one + of 'silverman', 'scott', or 'cv'. Default 'scott'. + threshold (float): The threshold for peak amplitude. Default 0.1. + + Returns: + bool: True if the data is multimodal. + + Examples: + >>> rng = np.random.default_rng(42) + >>> data = rng.normal(size=100) + >>> is_multimodal(data) + False + >>> data = np.concatenate([rng.normal(size=100)-2, rng.normal(size=100)+2]) + >>> is_multimodal(data) + True + """ + x, y = kde_peaks(a, method=method, threshold=threshold) + return len(x) > 1 diff --git a/src/redflag/imbalance.py b/src/redflag/imbalance.py index 5540ef8..0c1f8e0 100644 --- a/src/redflag/imbalance.py +++ b/src/redflag/imbalance.py @@ -7,7 +7,7 @@ Pattern Recognition Letters 98 (2017) https://doi.org/10.1016/j.patrec.2017.08.002 -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors diff --git a/src/redflag/importance.py b/src/redflag/importance.py index 05a8feb..920deab 100644 --- a/src/redflag/importance.py +++ b/src/redflag/importance.py @@ -1,7 +1,7 @@ """ Feature importance metrics. 
-Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors @@ -25,10 +25,10 @@ from sklearn.inspection import permutation_importance from sklearn.linear_model import Lasso from sklearn.ensemble import RandomForestRegressor -from sklearn.svm import SVR +from sklearn.neighbors import KNeighborsClassifier +from sklearn.neighbors import KNeighborsRegressor from sklearn.linear_model import LogisticRegression from sklearn.ensemble import RandomForestClassifier -from sklearn.svm import SVC from .target import is_continuous from .utils import split_and_standardize @@ -42,8 +42,8 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None, Measure feature importances on a task, given X and y. Classification tasks are assessed with logistic regression, a random - forest, and SVM permutation importance. Regression tasks are assessed with - lasso regression, a random forest, and SVM permutation importance. In each + forest, and KNN permutation importance. Regression tasks are assessed with + lasso regression, a random forest, and KNN permutation importance. In each case, the `n` normalized importances with the most variance are averaged. Args: @@ -63,13 +63,14 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None, appear in X. Examples: - >>> X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0], [0, 5, 1]] - >>> y = [5, 15, 25, 35, 45, 55] - >>> feature_importances(X, y, task='regression', random_state=0) - array([ 0. , 0.97811006, -0.19385077]) - >>> y = ['a', 'a', 'a', 'b', 'b', 'b'] - >>> feature_importances(X, y, task='classification', random_state=0) - array([ 0. , 0.89013985, -0.55680651]) + >>> X = [[0, 0, 0], [0, 1, 1], [0, 2, 0], [0, 3, 1], [0, 4, 0], [0, 5, 1], [0, 7, 0], [0, 8, 1], [0, 8, 0]] + >>> y = [5, 15, 25, 35, 45, 55, 80, 85, 90] + >>> feature_importances(X, y, task='regression', random_state=42) + array([0. 
, 0.99416839, 0.00583161]) + >>> y = ['a', 'a', 'a', 'b', 'b', 'b', 'c', 'c', 'c'] + >>> x0, x1, x2 = feature_importances(X, y, task='classification', random_state=42) + >>> x1 > x2 > x0 # See Issue #49 for why this test is like this. + True """ if y is None: raise NotImplementedError('Unsupervised importance is not yet implemented.') @@ -84,17 +85,20 @@ def feature_importances(X: ArrayLike, y: ArrayLike=None, # Train three models and gather the importances. imps: list = [] if task == 'classification': - imps.append(np.abs(LogisticRegression().fit(X, y).coef_.sum(axis=0))) + imps.append(np.abs(LogisticRegression(random_state=random_state).fit(X, y).coef_.sum(axis=0))) imps.append(RandomForestClassifier(random_state=random_state).fit(X, y).feature_importances_) - model = SVC(random_state=random_state).fit(X_train, y_train) - r = permutation_importance(model, X_val, y_val, n_repeats=10, scoring='f1_weighted', random_state=random_state) + model = KNeighborsClassifier().fit(X_train, y_train) + r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='f1_weighted', random_state=random_state) imps.append(r.importances_mean) elif task == 'regression': - imps.append(np.abs(Lasso().fit(X, y).coef_)) + # Need data to be scaled, but don't necessarily want to scale entire dataset. 
+ imps.append(np.abs(Lasso(random_state=random_state).fit(X, y).coef_)) imps.append(RandomForestRegressor(random_state=random_state).fit(X, y).feature_importances_) - model = SVR().fit(X_train, y_train) - r = permutation_importance(model, X_val, y_val, n_repeats=10, scoring='neg_mean_squared_error', random_state=random_state) - imps.append(r.importances_mean) + model = KNeighborsRegressor().fit(X_train, y_train) + r = permutation_importance(model, X_val, y_val, n_repeats=8, scoring='neg_mean_squared_error', random_state=random_state) + if not all(r.importances_mean < 0): + r.importances_mean[r.importances_mean < 0] = 1e-9 + imps.append(r.importances_mean) imps = np.array(imps) diff --git a/src/redflag/independence.py b/src/redflag/independence.py index 061d2fd..201c827 100644 --- a/src/redflag/independence.py +++ b/src/redflag/independence.py @@ -1,7 +1,7 @@ """ Functions related to understanding row independence. -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors diff --git a/src/redflag/outliers.py b/src/redflag/outliers.py index c759d1c..d40c30b 100644 --- a/src/redflag/outliers.py +++ b/src/redflag/outliers.py @@ -1,7 +1,7 @@ """ Functions related to understanding features. -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors diff --git a/src/redflag/sklearn.py b/src/redflag/sklearn.py index ed82bda..6f5d2b8 100644 --- a/src/redflag/sklearn.py +++ b/src/redflag/sklearn.py @@ -1,7 +1,7 @@ """ Scikit-learn components. 
-Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors @@ -25,13 +25,16 @@ from sklearn import pipeline from sklearn.pipeline import Pipeline from sklearn.pipeline import _name_estimators +from sklearn.pipeline import make_pipeline from sklearn.covariance import EllipticEnvelope from scipy.stats import wasserstein_distance from scipy.stats import cumfreq from sklearn.utils.metaestimators import available_if from .utils import is_clipped, proportion_to_stdev, stdev_to_proportion +from .utils import iter_groups from .target import is_continuous +from .distributions import is_multimodal from .independence import is_correlated from .outliers import has_outliers, expected_outliers from .imbalance import imbalance_degree, imbalance_ratio, minority_classes @@ -55,25 +58,27 @@ def __init__(self, func, warning, **kwargs): self.warning = warning def fit(self, X, y=None): - return self - - def transform(self, X, y=None): - """ - Checks X (and y, if it is continuous data) for suspect values. - """ X = check_array(X) positive = [i for i, feature in enumerate(X.T) if self.func(feature)] if n := len(positive): pos = ', '.join(str(i) for i in positive) - warnings.warn(f"🚩 Feature{'s' if n > 1 else ''} {pos} may have {self.warning}.") + warnings.warn(f"🚩 Feature{'' if n == 1 else 's'} {pos} {'has' if n == 1 else 'have'} samples that {self.warning}.") - if (y is not None) and is_continuous(y): - if np.asarray(y).ndim == 1: - y_ = y.reshape(-1, 1) + if y is not None: + y_ = np.asarray(y) + if y_.ndim == 1: + y_ = y_.reshape(-1, 1) for i, target in enumerate(y_.T): - if self.func(target): - warnings.warn(f"🚩 Target {i} may have {self.warning}.") + if is_continuous(target) and self.func(target): + warnings.warn(f"🚩 Target {i} has samples that {self.warning}.") + + return self + + def transform(self, X, y=None): + """ + Can check X here, but y is not passed into here by `fit`. 
+ """ return X @@ -88,14 +93,14 @@ class ClipDetector(BaseRedflagDetector): >>> X = np.array([[2, 1], [3, 2], [4, 3], [5, 3]]) >>> pipe.fit_transform(X) # doctest: +SKIP redflag/sklearn.py::redflag.sklearn.ClipDetector - 🚩 Feature 1 may have clipped values. + 🚩 Feature 1 has samples that may be clipped. array([[2, 1], [3, 2], [4, 3], [5, 3]]) """ def __init__(self): - super().__init__(is_clipped, "clipped values") + super().__init__(is_clipped, "may be clipped") class CorrelationDetector(BaseRedflagDetector): @@ -109,7 +114,7 @@ class CorrelationDetector(BaseRedflagDetector): >>> X = np.stack([rng.uniform(size=20), np.sin(np.linspace(0, 1, 20))]).T >>> pipe.fit_transform(X) # doctest: +SKIP redflag/sklearn.py::redflag.sklearn.CorrelationDetector - 🚩 Feature 1 may have correlated values. + 🚩 Feature 1 has samples that may be correlated. array([[0.38077051, 0. ], [0.42977406, 0.05260728] ... @@ -117,7 +122,18 @@ class CorrelationDetector(BaseRedflagDetector): [0.7482485 , 0.84147098]]) """ def __init__(self): - super().__init__(is_correlated, "correlated values") + super().__init__(is_correlated, "may be correlated") + + +class RegressionMultimodalDetector(BaseRedflagDetector): + """ + Transformer that detects features with non-unimodal distributions. In a + regression task, it considers the univariate distributions of the features + and the target. Do not use this detector for classification tasks, use + `MultimodalDetector` instead. + """ + def __init__(self): + super().__init__(is_multimodal, "may be multimodally distributed") class UnivariateOutlierDetector(BaseRedflagDetector): @@ -135,7 +151,7 @@ class UnivariateOutlierDetector(BaseRedflagDetector): >>> X = rng.normal(size=(1_000, 2)) >>> pipe.fit_transform(X) # doctest: +SKIP redflag/sklearn.py::redflag.sklearn.UnivariateOutlierDetector - 🚩 Features 0, 1 may have more outliers (in a univariate sense) than expected. + 🚩 Features 0, 1 have samples that are excess univariate outliers. 
array([[ 0.12573022, -0.13210486], [ 0.64042265, 0.10490012], [-0.53566937, 0.36159505], @@ -154,7 +170,7 @@ class UnivariateOutlierDetector(BaseRedflagDetector): [-0.90942756, 0.36922933]]) """ def __init__(self, **kwargs): - super().__init__(has_outliers, "more outliers (in a univariate sense) than expected", **kwargs) + super().__init__(has_outliers, "are excess univariate outliers", **kwargs) class MultivariateOutlierDetector(BaseEstimator, TransformerMixin): @@ -171,7 +187,7 @@ class MultivariateOutlierDetector(BaseEstimator, TransformerMixin): >>> X = rng.normal(size=(1_000, 2)) >>> pipe.fit_transform(X) # doctest: +SKIP redflag/sklearn.py::redflag.sklearn.MultivariateOutlierDetector - 🚩 Dataset may have more outliers (in a multivariate sense) than expected. + 🚩 Dataset has more multivariate outlier samples than expected. array([[ 0.12573022, -0.13210486], [ 0.64042265, 0.10490012], [-0.53566937, 0.36159505], @@ -210,13 +226,17 @@ def transform(self, X, y=None): outliers = has_outliers(X, p=self.p, threshold=self.threshold, factor=self.factor) if outliers: - warnings.warn(f"🚩 Dataset may have more outliers (in a multivariate sense) than expected.") + warnings.warn(f"🚩 Dataset has more multivariate outlier samples than expected.") if (y is not None) and is_continuous(y): if np.asarray(y).ndim == 1: y_ = y.reshape(-1, 1) + kind = 'univariate' + else: + y_ = y + kind = 'multivariate' if has_outliers(y_, p=self.p, threshold=self.threshold, factor=self.factor): - warnings.warn(f"🚩 Target may have more outliers (in a multivariate sense) than expected.") + warnings.warn(f"🚩 Target has more {kind} outlier samples than expected.") return X @@ -494,8 +514,10 @@ def fit(self, X, y=None): self. """ # If there's no target or y is continuous (probably a regression), we're done. 
- if y is None or is_continuous(y): - warnings.warn("Target y is None or seems continuous, so no imbalance detection.") + if y is None: + return self + if is_continuous(y): + warnings.warn("Target y seems continuous, skipping imbalance detection.") return self methods = {'id': imbalance_degree, 'ir': imbalance_ratio} @@ -578,8 +600,10 @@ def fit(self, X, y=None): self. """ # If there's no target or y is continuous (probably a regression), we're done. - if y is None or is_continuous(y): - warnings.warn("Target y is None or seems continuous, so no imbalance detection.") + if y is None: + return self + if is_continuous(y): + warnings.warn("Target y seems continuous, skipping imbalance detection.") return self methods = {'id': imbalance_degree, 'ir': imbalance_ratio} @@ -608,8 +632,10 @@ def transform(self, X, y=None): X. """ # If there's no target or y is continuous (probably a regression), we're done. - if y is None or is_continuous(y): - warnings.warn("Target y is None or seems continuous, so no imbalance detection.") + if y is None: + return self + if is_continuous(y): + warnings.warn("Target y seems continuous, skipping imbalance detection.") return self methods = {'id': imbalance_degree, 'ir': imbalance_ratio} @@ -686,21 +712,24 @@ def fit(self, X, y=None): X. 
""" if y is None: - warnings.warn("Target y is None, so no importance detection.") + warnings.warn("Target y is None, skipping importance detection.") return self importances = feature_importances(X, y, random_state=self.random_state) most_important = most_important_features(importances, threshold=self.threshold) - if (m := len(most_important)) <= 2: - most_str = ', '.join(str(i) for i in most_important) + M = X.shape[1] + + if (m := len(most_important)) <= 2 and (m < M): + most_str = ', '.join(str(i) for i in sorted(most_important)) warnings.warn(f"🚩 Feature{'' if m == 1 else 's'} {most_str} {'has' if m == 1 else 'have'} very high importance; check for leakage.") return self # Don't do this check if there were high-importance features (infer that the others are low.) least_important = least_important_features(importances, threshold=self.threshold) + if (m := len(least_important)) > 0: - least_str = ', '.join(str(i) for i in least_important) + least_str = ', '.join(str(i) for i in sorted(least_important)) warnings.warn(f"🚩 Feature{'' if m == 1 else 's'} {least_str} {'has' if m == 1 else 'have'} low importance; check for relevance.") return self @@ -806,8 +835,42 @@ def make_rf_pipeline(*steps, memory=None, verbose=False): ("rf.imbalance", ImbalanceDetector()), ("rf.clip", ClipDetector()), ("rf.correlation", CorrelationDetector()), + # ("rf.multimodal", MultimodalDetector()), ("rf.outlier", OutlierDetector()), ("rf.distributions", DistributionComparator()), ("rf.importance", ImportanceDetector()), ] ) + + +class Detector(BaseRedflagDetector): + def __init__(self, func, warning=None): + if warning is None: + warning = f"fail custom func {func.__name__}()" + super().__init__(func, warning) + + +def make_detector_pipeline(funcs, warnings=None) -> Pipeline: + """ + Make a detector from one or more 'alarm' functions. + + Args: + funcs: Can be a sequence of functions returning True if a 1D array + meets some condition you want to trigger the alarm for. 
For example, + `has_negative = lambda x: np.any(x < 0)` to alert you to the + presence of negative values. Can also be a mappable of functions to + warnings. + warnings: The warnings corresponding to the functions. It's probably + safer to pass the functions with their warnings in a dict. + + Returns: + Pipeline + """ + detectors = [] + if isinstance(funcs, dict): + warnings = funcs.values() + elif warnings is None: + warnings = [None for _ in funcs] + for func, warn in zip(funcs, warnings): + detectors.append(Detector(func, warn)) + return make_pipeline(*detectors) diff --git a/src/redflag/target.py b/src/redflag/target.py index 1acbf8f..2866f7e 100644 --- a/src/redflag/target.py +++ b/src/redflag/target.py @@ -1,7 +1,7 @@ """ Functions related to understanding the target and the type of task. -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors @@ -61,7 +61,7 @@ def is_continuous(a: ArrayLike, n: Optional[int]=None) -> bool: n (int): The number of potential categories. That is, if there are fewer than n unique values in the data, it is estimated to be categorical. Default: the square root of the sample size, which - is 10% of the data or 10_000, whichever is smaller. + is all the data or 10_000 random samples, whichever is smaller. Returns: bool: True if arr is probably best suited to regression. @@ -74,39 +74,51 @@ def is_continuous(a: ArrayLike, n: Optional[int]=None) -> bool: >>> import numpy as np >>> is_continuous(np.random.random(size=100)) True + >>> is_continuous(np.random.randint(0, 15, size=200)) + False """ - arr = np.array(a) + arr = np.asarray(a) if not is_numeric(arr): return False + # Now we are dealing with numbers that could represent categories. + + if is_binary(arr): + return False + # Starting with this and having the uplifts be 0.666 means # that at least 2 tests must trigger to get over 0.5. 
- p = 0.333 - - N = max(min(len(arr)//10, 10_000), 10) - sample = np.random.choice(arr, size=N, replace=False) + p = 1 / 3 + + # Take a sample if array is large. + if arr.size < 10_000: + sample = arr + else: + sample = np.random.choice(arr, size=10_000, replace=False) if n is None: - n = np.sqrt(len(sample)) + n = np.sqrt(sample.size) - # Check if floats (proper floats, ). + # Check if floats. if np.issubdtype(sample.dtype, np.floating): # If not ints in disguise. if not np.all([xi.is_integer() for xi in np.unique(sample)]): - p = update_p(p, 0.666, 0.666) - + p = update_p(p, 2/3, 2/3) + # If low precision. - if np.all((100*sample).astype(int) - 100*sample < 1e-12): - p = update_p(p, 0.666, 0.666) + if np.all((sample.astype(int) - sample) < 1e-3): + p = update_p(p, 2/3, 2/3) + # If many unique values. if np.unique(sample).size > n: - p = update_p(p, 0.666, 0.666) + p = update_p(p, 2/3, 2/3) - many_gap_sizes = np.unique(np.diff(sample)).size > n + # If many sizes of gaps between numbers. + many_gap_sizes = np.unique(np.diff(np.sort(sample))).size > n if many_gap_sizes: - p = update_p(p, 0.666, 0.666) + p = update_p(p, 2/3, 2/3) return p > 0.5 diff --git a/src/redflag/utils.py b/src/redflag/utils.py index 7151a5d..cc403f9 100644 --- a/src/redflag/utils.py +++ b/src/redflag/utils.py @@ -1,7 +1,7 @@ """ Utility functions. -Author: Matt Hall, scienxlab.com +Author: Matt Hall, scienxlab.org Licence: Apache 2.0 Copyright 2022 Redflag contributors @@ -21,17 +21,49 @@ from __future__ import annotations import warnings +import functools +import inspect from typing import Iterable, Any, Optional from numpy.typing import ArrayLike import numpy as np from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler -from scipy.stats import beta +from scipy import stats from scipy.optimize import fsolve from scipy.spatial.distance import pdist +def deprecated(instructions): + """ + Flags a method as deprecated. 
This decorator can be used to mark functions + as deprecated. It will result in a warning being emitted when the function + is used. + Args: + instructions (str): A human-friendly string of instructions, such + as: 'Please migrate to add_proxy() ASAP.' + Returns: + The decorated function. + """ + def decorator(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + message = 'Call to deprecated function {}. {}'.format( + func.__name__, + instructions) + + frame = inspect.currentframe().f_back + + warnings.warn_explicit(message, + category=DeprecationWarning, + filename=inspect.getfile(frame.f_code), + lineno=frame.f_lineno) + return func(*args, **kwargs) + return wrapper + return decorator + + def flatten(L: list[Any]) -> Iterable[Any]: """ Flattens a list. For example: @@ -163,9 +195,10 @@ def split_and_standardize(X: ArrayLike, y: ArrayLike, random_state: Optional[int Returns: tuple of ndarray: X, X_train, X_val, y, y_train, y_val """ + X = np.asarray(X) X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state) - if not is_standardized(X): + if not all(is_standard_normal(x) for x in X.T): scaler = StandardScaler().fit(X) X = scaler.transform(X) scaler = StandardScaler().fit(X_train) @@ -257,7 +290,7 @@ def stdev_to_proportion(threshold: float, d: float=1, n: float=1e9) -> float: >>> stdev_to_proportion(5, d=10) 0.9946544947734935 """ - return float(beta.cdf(x=1/n, a=d/2, b=(n-d-1)/2, scale=1/threshold**2)) + return float(stats.beta.cdf(x=1/n, a=d/2, b=(n-d-1)/2, scale=1/threshold**2)) def proportion_to_stdev(p: float, d: float=1, n: float=1e9) -> float: @@ -298,7 +331,8 @@ def proportion_to_stdev(p: float, d: float=1, n: float=1e9) -> float: return float(r_hat) -def is_standardized(a: ArrayLike, atol: float=1e-5) -> bool: +@deprecated("Use is_standard_normal() instead.") +def is_standardized(a: ArrayLike, atol: float=1e-3) -> bool: """ Returns True if the feature has zero mean and standard deviation of 1. 
In other words, if the feature appears to be a Z-score. @@ -321,6 +355,31 @@ def is_standardized(a: ArrayLike, atol: float=1e-5) -> bool: return bool((np.abs(μ) < atol) and (np.abs(σ - 1) < atol)) +def is_standard_normal(a: ArrayLike, confidence: float=0.95) -> bool: + """ + Performs the Kolmogorov-Smirnov test for normality. Returns True if the + feature appears to be normally distributed, with a mean close to zero and + standard deviation close to 1. + + Args: + a (array): The data. + confidence (float): The confidence level of the test, default 0.95 + (95% confidence). + + Returns: + bool: True if the feature appears to have a standard normal distribution. + + Example: + >>> a = np.random.normal(size=2000) + >>> is_standard_normal(a, confidence=0.9) + True + >>> is_standard_normal(a + 1) + False + """ + ks = stats.kstest(a, 'norm') + return ks.pvalue > (1 - confidence) + + def zscore(X: np.ndarray) -> np.ndarray: """ Transform array to Z-scores. If 2D, stats are computed @@ -433,7 +492,7 @@ def is_clipped(a: ArrayLike) -> bool: return (min_clips is not None) or (max_clips is not None) -def iter_groups(groups: ArrayLike) -> Iterator[np.ndarray]: +def iter_groups(groups: ArrayLike) -> Iterable[np.ndarray]: """ Allow iterating over groups, getting boolean array for each. 
diff --git a/tests/test_sklearn.py b/tests/test_sklearn.py index 4fb58ae..2f0c8d9 100644 --- a/tests/test_sklearn.py +++ b/tests/test_sklearn.py @@ -18,7 +18,7 @@ def test_clip_detector(): """ pipe = make_pipeline(rf.ClipDetector()) X = np.array([[2, 1], [3, 2], [4, 3], [5, 3]]) - with pytest.warns(UserWarning, match="Feature 1 may have clipped values."): + with pytest.warns(UserWarning, match="Feature 1 has samples that may be clipped."): pipe.fit_transform(X) # Does not warn: @@ -33,10 +33,47 @@ def test_correlation_detector(): pipe = make_pipeline(rf.CorrelationDetector()) rng = np.random.default_rng(0) X = np.stack([rng.uniform(size=20), np.sin(np.linspace(0, 1, 20))]).T - with pytest.warns(UserWarning, match="Feature 1 may have correlated values."): + with pytest.warns(UserWarning, match="Feature 1 has samples that may be correlated."): pipe.fit_transform(X) +def test_simple_multimodal_detector(): + """ + Checks for features with a multimodal distribution, considered across the + entire dataset (i.e. not per class). + """ + pipe = make_pipeline(rf.RegressionMultimodalDetector()) + rng = np.random.default_rng(0) + X1 = np.stack([rng.normal(size=80), rng.normal(size=80)]).T + X2 = np.stack([rng.normal(size=80), 3 + rng.normal(size=80)]).T + X = np.vstack([X1, X2]) + with pytest.warns(UserWarning, match="Feature 1 has samples that may be multimodally distributed."): + pipe.fit_transform(X) + + +def test_custom_detector(): + """ + Checks for data which fails a user-supplied test. 
+ """ + has_negative = lambda x: np.any(x < 0) + pipe = rf.make_detector_pipeline({has_negative: "are negative"}) + X = np.array([[-2, 1], [3, 2], [4, 3], [5, 4]]) + with pytest.warns(UserWarning, match="Feature 0 has samples that are negative."): + pipe.fit_transform(X) + + pipe = rf.make_detector_pipeline([has_negative]) + with pytest.warns(UserWarning, match="Feature 0 has samples that fail"): + pipe.fit_transform(X) + + detector = rf.Detector(has_negative) + X = np.random.random(size=(100, 2)) + y = np.random.random(size=100) - 0.1 + assert has_negative(y) + assert rf.is_continuous(y) + with pytest.warns(UserWarning, match="Target 0 has samples that fail"): + pipe.fit_transform(X, y) + + def test_distribution_comparator(): """ Checks that the distribution of test data (i.e. transformed only) is the @@ -62,7 +99,7 @@ def test_univariate_outlier_detector(): pipe = make_pipeline(rf.UnivariateOutlierDetector(factor=0.5)) rng = np.random.default_rng(0) X = rng.normal(size=1_000).reshape(-1, 1) - with pytest.warns(UserWarning, match="Feature 0 may have more outliers"): + with pytest.warns(UserWarning, match="Feature 0 has samples that are excess univariate outliers"): pipe.fit_transform(X) # Does not warn with factor of 2.5: @@ -75,7 +112,7 @@ def test_multivariate_outlier_detector(): pipe = make_pipeline(rf.MultivariateOutlierDetector(factor=0.5)) rng = np.random.default_rng(0) X = rng.normal(size=(1_000, 2)) - with pytest.warns(UserWarning, match="Dataset may have more outliers"): + with pytest.warns(UserWarning, match="Dataset has more multivariate outlier samples than expected."): pipe.fit_transform(X) # Does not warn with factor of 2.5: @@ -120,12 +157,11 @@ def test_imbalance_detector(): # Warns about wrong kind of y (continuous): y = rng.normal(size=100) - with pytest.warns(UserWarning, match="Target y is None or seems continuous"): + with pytest.warns(UserWarning, match="Target y seems continuous"): pipe.fit_transform(X, y) - # Warns about wrong kind of y 
(None): - with pytest.warns(UserWarning, match="Target y is None or seems continuous"): - pipe.fit_transform(X) + # No warning if y is None, just skips. + pipe.fit_transform(X) # Raises error because method doesn't exist: with pytest.raises(ValueError) as e: @@ -164,12 +200,11 @@ def test_imbalance_comparator(): # Warns about wrong kind of y (continuous): y = rng.normal(size=100) - with pytest.warns(UserWarning, match="Target y is None or seems continuous"): + with pytest.warns(UserWarning, match="Target y seems continuous"): pipe.fit_transform(X, y) - # Warns about wrong kind of y (None): - with pytest.warns(UserWarning, match="Target y is None or seems continuous"): - pipe.fit_transform(X) + # No warning if y is None, just skips: + pipe.fit_transform(X) # Raises error because threshold is wrong. with pytest.raises(ValueError) as e: