From f7df6427d0c69e20e3616773722263709c4061d9 Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 11:05:59 +0300 Subject: [PATCH 1/8] chore: Merged with geometric-smote repo --- .copier-answers.yml | 18 +- .github/ISSUE_TEMPLATE/bug_report.yaml | 4 +- .github/ISSUE_TEMPLATE/config.yml | 6 +- .github/PULL_REQUEST_TEMPLATE.md | 2 +- .github/workflows/ci-docs.yml | 8 +- .github/workflows/ci.yml | 4 +- .gitignore | 1 + CHANGELOG.md | 98 ++- LICENSE | 2 +- README.md | 83 +-- docs/examples/README.md | 2 +- docs/examples/applications/README.md | 2 +- .../applications/plot_mnist_example.py | 2 +- docs/examples/plot_cluster_oversamplers.py | 149 +++++ ...nism.py => plot_gsmote_data_generation.py} | 10 +- ...es.py => plot_gsmote_validation_curves.py} | 6 +- docs/examples/plot_kmeans_smote.py | 113 ++++ docs/generate_api.py | 2 +- docs/overview/user_guide.md | 159 ++++- mkdocs.yml | 12 +- noxfile.py | 6 +- pyproject.toml | 58 +- src/imblearn_extra/__init__.py | 5 + src/imblearn_extra/clover/__init__.py | 39 ++ .../clover/clusterer/__init__.py | 5 + src/imblearn_extra/clover/clusterer/_som.py | 308 ++++++++++ .../clover/distribution/__init__.py | 5 + .../clover/distribution/_density.py | 361 +++++++++++ .../clover/distribution/base.py | 187 ++++++ .../clover/over_sampling/__init__.py | 29 + .../clover/over_sampling/_cluster.py | 571 ++++++++++++++++++ .../clover/over_sampling/_gsomo.py | 263 ++++++++ .../clover/over_sampling/_kmeans_smote.py | 239 ++++++++ .../clover/over_sampling/_somo.py | 215 +++++++ src/{ => imblearn_extra}/gsmote/__init__.py | 0 .../gsmote/geometric_smote.py | 6 +- src/{gsmote => imblearn_extra}/py.typed | 0 tests/__init__.py | 2 +- tests/clover/__init__.py | 1 + tests/clover/clusterer/__init__.py | 1 + tests/clover/clusterer/test_som.py | 74 +++ tests/clover/distribution/__init__.py | 1 + tests/clover/distribution/test_base.py | 25 + tests/clover/distribution/test_density.py | 348 +++++++++++ tests/clover/over_sampling/__init__.py | 1 + tests/clover/over_sampling/test_cluster.py | 326 ++++++++++ tests/clover/over_sampling/test_gsomo.py | 180 ++++++ .../clover/over_sampling/test_kmeans_smote.py | 168 ++++++ tests/clover/over_sampling/test_somo.py | 174 ++++++ tests/gsmote/__init__.py | 7 + tests/{ => gsmote}/test_geometric_smote.py | 4 +- 51 files changed, 4158 insertions(+), 134 deletions(-) create mode 100644 docs/examples/plot_cluster_oversamplers.py rename docs/examples/{plot_data_generation_mechanism.py => plot_gsmote_data_generation.py} (97%) rename docs/examples/{plot_validation_curves.py => plot_gsmote_validation_curves.py} (98%) create mode 100644 docs/examples/plot_kmeans_smote.py create mode 100644 src/imblearn_extra/__init__.py create mode 100644 src/imblearn_extra/clover/__init__.py create mode 100644 src/imblearn_extra/clover/clusterer/__init__.py create mode 100644 src/imblearn_extra/clover/clusterer/_som.py create mode 100644 src/imblearn_extra/clover/distribution/__init__.py create mode 100644 src/imblearn_extra/clover/distribution/_density.py create mode 100644 src/imblearn_extra/clover/distribution/base.py create mode 100644 src/imblearn_extra/clover/over_sampling/__init__.py create mode 100644 src/imblearn_extra/clover/over_sampling/_cluster.py create mode 100644 src/imblearn_extra/clover/over_sampling/_gsomo.py create mode 100644 src/imblearn_extra/clover/over_sampling/_kmeans_smote.py create mode 100644 src/imblearn_extra/clover/over_sampling/_somo.py rename src/{ => imblearn_extra}/gsmote/__init__.py (100%) rename src/{ => 
imblearn_extra}/gsmote/geometric_smote.py (98%) rename src/{gsmote => imblearn_extra}/py.typed (100%) create mode 100644 tests/clover/__init__.py create mode 100644 tests/clover/clusterer/__init__.py create mode 100644 tests/clover/clusterer/test_som.py create mode 100644 tests/clover/distribution/__init__.py create mode 100644 tests/clover/distribution/test_base.py create mode 100644 tests/clover/distribution/test_density.py create mode 100644 tests/clover/over_sampling/__init__.py create mode 100644 tests/clover/over_sampling/test_cluster.py create mode 100644 tests/clover/over_sampling/test_gsomo.py create mode 100644 tests/clover/over_sampling/test_kmeans_smote.py create mode 100644 tests/clover/over_sampling/test_somo.py create mode 100644 tests/gsmote/__init__.py rename tests/{ => gsmote}/test_geometric_smote.py (99%) diff --git a/.copier-answers.yml b/.copier-answers.yml index 43648d6..c3bc6a8 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -1,18 +1,16 @@ -_commit: 0.8.0 -_src_path: gh:georgedouzas/copier-pdm-nox.git +_commit: 0.8.2 +_src_path: gh:georgedouzas/copier-pdm-nox author_email: gdouzas@icloud.com author_fullname: Georgios Douzas author_username: georgedouzas -copyright_date: '2019' +copyright_date: '2021' copyright_holder: Georgios Douzas copyright_holder_email: gdouzas@icloud.com copyright_license: MIT License -project_description: Implementation of the Geometric SMOTE algorithm, a geometrically - enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and - imbalanced-learn. -python_package_distribution_name: geometric-smote -python_package_import_name: gsmote -python_versions: '>=3.9, <3.12' -repository_name: geometric-smote +project_description: An implementation of novel oversampling algorithms. +python_package_distribution_name: imbalanced-learn-extra +python_package_import_name: imblearn_extra +python_versions: '>=3.10, <3.13' +repository_name: imbalanced-learn-extra repository_namespace: georgedouzas diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml index b47675e..17a3b64 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yaml +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -7,7 +7,7 @@ body: attributes: value: > **Before submitting a bug, please make sure the issue hasn't been already addressed by searching - through [the past issues](https://github.com/georgedouzas/geometric-smote/issues).** + through [the past issues](https://github.com/georgedouzas/imbalanced-learn-extra/issues).** - type: textarea attributes: label: Describe the bug @@ -52,7 +52,7 @@ body: description: | Please provide the following information. 
placeholder: > - `geometric-smote` version + `imbalanced-learn-extra` version Python version OS validations: diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index b2a0071..5c247df 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -1,11 +1,11 @@ blank_issues_enabled: true contact_links: - name: Discussions - url: https://github.com/georgedouzas/geometric-smote/discussions + url: https://github.com/georgedouzas/imbalanced-learn-extra/discussions about: Ask questions and discuss with other community members - name: Gitter - url: https://gitter.im/geometric-smote/community + url: https://gitter.im/imbalanced-learn-extra/community about: Users and developers can sometimes be found on the gitter channel - name: Blank issue - url: https://github.com/georgedouzas/geometric-smote/issues/new + url: https://github.com/georgedouzas/imbalanced-learn-extra/issues/new about: Please note that Github Discussions should be used in most cases instead diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index f6c15e7..c316b9d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -4,7 +4,7 @@ Please check the following: -- [ ] You have checked the [Pull Request guidelines](https://github.com/georgedouzas/geometric-smote/blob/master/.github/CONTRIBUTING.md). +- [ ] You have checked the [Pull Request guidelines](https://github.com/georgedouzas/imbalanced-learn-extra/blob/master/.github/CONTRIBUTING.md). - [ ] Tests for bug fixes or new features have been added. - [ ] Docs have been added or updated. diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 4eee97e..19b1a49 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -15,7 +15,6 @@ jobs: os: - ubuntu-latest - macos-latest - - windows-latest runs-on: ${{ matrix.os }} @@ -29,7 +28,7 @@ jobs: python-version: | 3.10 3.11 - + 3.12 - name: Set up PDM uses: pdm-project/setup-pdm@v3 @@ -56,7 +55,6 @@ jobs: os: - ubuntu-latest - macos-latest - - windows-latest runs-on: ${{ matrix.os }} @@ -70,7 +68,7 @@ jobs: python-version: | 3.10 3.11 - + 3.12 - name: Set up PDM uses: pdm-project/setup-pdm@v3 @@ -90,7 +88,7 @@ jobs: strategy: matrix: - python-version: ['3.10', '3.11'] + python-version: ['3.10', '3.11', '3.12'] steps: - name: Checkout diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d68cd38..b372b84 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,6 @@ jobs: os: - ubuntu-latest - macos-latest - - windows-latest runs-on: ${{ matrix.os }} @@ -28,6 +27,7 @@ jobs: python-version: | 3.10 3.11 + 3.12 - name: Set up PDM uses: pdm-project/setup-pdm@v3 @@ -54,7 +54,6 @@ jobs: os: - ubuntu-latest - macos-latest - - windows-latest runs-on: ${{ matrix.os }} @@ -68,6 +67,7 @@ jobs: python-version: | 3.10 3.11 + 3.12 - name: Set up PDM uses: pdm-project/setup-pdm@v3 diff --git a/.gitignore b/.gitignore index 6285262..0bbd589 100644 --- a/.gitignore +++ b/.gitignore @@ -20,3 +20,4 @@ __pypackages__/ .ruff_cache .vscode .DS_Store +.tool-versions \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 28c65de..6f2b229 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,45 +6,107 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). 
-## [0.2.3](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.3) - 2023-12-02 +## [0.7.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.7.0) - 2024-06-28 -[Compare with 0.2.2](https://github.com/georgedouzas/geometric-smote/compare/0.2.2...0.2.3) +[Compare with 0.6.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.6.0...0.7.0) + +### Features + +- Merge `geometric-smote` and `cluster-over-sampling` projects ([37ea792](https://github.com/georgedouzas/imbalanced-learn-extra/commit/37ea792249d8bc33c50b662f97d41f3ba00711c7) by georgedouzas). ### Docs -- Fix scikit-learn link ([26d9c99](https://github.com/georgedouzas/geometric-smote/commit/26d9c993102677e55d134bb8eabce022188b283a) by georgedouzas). +- Split long docstrings ([d54699c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/d54699cecd6a4c7c96b339a1ab23883dbb7e727f) by georgedouzas). +- Modify expression ([7237221](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7237221689d64a36526ea344e053c68adfff0dd0) by georgedouzas). + +### Chore + +- Update copier template ([9b9ae8c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/9b9ae8cc8b824a84b6134c94cb1365dd532242d1) by georgedouzas). + +## [0.6.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.6.0) - 2023-10-02 -## [0.2.2](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.2) - 2023-12-02 +[Compare with 0.5.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.5.1...0.6.0) -[Compare with 0.2.1](https://github.com/georgedouzas/geometric-smote/compare/0.2.1...0.2.2) +### Features + +- Include SOM in `clusterer` module ([4c6a8af](https://github.com/georgedouzas/imbalanced-learn-extra/commit/4c6a8af3fd89ee272487fdb807fb9347f0fcb55c) by georgedouzas). ### Docs -- Fix typo ([84f4738](https://github.com/georgedouzas/geometric-smote/commit/84f4738bcf3d28b342c7d7ddd07e0d32856d25e3) by georgedouzas). +- Remove geometric-smote as optional dependency ([24968c2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/24968c272f11cec3ffe5551c028a99c6e0cd14bc) by georgedouzas). +- Silence warning for initialization ([ed4fed2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ed4fed22c5ff6b8c30e66a57806bb8468362380f) by georgedouzas). + +### Tests + +- Update tests and fix bugs ([e81c837](https://github.com/georgedouzas/imbalanced-learn-extra/commit/e81c837738f34bd326bf35de003e561cc3466146) by georgedouzas). ### Chore -- Release 0.2.2 ([7aa8d9c](https://github.com/georgedouzas/geometric-smote/commit/7aa8d9c94372b83195b79ba357a21d74ec1aa1a6) by georgedouzas). +- Release 0.6.0 ([a5040dc](https://github.com/georgedouzas/imbalanced-learn-extra/commit/a5040dc5db70c2eabf939de435b4130330d0724e) by georgedouzas). +- Ignore PDM local config file ([ed0cca2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ed0cca2d4f8adc50119bb205071443a67c52d75b) by georgedouzas). +- Install conda to tests jobs ([02fce3d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/02fce3ddba653ef0d8bfbc57233012cf95168de1) by georgedouzas). +- Update copier template ([5680a3e](https://github.com/georgedouzas/imbalanced-learn-extra/commit/5680a3e41a075a2101ec4570b8d2416a6709d332) by georgedouzas). +- Use conda backend ([1253427](https://github.com/georgedouzas/imbalanced-learn-extra/commit/12534274eb82abe9ade5f3a60a946114125c7489) by georgedouzas). 
+ +## [0.5.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.5.1) - 2023-04-13 + +[Compare with 0.4.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.4.0...0.5.1) + +### Features + +- Restructure project using copier template ([57293ae](https://github.com/georgedouzas/imbalanced-learn-extra/commit/57293aee12332ae0ca7f8b8f862e8b143d11bed6) by georgedouzas). + +### Docs + +- Update changelog for version 0.5.0 ([375e79e](https://github.com/georgedouzas/imbalanced-learn-extra/commit/375e79e3bbc95d175f68d8a4ac1e79e628aaaeeb) by georgedouzas). +- Fix example ([438d6a5](https://github.com/georgedouzas/imbalanced-learn-extra/commit/438d6a579d8bc9d175e1fc5656fe38acc4868253) by georgedouzas). + +### Style -## [0.2.1](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.1) - 2023-12-02 +- Add a blank line after docstring ([ce7524d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ce7524d5db989c672c7e2994ce33b8a18847c20f) by georgedouzas). -[Compare with 0.2.0](https://github.com/georgedouzas/geometric-smote/compare/0.2.0...0.2.1) +### Code Refactoring + +- Add stack level for warning ([c5925db](https://github.com/georgedouzas/imbalanced-learn-extra/commit/c5925db18cdff1a45852bd0851b2a567daa481a1) by georgedouzas). ### Chore -- Release 0.2.1 ([8734fb6](https://github.com/georgedouzas/geometric-smote/commit/8734fb609f65586c007f2f9d8cea8915d10625fe) by georgedouzas). -- Remove Python 3.9 from CI ([c1aeb9b](https://github.com/georgedouzas/geometric-smote/commit/c1aeb9be9e4004ced4c50627ca0284d4c71dc6f7) by georgedouzas). -- Restructure project with copier template ([73b3280](https://github.com/georgedouzas/geometric-smote/commit/73b32804165ef6875382c85c42a5000ad27e53b4) by georgedouzas). +- Update copier template ([d20c7dc](https://github.com/georgedouzas/imbalanced-learn-extra/commit/d20c7dc9ec73127cd41ecda69505b5b8adb07f23) by georgedouzas). +- Release 0.5.0 ([080a2bb](https://github.com/georgedouzas/imbalanced-learn-extra/commit/080a2bb01f9315bf876f71463625ceee1e89db3d) by georgedouzas). 
+ +## [0.4.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.4.0) - 2023-02-16 + +[Compare with 0.2.5](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.5...0.4.0) + +## [0.2.5](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.5) - 2020-07-24 + +[Compare with 0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.4...0.2.5) + +## [0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.4) - 2020-07-21 + +[Compare with 0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.3...0.2.4) + +## [0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.3) - 2020-07-21 + +[Compare with 0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.2...0.2.3) + +## [0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.2) - 2020-04-08 + +[Compare with 0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.1...0.2.2) + +## [0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.1) - 2020-04-08 + +[Compare with 0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.0...0.2.1) -## [0.2.0](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.0) - 2022-03-12 +## [0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.0) - 2020-04-07 -[Compare with 0.1.3](https://github.com/georgedouzas/geometric-smote/compare/0.1.3...0.2.0) +[Compare with 0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.2...0.2.0) -## [0.1.3](https://github.com/georgedouzas/geometric-smote/releases/tag/0.1.3) - 2019-12-13 +## [0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.2) - 2020-03-30 -[Compare with 0.1.2](https://github.com/georgedouzas/geometric-smote/compare/0.1.2...0.1.3) +[Compare with 0.1.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.1...0.1.2) -## [0.1.2](https://github.com/georgedouzas/geometric-smote/releases/tag/0.1.2) - 2019-07-09 +## [0.1.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.1) - 2019-08-17 -[Compare with first commit](https://github.com/georgedouzas/geometric-smote/compare/801d7f49ebce70a48a7d9e30d5820765b5a1d511...0.1.2) +[Compare with first commit](https://github.com/georgedouzas/imbalanced-learn-extra/compare/e209568f6d0b02df1f1d06d5e79ba2300f2f4d23...0.1.1) diff --git a/LICENSE b/LICENSE index b354685..7b84c16 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) 2019 Georgios Douzas +Copyright (c) 2021 Georgios Douzas Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index 37a1ddf..9cbbd10 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,8 @@ +[scikit-learn]: +[imbalanced-learn]: +[SOMO]: +[KMeans-SMOTE]: +[G-SOMO]: [black badge]: [black]: [docformatter badge]: @@ -8,25 +13,19 @@ [mypy]: [mkdocs badge]: [mkdocs]: -[version badge]: -[pythonversion badge]: -[downloads badge]: -[gitter]: +[version badge]: +[pythonversion badge]: +[downloads badge]: +[gitter]: [gitter badge]: -[discussions]: -[discussions badge]: -[ci]: -[ci badge]: -[doc]: -[doc badge]: +[discussions]: +[discussions badge]: +[ci]: +[ci badge]: +[doc]: +[doc badge]: -[![Project Status: Inactive – The project has reached a stable, usable state but is no longer being actively developed; 
-support/maintenance will be provided as time
-allows.](https://www.repostatus.org/badges/latest/inactive.svg)](https://www.repostatus.org/#inactive)
-
-> **The project has been moved to [imbalanced-learn-extra](https://github.com/georgedouzas/imbalanced-learn-extra).**
-
-# geometric-smote
+# imbalanced-learn-extra
 
 [![ci][ci badge]][ci]
 [![doc][doc badge]][doc]
@@ -39,59 +38,75 @@ allows.](https://www.repostatus.org/badges/latest/inactive.svg)](https://www.rep
 
 ## Introduction
 
-The package `geometric-smote` implements the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It
-is compatible with scikit-learn and imbalanced-learn. The Geometric SMOTE algorithm can handle numerical as well as categorical
-features.
+`imbalanced-learn-extra` is a Python package that extends [imbalanced-learn]. It implements algorithms that are not included in
+[imbalanced-learn] due to their novelty or lower citation count. The current version includes the following:
+
+- A general interface for clustering-based oversampling algorithms.
+
+- The Geometric SMOTE algorithm. It is a geometrically enhanced drop-in replacement for SMOTE that handles both numerical and
+categorical features.
 
 ## Installation
 
-For user installation, `geometric-smote` is currently available on the PyPi's repository, and you can
+For user installation, `imbalanced-learn-extra` is available on PyPI, and you can
 install it via `pip`:
 
 ```bash
-pip install geometric-smote
+pip install imbalanced-learn-extra
 ```
 
 Development installation requires cloning the repository and then using [PDM](https://github.com/pdm-project/pdm) to install the
 project as well as the main and development dependencies:
 
 ```bash
-git clone https://github.com/georgedouzas/geometric-smote.git
-cd geometric-smote
+git clone https://github.com/georgedouzas/imbalanced-learn-extra.git
+cd imbalanced-learn-extra
 pdm install
 ```
 
+The SOM clusterer requires optional dependencies:
+
+```bash
+pip install imbalanced-learn-extra[som]
+```
+
 ## Usage
 
-All the classes included in `geometric-smote` follow the [imbalanced-learn](https://imbalanced-learn.org/stable/) API using the
-functionality of the base oversampler. Using [scikit-learn](https://scikit-learn.org/stable/) convention, the data are represented
-as follows:
+All the classes included in `imbalanced-learn-extra` follow the [imbalanced-learn] API, using the functionality of the base
+oversampler. Using the [scikit-learn] convention, the data are represented as follows:
 
 - Input data `X`: 2D array-like or sparse matrices.
 - Targets `y`: 1D array-like.
 
-The clustering-based oversamplers implement a `fit` method to learn from `X` and `y`:
+The oversamplers implement a `fit` method to learn from `X` and `y`:
 
 ```python
-gsmote_oversampler.fit(X, y)
+oversampler.fit(X, y)
 ```
 
 They also implement a `fit_resample` method to resample `X` and `y`:
 
 ```python
-X_resampled, y_resampled = gsmote.fit_resample(X, y)
+X_resampled, y_resampled = oversampler.fit_resample(X, y)
 ```
 
-## Citing `geometric-smote`
+## Citing `imbalanced-learn-extra`
+
+Publications using clustering-based oversampling:
+
+- [G. Douzas, F. Bacao, "Self-Organizing Map Oversampling (SOMO) for imbalanced data set learning", Expert Systems with
+  Applications, vol. 82, pp. 40-52, 2017.][SOMO]
+- [G. Douzas, F. Bacao, F. Last, "Improving imbalanced learning through a heuristic oversampling method based on k-means and
+  SMOTE", Information Sciences, vol. 465, pp. 1-20, 2018.][KMeans-SMOTE]
+- [G. Douzas, F. Bacao, F. Last, "G-SOMO: An oversampling approach based on self-organized maps and geometric SMOTE", Expert
+  Systems with Applications, vol. 183, 115230, 2021.][G-SOMO]
 
-If you use `geometric-smote` in a scientific publication, we would appreciate citations to the following paper:
+Publications using Geometric-SMOTE:
 
 - Douzas, G., Bacao, B. (2019). Geometric SMOTE: a geometrically enhanced drop-in replacement for SMOTE. Information Sciences,
 501, 118-135.
 
-Publications using Geometric-SMOTE:
-
 - Fonseca, J., Douzas, G., Bacao, F. (2021). Increasing the Effectiveness of Active Learning: Introducing Artificial Data
 Generation in Active Learning for Land Use/Land Cover Classification. Remote Sensing, 13(13), 2619.
diff --git a/docs/examples/README.md b/docs/examples/README.md
index e116f8d..4c9b430 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -1,3 +1,3 @@
 # General examples
 
-A collection of examples for `geometric-smote` package.
+A collection of examples for the `imblearn_extra` package.
diff --git a/docs/examples/applications/README.md b/docs/examples/applications/README.md
index 72b03d8..a2d3511 100644
--- a/docs/examples/applications/README.md
+++ b/docs/examples/applications/README.md
@@ -1,3 +1,3 @@
 # Applications
 
-Examples of applications for the `geometric-smote` package.
\ No newline at end of file
+Examples of applications for the `imblearn_extra` package.
diff --git a/docs/examples/applications/plot_mnist_example.py b/docs/examples/applications/plot_mnist_example.py
index 4aca928..c86c0e6 100644
--- a/docs/examples/applications/plot_mnist_example.py
+++ b/docs/examples/applications/plot_mnist_example.py
@@ -14,10 +14,10 @@
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
-from gsmote import GeometricSMOTE
 from imblearn.datasets import make_imbalance
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline
+from imblearn_extra.gsmote import GeometricSMOTE
 from sklearn.datasets import fetch_openml
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
diff --git a/docs/examples/plot_cluster_oversamplers.py b/docs/examples/plot_cluster_oversamplers.py
new file mode 100644
index 0000000..0bb62fa
--- /dev/null
+++ b/docs/examples/plot_cluster_oversamplers.py
@@ -0,0 +1,149 @@
+"""
+Clustering-based over-sampling
+==============================
+
+This example illustrates the data generation
+process and the performance of various
+over-samplers when clustering-based over-sampling
+is used.
+""" + +# Author: Georgios Douzas +# Licence: MIT + +import matplotlib.pyplot as plt +import pandas as pd +from imblearn.over_sampling import SMOTE, BorderlineSMOTE, RandomOverSampler +from imblearn.pipeline import make_pipeline +from imblearn_extra.clover.over_sampling import ClusterOverSampler +from sklearn.base import clone +from sklearn.cluster import AgglomerativeClustering, KMeans +from sklearn.datasets import make_classification +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.metrics import f1_score +from sklearn.model_selection import train_test_split + +RANDOM_STATE = 0 +OVERSAMPLERS = [ + RandomOverSampler(random_state=RANDOM_STATE), + SMOTE(random_state=RANDOM_STATE + 1), + BorderlineSMOTE(random_state=RANDOM_STATE + 2), +] +KMEANS = KMeans(random_state=RANDOM_STATE, n_clusters=100, n_init='auto') +AGGL = AgglomerativeClustering(n_clusters=100) + + +def generate_imbalanced_data(): + """Generate imbalanced data.""" + X, y = make_classification( + n_classes=3, + class_sep=0.8, + weights=[0.01, 0.05, 0.94], + n_informative=2, + n_redundant=0, + n_repeated=0, + n_features=2, + n_clusters_per_class=1, + n_samples=2000, + random_state=RANDOM_STATE, + ) + return X, y + + +def plot_data(X, y, oversampler, ax): + """Plot original or resampled data.""" + if oversampler is None: + X_res, y_res = X, y + title = 'Original data' + else: + oversampler = clone(oversampler) + X_res, y_res = oversampler.fit_resample(X, y) + if not isinstance(oversampler, ClusterOverSampler): + ovs_name = oversampler.__class__.__name__ + title = f'Resampling using {ovs_name}' + else: + clusterer_name = oversampler.clusterer.__class__.__name__ + ovs_name = oversampler.oversampler_.__class__.__name__ + title = f'Resampling using {clusterer_name}-{ovs_name}' + ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + ax.set_title(title) + + +def compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, clusterer): + """Compare F1 scores of oversamplers with and without clustering.""" + ovs_clf = make_pipeline(clone(oversampler), clf) + clr_ovs_clf = make_pipeline(ClusterOverSampler(clone(oversampler), clusterer), clf) + y_pred = ovs_clf.fit(X_train, y_train).predict(X_test) + y_pred_clr = clr_ovs_clf.fit(X_train, y_train).predict(X_test) + ovs_name = oversampler.__class__.__name__ + ovs_score = f1_score(y_test, y_pred, average='macro') + clr_ovs_score = f1_score(y_test, y_pred_clr, average='macro') + return (ovs_name, ovs_score, clr_ovs_score) + + +# %% +# Generate imbalanced data +# ------------------------ +# +# We are generating a highly imbalanced multi-class data set, using +# `make_classification` from scikit-learn. + +X, y = generate_imbalanced_data() +_, ax = plt.subplots(1, 1, figsize=(15, 7)) +plot_data(X, y, None, ax) + +# %% +# Effect of clustering to over-samplers +# ------------------------------------- +# +# Clustering based over-sampling allows to identify areas of the input space +# which are appropriate to generate artificial data. Therefore, the generation +# of noisy samples is avoided and the within-classes imbalanced issue is also +# addressed. The next plots show the resampled data when clustering is applied, +# comparing them to the resampled data of the initial over-samplers. 
+
+fig, axs = plt.subplots(3, 2, figsize=(15, 15))
+for (ax1, ax2), oversampler in zip(axs, OVERSAMPLERS, strict=True):
+    plot_data(X, y, clone(oversampler), ax1)
+    plot_data(X, y, ClusterOverSampler(oversampler, KMEANS), ax2)
+fig.tight_layout()
+
+# %%
+# Performance evaluation of clustering-based over-sampling
+# --------------------------------------------------------
+#
+# We are evaluating various over-samplers using the F1-score as the evaluation
+# metric on a test set. The scores with and without clustering are compared.
+
+clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
+scores = []
+for oversampler in OVERSAMPLERS:
+    scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, KMEANS))
+scores = (
+    pd.DataFrame(scores)
+    .rename(columns={0: 'Oversamplers', 1: ('F-score', 'No clustering'), 2: ('F-score', 'Clustering')})
+    .set_index('Oversamplers')
+)
+scores.columns = pd.MultiIndex.from_tuples(scores.columns)
+scores
+
+# %%
+# We repeat the process for AgglomerativeClustering instead of KMeans.
+
+scores = []
+for oversampler in OVERSAMPLERS:
+    scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, AGGL))
+scores = (
+    pd.DataFrame(scores)
+    .rename(columns={0: 'Oversamplers', 1: ('F-score', 'No clustering'), 2: ('F-score', 'Clustering')})
+    .set_index('Oversamplers')
+)
+scores.columns = pd.MultiIndex.from_tuples(scores.columns)
+scores
diff --git a/docs/examples/plot_data_generation_mechanism.py b/docs/examples/plot_gsmote_data_generation.py
similarity index 97%
rename from docs/examples/plot_data_generation_mechanism.py
rename to docs/examples/plot_gsmote_data_generation.py
index 06dbfda..5f3fb4a 100644
--- a/docs/examples/plot_data_generation_mechanism.py
+++ b/docs/examples/plot_gsmote_data_generation.py
@@ -1,6 +1,6 @@
 """
-Data generation mechanism
-=========================
+G-SMOTE data generation
+=======================
 
 This example illustrates the Geometric
 SMOTE data generation mechanism and the
 usage of its
@@ -12,8 +12,8 @@
 
 import matplotlib.pyplot as plt
 import numpy as np
-from gsmote import GeometricSMOTE
 from imblearn.over_sampling import SMOTE
+from imblearn_extra.gsmote import GeometricSMOTE
 from sklearn.datasets import make_blobs
 
 XLIM, YLIM = [-3.0, 3.0], [0.0, 4.0]
@@ -56,7 +56,7 @@ def plot_hyperparameters(oversampler, X, y, param, vals, n_subplots):
     _, ax_arr = plt.subplots(*n_subplots, figsize=(15, 7 if n_rows > 1 else 3.5))
     if n_rows > 1:
         ax_arr = [ax for axs in ax_arr for ax in axs]
-    for ax, val in zip(ax_arr, vals):
+    for ax, val in zip(ax_arr, vals, strict=True):
         oversampler.set_params(**{param: val})
         X_res, y_res = oversampler.fit_resample(X, y)
         ax.scatter(X_res[y_res == 1, 0], X_res[y_res == 1, 1], label='Positive Class')
@@ -71,7 +71,7 @@ def plot_comparison(oversamplers, X, y):
     samples.
""" _, ax_arr = plt.subplots(1, 2, figsize=(15, 5)) - for ax, (name, ovs) in zip(ax_arr, oversamplers): + for ax, (name, ovs) in zip(ax_arr, oversamplers, strict=True): X_res, y_res = ovs.fit_resample(X, y) ax.scatter(X_res[y_res == 1, 0], X_res[y_res == 1, 1], label='Positive Class') ax.scatter(X_res[y_res == 0, 0], X_res[y_res == 0, 1], label='Negative Class') diff --git a/docs/examples/plot_validation_curves.py b/docs/examples/plot_gsmote_validation_curves.py similarity index 98% rename from docs/examples/plot_validation_curves.py rename to docs/examples/plot_gsmote_validation_curves.py index 06564e6..8cc6f27 100644 --- a/docs/examples/plot_validation_curves.py +++ b/docs/examples/plot_gsmote_validation_curves.py @@ -1,6 +1,6 @@ """ -Plotting validation curves -========================== +G-SMOTE validation curves +========================= In this example the impact of the Geometric SMOTE's hyperparameters is examined. The validation scores of a Geometric SMOTE-GBC classifier is presented for @@ -12,9 +12,9 @@ import matplotlib.pyplot as plt import numpy as np -from gsmote import GeometricSMOTE from imblearn.metrics import geometric_mean_score from imblearn.pipeline import make_pipeline +from imblearn_extra.gsmote import GeometricSMOTE from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.metrics import make_scorer diff --git a/docs/examples/plot_kmeans_smote.py b/docs/examples/plot_kmeans_smote.py new file mode 100644 index 0000000..6f96061 --- /dev/null +++ b/docs/examples/plot_kmeans_smote.py @@ -0,0 +1,113 @@ +""" +KMeans-SMOTE algorithm +====================== + +This example illustrates the data generation +process and the performance of KMeans-SMOTE. +""" + +# Author: Georgios Douzas +# Licence: MIT + +import matplotlib.pyplot as plt +import pandas as pd +from imblearn.over_sampling import SMOTE +from imblearn.pipeline import make_pipeline +from imblearn_extra.clover.over_sampling import KMeansSMOTE +from sklearn.base import clone +from sklearn.datasets import make_classification +from sklearn.ensemble import GradientBoostingClassifier +from sklearn.metrics import f1_score +from sklearn.model_selection import train_test_split + +RANDOM_STATE = 2 +OVERSAMPLERS = [ + SMOTE(random_state=RANDOM_STATE), + KMeansSMOTE(random_state=RANDOM_STATE + 3), +] + + +def generate_imbalanced_data(): + """Generate imbalanced data.""" + X, y = make_classification( + n_classes=3, + flip_y=0.05, + weights=[0.15, 0.6, 0.25], + n_informative=2, + n_redundant=0, + n_repeated=0, + n_features=2, + n_clusters_per_class=1, + n_samples=1000, + random_state=RANDOM_STATE, + ) + return X, y + + +def plot_data(X, y, oversampler, ax): + """Plot original or resampled data.""" + if oversampler is None: + X_res, y_res = X, y + title = 'Original data' + else: + oversampler = clone(oversampler) + X_res, y_res = oversampler.fit_resample(X, y) + ovs_name = oversampler.__class__.__name__ + title = f'Resampling using {ovs_name}' + ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k') + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.get_xaxis().tick_bottom() + ax.get_yaxis().tick_left() + ax.spines['left'].set_position(('outward', 10)) + ax.spines['bottom'].set_position(('outward', 10)) + ax.set_title(title) + + +def compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler): + """Compare F1 scores of oversamplers.""" + ovs_clf = make_pipeline(clone(oversampler), clf) + y_pred = ovs_clf.fit(X_train, 
y_train).predict(X_test)
+    ovs_name = oversampler.__class__.__name__
+    ovs_score = f1_score(y_test, y_pred, average='macro')
+    return (ovs_name, ovs_score)
+
+
+# %%
+# Generate imbalanced data
+# ------------------------
+#
+# We are generating an imbalanced multi-class data set, using
+# ``make_classification`` from scikit-learn.
+
+X, y = generate_imbalanced_data()
+_, ax = plt.subplots(1, 1, figsize=(15, 7))
+plot_data(X, y, None, ax)
+
+# %%
+# Plot resampled data
+# -------------------
+#
+# KMeans-SMOTE identifies areas of the input space that are appropriate for generating
+# artificial data. Therefore, the generation of noisy samples is avoided and the within-class
+# imbalance issue is also addressed. The next plots show the resampled data of
+# KMeans-SMOTE vs SMOTE.
+
+fig, axs = plt.subplots(1, 2, figsize=(15, 5))
+for ax, oversampler in zip(axs, OVERSAMPLERS, strict=True):
+    plot_data(X, y, clone(oversampler), ax)
+fig.tight_layout()
+
+# %%
+# Performance evaluation
+# ----------------------
+#
+# We are evaluating the performance of KMeans-SMOTE using the F1-score as the evaluation
+# metric on a test set. SMOTE's performance is also included.
+
+clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
+scores = []
+for oversampler in OVERSAMPLERS:
+    scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler))
+pd.DataFrame(scores).rename(columns={0: 'Oversamplers', 1: 'F-score'}).set_index('Oversamplers')
diff --git a/docs/generate_api.py b/docs/generate_api.py
index ef36d94..64dae42 100644
--- a/docs/generate_api.py
+++ b/docs/generate_api.py
@@ -18,7 +18,7 @@
 ]
 for path in paths:
     module_path = path.relative_to('src').with_suffix('')
-    doc_path = path.relative_to('src', 'gsmote').with_suffix('.md')
+    doc_path = path.relative_to('src', 'imblearn_extra').with_suffix('.md')
     full_doc_path = Path('api', doc_path)
 
     parts = tuple(module_path.parts)
diff --git a/docs/overview/user_guide.md b/docs/overview/user_guide.md
index 576eb4a..6acd150 100644
--- a/docs/overview/user_guide.md
+++ b/docs/overview/user_guide.md
@@ -7,12 +7,129 @@
 
 # User guide
 
+`imbalanced-learn-extra` is a Python package that extends [imbalanced-learn]. It implements algorithms that are not included in
+[imbalanced-learn] due to their novelty or lower citation count. The current version includes the following:
+
+- A general interface for clustering-based oversampling algorithms that introduces the `ClusterOverSampler` class, while the
+  `KMeansSMOTE`, `SOMO` and `GeometricSOMO` classes are provided for convenience. The distribution of the generated samples to the
+  clusters is controlled by the `distributor` parameter, with `DensityDistributor` being an example of a distributor that is based
+  on the density of the clusters.
+
+- The Geometric SMOTE algorithm as a geometrically enhanced drop-in replacement for SMOTE that handles both numerical and
+categorical features.
+
+## Clustering-based oversampling
+
+Initially, we generate multi-class imbalanced data represented by the input data `X` and targets `y`:
+
+```python
+>>> from collections import Counter
+>>> from sklearn.datasets import make_classification
+>>> X, y = make_classification(n_classes=3, weights=[0.10, 0.10, 0.80], random_state=0, n_informative=10)
+>>> print(sorted(Counter(y).items()))
+[(np.int64(0), 10), (np.int64(1), 10), (np.int64(2), 80)]
+```
+
+Below we provide some examples of the `imblearn_extra.clover` functionality.
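+
+All the oversamplers follow the [imbalanced-learn] API, so the `SOMO` and `GeometricSOMO` convenience classes are used in the
+same way as the `KMeansSMOTE` class below. As a minimal sketch, assuming the default hyperparameters are suitable for the data
+at hand:
+
+```python
+>>> from imblearn_extra.clover.over_sampling import SOMO
+>>> somo = SOMO(random_state=5)
+>>> X_resampled, y_resampled = somo.fit_resample(X, y)
+```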
+
+### KMeans-SMOTE algorithm
+
+The KMeans-SMOTE[^2] algorithm is a combination of the [KMeans] clusterer and the [SMOTE] oversampler, and it is implemented by
+the `KMeansSMOTE` class. We initialize it with the default parameters and use it to resample the data:
+
+```python
+>>> from imblearn_extra.clover.over_sampling import KMeansSMOTE
+>>> kmeans_smote = KMeansSMOTE(random_state=5)
+>>> X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
+>>> print(sorted(Counter(y_resampled).items()))
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
+```
+
+The augmented data set can be used instead of the original data set to train a classifier:
+
+```python
+>>> from sklearn.tree import DecisionTreeClassifier
+>>> clf = DecisionTreeClassifier()
+>>> clf.fit(X_resampled, y_resampled)
+DecisionTreeClassifier()
+```
+
+### Combining clusterers and oversamplers
+
+The `ClusterOverSampler` class makes it possible to combine [imbalanced-learn] oversamplers with [scikit-learn] clusterers. This
+is achieved through the use of the parameters `oversampler` and `clusterer`. For example, we can select [BorderlineSMOTE] as the
+oversampler and [DBSCAN] as the clustering algorithm:
+
+```python
+>>> from sklearn.cluster import DBSCAN
+>>> from imblearn.over_sampling import BorderlineSMOTE
+>>> from imblearn_extra.clover.over_sampling import ClusterOverSampler
+>>> dbscan_bsmote = ClusterOverSampler(oversampler=BorderlineSMOTE(random_state=5), clusterer=DBSCAN())
+>>> X_resampled, y_resampled = dbscan_bsmote.fit_resample(X, y)
+>>> print(sorted(Counter(y_resampled).items()))
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
+```
+
+Additionally, if the clusterer supports a neighboring structure for the clusters through a `neighbors_` attribute, then it can
+be used to generate inter-cluster artificial data, similarly to the SOMO[^1] and G-SOMO[^3] algorithms.
+
+### Adjusting the distribution of generated samples
+
+The parameter `distributor` of the `ClusterOverSampler` is used to define the distribution of the generated samples to the
+clusters.
+The `DensityDistributor` class implements a density-based distribution and it is the default `distributor` for all
+objects of the `ClusterOverSampler` class:
+
+```python
+>>> from sklearn.cluster import AgglomerativeClustering
+>>> from imblearn.over_sampling import SMOTE
+>>> agg_smote = ClusterOverSampler(oversampler=SMOTE(random_state=5), clusterer=AgglomerativeClustering())
+>>> agg_smote.fit(X, y)
+>>> agg_smote.distributor_
+DensityDistributor()
+```
+
+A `DensityDistributor` object can be parametrized:
+
+```python
+>>> from imblearn_extra.clover.distribution import DensityDistributor
+>>> distributor = DensityDistributor(distances_exponent=0)
+```
+
+In order to distribute the samples, a `labels` parameter is required, while `neighbors` is optional:
+
+```python
+>>> from sklearn.cluster import KMeans
+>>> clusterer = KMeans(n_clusters=4, random_state=1).fit(X, y)
+>>> labels = clusterer.labels_
+```
+
+The distribution of the samples is provided by the `fit_distribute` method and it is described in the `intra_distribution`
+and `inter_distribution` dictionaries:
+
+```python
+>>> intra_distribution, inter_distribution = distributor.fit_distribute(X, y, labels, neighbors=None)
+>>> print(distributor.filtered_clusters_)
+[(np.int32(3), np.int64(1)), (np.int32(1), np.int64(0)), (np.int32(1), np.int64(1))]
+>>> print(distributor.clusters_density_)
+{(np.int32(3), np.int64(1)): np.float64(3.0), (np.int32(1), np.int64(0)): np.float64(7.0), (np.int32(1), np.int64(1)): np.float64(7.0)}
+>>> print(intra_distribution)
+{(np.int32(3), np.int64(1)): np.float64(0.7), (np.int32(1), np.int64(0)): np.float64(1.0), (np.int32(1), np.int64(1)): np.float64(0.3)}
+>>> print(inter_distribution)
+{}
+```
+
+The keys of the above dictionaries are tuples of the form `(cluster_label, class_label)`, while their values are proportions of
+the total generated samples for the particular class. For example, `(0, 1): 0.7` means that 70% of the samples of class `1` will
+be generated in cluster `0`. Any other distributor can be defined by extending the `BaseDistributor` class.
+
+## Geometric SMOTE algorithm
+
 SMOTE algorithm, as well as any other over-sampling method based on the SMOTE mechanism, generates synthetic samples along line
-segments that join minority class instances. Geometric SMOTE (G-SMOTE) is an enhancement of the SMOTE data generation mechanism.
-G-SMOTE generates synthetic samples in a geometric region of the input space, around each selected minority instance. The
-`GeometricSMOTE` class can be used with multiple classes as well as binary classes classification. It uses a one-vs-rest approach
-by selecting each targeted class and computing the necessary statistics against the rest of the data set which are grouped in a
-single class.
+segments that join minority class instances. Geometric SMOTE[^4] (G-SMOTE) is an enhancement of the SMOTE data generation
+mechanism. G-SMOTE generates synthetic samples in a geometric region of the input space, around each selected minority instance.
+The `GeometricSMOTE` class can be used for multiclass as well as binary classification. It uses a one-vs-rest
+approach by selecting each targeted class and computing the necessary statistics against the rest of the data set which are
+grouped in a single class.
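+
+The geometric data generation process is controlled by the hyperparameters of the `GeometricSMOTE` class. As a minimal sketch,
+with illustrative values only (see the class documentation for the exact meaning of each hyperparameter):
+
+```python
+>>> from imblearn_extra.gsmote import GeometricSMOTE
+>>> gsmote = GeometricSMOTE(truncation_factor=0.5, deformation_factor=0.5, k_neighbors=3)
+```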
 Initially, we generate multi-class imbalanced data represented by the input data `X` and targets `y`:
@@ -27,12 +144,12 @@ Initially, we generate multi-class imbalanced data represented by the input data
 We can use `GeometricSMOTE` to resample the data:
 
 ```python
->>> from gsmote import GeometricSMOTE
+>>> from imblearn_extra.gsmote import GeometricSMOTE
 >>> geometric_smote = GeometricSMOTE()
 >>> X_resampled, y_resampled = geometric_smote.fit_resample(X, y)
 >>> from collections import Counter
 >>> print(sorted(Counter(y_resampled).items()))
-[(0, 80), (1, 80), (2, 80)]
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
 ```
 
 The augmented data set can be used instead of the original data set to train a classifier:
@@ -41,6 +158,7 @@ The augmented data set can be used instead of the original data set to train a c
 >>> from sklearn.tree import DecisionTreeClassifier
 >>> clf = DecisionTreeClassifier()
 >>> clf.fit(X_resampled, y_resampled)
+DecisionTreeClassifier()
 ```
 
 `GeometricSMOTE` can be used also in a machine learning pipeline:
@@ -49,4 +167,31 @@ The augmented data set can be used instead of the original data set to train a c
 from imblearn.pipeline import make_pipeline
 pipeline = make_pipeline(GeometricSMOTE(), DecisionTreeClassifier())
 pipeline.fit(X, y)
+Pipeline(steps=[('geometricsmote', GeometricSMOTE()),
+                ('decisiontreeclassifier', DecisionTreeClassifier())])
 ```
+
+### Compatibility
+
+The API of `imblearn_extra` is fully compatible with [imbalanced-learn]. Particularly for clustering-based oversampling, any
+oversampler from `imblearn_extra.clover` that does not use clustering, i.e. when `clusterer=None`, is equivalent to the
+corresponding [imbalanced-learn] oversampler:
+
+```python
+>>> import numpy as np
+>>> X_res_im, y_res_im = SMOTE(random_state=5).fit_resample(X, y)
+>>> X_res_cl, y_res_cl = ClusterOverSampler(SMOTE(random_state=5), clusterer=None).fit_resample(X, y)
+>>> np.testing.assert_equal(X_res_im, X_res_cl)
+>>> np.testing.assert_equal(y_res_im, y_res_cl)
+```
+
+## References
+
+[^1]: [G. Douzas, F. Bacao, "Self-Organizing Map Oversampling (SOMO) for imbalanced data set learning", Expert Systems with
+    Applications, vol. 82, pp. 40-52, 2017.](https://www.sciencedirect.com/science/article/abs/pii/S0957417417302324)
+[^2]: [G. Douzas, F. Bacao, F. Last, "Improving imbalanced learning through a heuristic oversampling method based on k-means and
+    SMOTE", Information Sciences, vol. 465, pp. 1-20, 2018.](https://www.sciencedirect.com/science/article/abs/pii/S0020025518304997)
+[^3]: [G. Douzas, F. Bacao, F. Last, "G-SOMO: An oversampling approach based on self-organized maps and geometric SMOTE", Expert
+    Systems with Applications, vol. 183, 115230, 2021.](https://www.sciencedirect.com/science/article/abs/pii/S095741742100662X)
+[^4]: [G. Douzas, F. Bacao, F. Last, "Geometric SMOTE a geometrically enhanced drop-in replacement for SMOTE", Information
+    Sciences, vol. 501, pp. 118-135, 2019.](https://www.sciencedirect.com/science/article/abs/pii/S0020025519305353?via%3Dihub)
diff --git a/mkdocs.yml b/mkdocs.yml
index 528ca3f..0483ea7 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,10 +1,10 @@
-site_name: "geometric-smote"
-site_description: "Implementation of the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn."
-site_url: "https://georgedouzas.github.io/geometric-smote" -repo_url: "https://github.com/georgedouzas/geometric-smote" -repo_name: "georgedouzas/geometric-smote" +site_name: "imbalanced-learn-extra" +site_description: "An implementation of novel oversampling algorithms." +site_url: "https://georgedouzas.github.io/imbalanced-learn-extra" +repo_url: "https://github.com/georgedouzas/imbalanced-learn-extra" +repo_name: "georgedouzas/imbalanced-learn-extra" site_dir: "site" -watch: [README.md, CONTRIBUTING.md, CHANGELOG.md, src/gsmote] +watch: [README.md, CONTRIBUTING.md, CHANGELOG.md, src/imblearn_extra] theme: name: material diff --git a/noxfile.py b/noxfile.py index 48aaf68..76c5972 100644 --- a/noxfile.py +++ b/noxfile.py @@ -10,7 +10,7 @@ os.environ.update({'PDM_IGNORE_SAVED_PYTHON': '1'}) -PYTHON_VERSIONS: list[str] = ['3.10', '3.11'] +PYTHON_VERSIONS: list[str] = ['3.10', '3.11', '3.12'] FILES: list[str] = ['src', 'tests', 'docs', 'noxfile.py'] CHANGELOG_ARGS: dict[str, Any] = { 'repository': '.', @@ -80,13 +80,13 @@ def checks(session: nox.Session, file: str) -> None: check_cli(session, ['all', 'quality', 'dependencies', 'types']) session.run('pdm', 'install', '-dG', 'checks', '--no-default', external=True) if session.posargs[0] in ['quality', 'all']: - session.run('ruff', file) + session.run('ruff', 'check', file) if session.posargs[0] in ['types', 'all']: session.run('mypy', file) if session.posargs[0] in ['dependencies', 'all']: requirements_path = (Path(session.create_tmp()) / 'requirements.txt').as_posix() args_groups = [['--prod']] + [['-dG', group] for group in ['tests', 'docs', 'maintenance']] - requirements_types = zip(FILES, args_groups) + requirements_types = zip(FILES, args_groups, strict=True) args = [ 'pdm', 'export', diff --git a/pyproject.toml b/pyproject.toml index 883df29..73beb67 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,13 +3,13 @@ requires = ["pdm-pep517"] build-backend = "pdm.pep517.api" [project] -name = "geometric-smote" -description = "Implementation of the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn." +name = "imbalanced-learn-extra" +description = "An implementation of novel oversampling algorithms." 
authors = [{name = "Georgios Douzas", email = "gdouzas@icloud.com"}] license = "MIT" readme = "README.md" -requires-python = ">=3.10, <3.12" -keywords = [] +requires-python = ">=3.10, <3.13" +keywords = ["machine learning", "imbalanced learning", "oversampling"] dynamic = ["version"] classifiers = [ "Development Status :: 4 - Beta", @@ -19,25 +19,29 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", "Topic :: Documentation", "Topic :: Software Development", "Topic :: Utilities", "Typing :: Typed", ] dependencies = [ + "scipy>=1.7.2", + "numpy>=1.22", "scikit-learn>=1.3.2", "imbalanced-learn>=0.11.0", + "MiniSom>=2.3.2", "typing-extensions>=4.8.0", ] [project.urls] -Homepage = "https://georgedouzas.github.io/geometric-smote" -Documentation = "https://georgedouzas.github.io/geometric-smote" -Changelog = "https://georgedouzas.github.io/geometric-smote/changelog" -Repository = "https://github.com/georgedouzas/geometric-smote" -Issues = "https://github.com/georgedouzas/geometric-smote/issues" -Discussions = "https://github.com/georgedouzas/geometric-smote/discussions" -Gitter = "https://gitter.im/geometric-smote/community" +Homepage = "https://georgedouzas.github.io/imbalanced-learn-extra" +Documentation = "https://georgedouzas.github.io/imbalanced-learn-extra" +Changelog = "https://georgedouzas.github.io/imbalanced-learn-extra/changelog" +Repository = "https://github.com/georgedouzas/imbalanced-learn-extra" +Issues = "https://github.com/georgedouzas/imbalanced-learn-extra/issues" +Discussions = "https://github.com/georgedouzas/imbalanced-learn-extra/discussions" +Gitter = "https://gitter.im/imbalanced-learn-extra/community" Funding = "https://github.com/sponsors/georgedouzas" [tool.pdm] @@ -62,8 +66,8 @@ maintenance = [ "nox>=2022.8.7", ] docs = [ - "mkdocs>=1.3", - "mkdocs-coverage>=0.2", + "mkdocs>=1.4.2", + "mkdocs-coverage>=0.2.6", "mkdocs-gen-files>=0.3", "mkdocs-literate-nav>=0.4", "mkdocs-material>=7.3", @@ -72,14 +76,15 @@ docs = [ "mkdocstrings[python]>=0.20", "markdown-callouts>=0.2", "markdown-exec>=0.5", - 'pandas>=1.5.0', - "matplotlib>=3.8.2", + "pandas>=1.5.3", + "matplotlib>=3.7.1", ] formatting = [ "black>=21.10b0", "docformatter>=1.5.1", ] checks = [ + "numpy>=1.22", "ruff>=0.0.237", "safety>=2", "mypy>=0.910", @@ -105,30 +110,33 @@ extend-exclude = "(tests/fixtures|docs/generated)" skip-string-normalization = true [tool.ruff] -select = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", +fix = true +extend-exclude = ["docs/generated"] +force-exclude = true +line-length = 120 +target-version = "py310" + +[tool.ruff.lint] +fixable = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", "BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", "INP", "PIE", "T20", "PT", "Q", "RET501", "RET502", "RET503", "SIM", "PTH", "PD", "PGH", "PL", "TRY", "RUF", "PLE", "PLR", "PLW", "TRY", "RUF"] -ignore = ["D202", "N806", "N803", "S101", "INP001", "Q000", "TRY002", "PLR0913", "EXE001", "EXE002"] -fix = true -fixable = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", +ignore = ["D202", "N806", "N803", "S101", "INP001", "Q000", "TRY002", "PLR0913", "EXE001", "EXE002", "E741"] +select = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", "BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", "INP", "PIE", "T20", "PT", "Q", "RET501", "RET502", "RET503", "SIM", "PTH", 
"PD", "PGH", "PL", "TRY", "RUF", "PLE", "PLR", "PLW", "TRY", "RUF"] -extend-exclude = ["docs/generated"] -force-exclude = true -line-length = 120 -target-version = "py39" -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "docs/examples/*" = ["ANN", "D", "B018"] "docs/generated/*" = ["ANN", "D"] "test_*" = ["ANN"] -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "google" [tool.mypy] +plugins = "numpy.typing.mypy_plugin" ignore_missing_imports = true exclude = ["tests/fixtures/", "docs/examples/", "docs/generated/"] warn_unused_ignores = true diff --git a/src/imblearn_extra/__init__.py b/src/imblearn_extra/__init__.py new file mode 100644 index 0000000..81d5b80 --- /dev/null +++ b/src/imblearn_extra/__init__.py @@ -0,0 +1,5 @@ +"""Novel oversampling algorithms implementations.""" + +from __future__ import annotations + +__all__: list[str] = [] diff --git a/src/imblearn_extra/clover/__init__.py b/src/imblearn_extra/clover/__init__.py new file mode 100644 index 0000000..c268e8e --- /dev/null +++ b/src/imblearn_extra/clover/__init__.py @@ -0,0 +1,39 @@ +"""A general interface for clustering based over-sampling algorithms. + +[SOMO oversampling algorithm]: +[KMeans-SMOTE oversampling algorithm]: +[G-SOMO oversampling algorithm]: + +The module provides the implementation of an interface for clustering-based over-sampling. It +has two submodules: + +- [`distribution`][imblearn_extra.clover.distribution]: Provides the classes to distrubute the generated samples into +clusters. + + - [`DensityDistributor`][imblearn_extra.clover.distribution.DensityDistributor]: Density based distributor. + +- [`over_sampling`][imblearn_extra.clover.over_sampling]: Provides the clustering-based oversampling algorithms. + + - [`ClusterOverSampler`][imblearn_extra.clover.over_sampling.ClusterOverSampler]: Combinations of oversampler and + clusterer. + - [`KMeansSMOTE`][imblearn_extra.clover.over_sampling.KMeansSMOTE]: [KMeans-SMOTE oversampling algorithm] + oversampling algorithm. + - [`SOMO`][imblearn_extra.clover.over_sampling.SOMO]: [SOMO oversampling algorithm]. + - [`GeometricSOMO`][imblearn_extra.clover.over_sampling.GeometricSOMO]: [G-SOMO oversampling algorithm]. 
+""" + +from __future__ import annotations + +import numpy as np +import numpy.typing as npt + +__all__: list[str] = [] + +InputData = npt.NDArray[np.float64] +Targets = npt.NDArray[np.float64] +Labels = npt.NDArray[np.int16] +Neighbors = npt.NDArray[np.int16] +MultiLabel = tuple[int, int] +IntraDistribution = dict[MultiLabel, float] +InterDistribution = dict[tuple[MultiLabel, MultiLabel], float] +Density = dict[MultiLabel, float] diff --git a/src/imblearn_extra/clover/clusterer/__init__.py b/src/imblearn_extra/clover/clusterer/__init__.py new file mode 100644 index 0000000..14a63e6 --- /dev/null +++ b/src/imblearn_extra/clover/clusterer/__init__.py @@ -0,0 +1,5 @@ +"""Implementation of Self-Organizing Map.""" + +from ._som import SOM, extract_topological_neighbors, generate_labels_mapping + +__all__: list[str] = ['SOM', 'extract_topological_neighbors', 'generate_labels_mapping'] diff --git a/src/imblearn_extra/clover/clusterer/_som.py b/src/imblearn_extra/clover/clusterer/_som.py new file mode 100644 index 0000000..f5afc11 --- /dev/null +++ b/src/imblearn_extra/clover/clusterer/_som.py @@ -0,0 +1,308 @@ +"""Implementation of the Self-Organizing Map (SOM) clusterer.""" + +# Author: Georgios Douzas +# License: BSD 3 clause + +from collections.abc import Callable +from itertools import product +from typing import Any, cast + +import numpy as np +import numpy.typing as npt +from minisom import MiniSom, asymptotic_decay +from sklearn.base import BaseEstimator, ClusterMixin +from sklearn.preprocessing import minmax_scale +from sklearn.utils import check_array, check_random_state +from typing_extensions import Self + + +def generate_labels_mapping(labels_coords: list[tuple[int, int]]) -> dict[tuple[int, int], int]: + """Generate a mapping between grid labels and cluster labels.""" + + # Identify unique grid labels + unique_labels = sorted(set(labels_coords)) + + # Generate mapping + labels_mapping = dict(zip(unique_labels, range(len(unique_labels)), strict=True)) + + return labels_mapping + + +def extract_topological_neighbors( + col: int, + row: int, + topology: str, + n_rows: int, + n_columns: int, + labels_coords_unique: list[tuple[int, int]], +) -> list[tuple[int, int]]: + """Return the topological neighbors of a neuron.""" + + # Return common topological neighbors for the two grid types + topological_neighbors = [ + (col - 1, row), + (col + 1, row), + (col, row - 1), + (col, row + 1), + ] + + # Append extra topological neighbors for hexagonal grid type + if topology == 'hexagonal': + offset = (-1) ** row + topological_neighbors += [ + (col - offset, row - offset), + (col - offset, row + offset), + ] + + # Apply constraints + topological_neighbors = [ + (col, row) + for col, row in topological_neighbors + if 0 <= col < n_columns and 0 <= row < n_rows and (col, row) in labels_coords_unique + ] + + return topological_neighbors + + +class SOM(BaseEstimator, ClusterMixin): + """Class to fit and visualize a Self-Organizing Map (SOM). + + The implementation uses MiniSom from minisom. Read more in the + [user_guide]. + + Args: + n_columns: + The number of columns in the map. + + n_rows: + The number of rows in the map. + + sigma: + Spread of the neighborhood function. + + learning_rate: + Initial learning rate. + + decay_function: + Function that reduces learning_rate and sigma at each iteration. + + neighborhood_function: + Function that weights the neighborhood of a position in the map. + Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'. + + topology: + Topology of the map. 
+            Possible values: 'rectangular', 'hexagonal'.
+
+        activation_distance:
+            Distance used to activate the map. Possible values: 'euclidean',
+            'cosine', 'manhattan', 'chebyshev'.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+            - If int, `random_state` is the seed used by the random number
+            generator.
+            - If `RandomState` instance, random_state is the random number
+            generator.
+            - If `None`, the random number generator is the `RandomState`
+            instance used by `np.random`.
+    """
+
+    def __init__(
+        self: Self,
+        n_columns: int | None = None,
+        n_rows: int | None = None,
+        sigma: float = 1.0,
+        learning_rate: float = 0.5,
+        decay_function: Callable = asymptotic_decay,
+        neighborhood_function: str = 'gaussian',
+        topology: str = 'rectangular',
+        activation_distance: str | Callable = 'euclidean',
+        random_state: np.random.RandomState | int | None = None,
+    ) -> None:
+        self.n_columns = n_columns
+        self.n_rows = n_rows
+        self.sigma = sigma
+        self.learning_rate = learning_rate
+        self.decay_function = decay_function
+        self.neighborhood_function = neighborhood_function
+        self.topology = topology
+        self.activation_distance = activation_distance
+        self.random_state = random_state
+
+    def _generate_neighbors(
+        self: Self,
+        labels_coords_unique: list[tuple[int, int]],
+        labels_mapping: dict[tuple[int, int], int],
+    ) -> npt.NDArray:
+        """Generate pairs of neighboring labels."""
+
+        # Generate grid topological neighbors
+        topological_neighbors = [
+            product(
+                [label_coords],
+                extract_topological_neighbors(
+                    *label_coords,
+                    self.topology,
+                    self.n_rows_,
+                    self.n_columns_,
+                    labels_coords_unique,
+                ),
+            )
+            for label_coords in labels_coords_unique
+        ]
+
+        # Flatten grid topological neighbors
+        topological_neighbors_flat = cast(
+            list[tuple[tuple[int, int], tuple[int, int]]],
+            [pair for pairs in topological_neighbors for pair in pairs],
+        )
+
+        # Generate cluster neighbors
+        all_neighbors = sorted(
+            {(labels_mapping[pair[0]], labels_mapping[pair[1]]) for pair in topological_neighbors_flat},
+        )
+
+        # Keep unique unordered pairs
+        neighbors = []
+        for pair in all_neighbors:
+            if pair not in neighbors and pair[::-1] not in neighbors:
+                neighbors.append(pair)
+
+        return np.array(neighbors)
+
+    def fit(self: Self, X: npt.ArrayLike, y: npt.ArrayLike | None = None, **fit_params: dict[str, Any]) -> Self:
+        """Train the self-organizing map.
+
+        Args:
+            X:
+                Training instances to cluster.
+
+            y:
+                Ignored.
+
+            fit_params:
+                Parameters to pass to the train method of the MiniSom object.
+
+                The following parameters can be used:
+
+                num_iteration:
+                    If `use_epochs` is `False`, the weights will be
+                    updated `num_iteration` times. Otherwise they will be updated
+                    `len(X) * num_iteration` times.
+
+                random_order:
+                    If `True`, samples are picked in random order.
+                    Otherwise the samples are picked sequentially.
+
+                verbose:
+                    If `True`, the status of the training will be
+                    printed each time the weights are updated.
+
+                use_epochs:
+                    If `True`, the SOM will be trained for `num_iteration` epochs.
+                    In one epoch the weights are updated `len(data)` times and
+                    the learning rate is constant throughout a single epoch.
+
+        Returns:
+            The object itself.
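+
+        Examples:
+            A minimal usage sketch (synthetic data; the grid size and seed
+            are arbitrary):
+
+            >>> import numpy as np
+            >>> from imblearn_extra.clover.clusterer import SOM
+            >>> X = np.random.default_rng(0).uniform(size=(100, 2))
+            >>> som = SOM(n_columns=3, n_rows=3, random_state=0).fit(X)
+            >>> som.labels_.shape
+            (100,)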
+        """
+        # Check random state
+        self.random_state_ = check_random_state(self.random_state).randint(low=np.iinfo(np.int32).max)
+
+        # Check and normalize input data
+        X_scaled = minmax_scale(check_array(X, dtype=np.float32))
+
+        # Initialize size
+        n_neurons = 5 * np.sqrt(X_scaled.shape[0])
+        if self.n_rows is None and self.n_columns is None:
+            self.n_rows_ = self.n_columns_ = int(np.ceil(np.sqrt(n_neurons)))
+        elif self.n_rows is None and self.n_columns is not None:
+            self.n_columns_ = self.n_columns
+            self.n_rows_ = int(np.ceil(n_neurons / self.n_columns_))
+        elif self.n_columns is None and self.n_rows is not None:
+            self.n_rows_ = self.n_rows
+            self.n_columns_ = int(np.ceil(n_neurons / self.n_rows_))
+        elif self.n_columns is not None and self.n_rows is not None:
+            self.n_rows_ = self.n_rows
+            self.n_columns_ = self.n_columns
+
+        # Create MiniSom object
+        self.algorithm_ = MiniSom(
+            x=self.n_rows_,
+            y=self.n_columns_,
+            input_len=X_scaled.shape[1],
+            sigma=self.sigma,
+            learning_rate=self.learning_rate,
+            decay_function=self.decay_function,
+            neighborhood_function=self.neighborhood_function,
+            topology=self.topology,
+            activation_distance=self.activation_distance,
+            random_seed=self.random_state_,
+        )
+
+        # Fit MiniSom
+        if 'num_iteration' not in fit_params:
+            fit_params = {**fit_params, 'num_iteration': cast(Any, 1000)}
+        self.algorithm_.train(data=X_scaled, **fit_params)
+
+        # Grid labels
+        labels_coords = [(int(i), int(j)) for i, j in [self.algorithm_.winner(x_scaled) for x_scaled in X_scaled]]
+
+        # Generate labels mapping
+        self.labels_mapping_ = generate_labels_mapping(labels_coords)
+
+        # Generate cluster labels
+        self.labels_ = np.array(
+            [self.labels_mapping_[grid_label] for grid_label in labels_coords],
+        )
+
+        # Generate labels neighbors
+        self.neighbors_ = self._generate_neighbors(
+            sorted(set(labels_coords)),
+            self.labels_mapping_,
+        )
+
+        return self
+
+    def fit_predict(
+        self: Self,
+        X: npt.ArrayLike,
+        y: npt.ArrayLike | None = None,
+        **fit_params: dict[str, Any],
+    ) -> npt.NDArray:
+        """Train the self-organizing map and assign cluster labels to samples.
+
+        Args:
+            X:
+                Training instances to cluster.
+
+            y:
+                Ignored.
+
+            fit_params:
+                Parameters to pass to the train method of the MiniSom object.
+
+                The following parameters can be used:
+
+                num_iteration:
+                    If `use_epochs` is `False`, the weights will be
+                    updated `num_iteration` times. Otherwise they will be updated
+                    `len(X) * num_iteration` times.
+
+                random_order:
+                    If `True`, samples are picked in random order.
+                    Otherwise the samples are picked sequentially.
+
+                verbose:
+                    If `True`, the status of the training will be
+                    printed each time the weights are updated.
+
+                use_epochs:
+                    If `True`, the SOM will be trained for `num_iteration` epochs.
+                    In one epoch the weights are updated `len(data)` times and
+                    the learning rate is constant throughout a single epoch.
+
+        Returns:
+            labels:
+                Index of the cluster each sample belongs to.
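+
+        Examples:
+            A minimal sketch (the map size is inferred from the data; the
+            seed is arbitrary):
+
+            >>> import numpy as np
+            >>> from imblearn_extra.clover.clusterer import SOM
+            >>> X = np.random.default_rng(1).uniform(size=(60, 3))
+            >>> labels = SOM(random_state=1).fit_predict(X)
+            >>> len(labels) == len(X)
+            True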
+        """
+        return self.fit(X=X, y=None, **fit_params).labels_
diff --git a/src/imblearn_extra/clover/distribution/__init__.py b/src/imblearn_extra/clover/distribution/__init__.py
new file mode 100644
index 0000000..86fd023
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/__init__.py
@@ -0,0 +1,5 @@
+"""Distributor classes for clustering-based oversampling."""
+
+from ._density import DensityDistributor
+
+__all__ = ['DensityDistributor']
diff --git a/src/imblearn_extra/clover/distribution/_density.py b/src/imblearn_extra/clover/distribution/_density.py
new file mode 100644
index 0000000..11dfdf2
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/_density.py
@@ -0,0 +1,361 @@
+"""Implementation of the DensityDistributor class."""
+
+# Author: Georgios Douzas
+#         Joao Fonseca
+# License: MIT
+
+from __future__ import annotations
+
+from collections import Counter
+from itertools import product
+from warnings import catch_warnings, filterwarnings
+
+import numpy as np
+from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Labels, Neighbors, Targets
+from .base import BaseDistributor
+
+
+class DensityDistributor(BaseDistributor):
+    """Class to perform density-based distribution.
+
+    Samples are distributed based on the density of clusters.
+
+    Read more in the [user_guide].
+
+    Args:
+        filtering_threshold:
+            The threshold of a filtered cluster. It can be any non-negative number or
+            `'auto'` to be calculated automatically.
+
+            - If `'auto'`, the filtering threshold is calculated from the imbalance
+            ratio of the target for the binary case or the maximum of the target's
+            imbalance ratios for the multiclass case.
+
+            - If `float` then it is manually set to this number. Any cluster that has an
+            imbalance ratio smaller than the filtering threshold is identified as a filtered
+            cluster and can be potentially used to generate minority class instances. Higher
+            values increase the number of filtered clusters.
+
+        distances_exponent:
+            The exponent of the mean distance in the density calculation. It can be
+            any non-negative number or `'auto'` to be calculated automatically.
+
+            - If `'auto'` then it is set equal to the number of
+            features. Higher values make the calculation of density more sensitive
+            to the cluster's size i.e. clusters with large mean euclidean distance
+            between samples are penalized.
+
+            - If `float` then it is manually set to this number.
+
+        sparsity_based:
+            Whether sparse clusters receive more generated samples.
+
+            - When `True` clusters receive generated samples that are inversely
+            proportional to their density.
+
+            - When `False` clusters receive generated samples that are proportional to their density.
+
+        distribution_ratio:
+            The ratio of intra-cluster to inter-cluster generated samples. It is a
+            number in the `[0.0, 1.0]` range. The default value is `1.0`, a
+            case corresponding to only intra-cluster generation. As the number
+            decreases, fewer intra-cluster samples are generated. Inter-cluster
+            generation, i.e. when `distribution_ratio` is less than `1.0`,
+            requires a neighborhood structure for the clusters, i.e. a
+            `neighbors_` attribute should be created after fitting and it will
+            raise an error when it is not found.
+
+    Attributes:
+        clusters_density_ (Density):
+            Each dict key is a multi-label tuple of shape `(cluster_label,
+            class_label)`, while the values correspond to the density.
+
+        distances_exponent_ (float):
+            Actual exponent of the mean distance used in the calculations.
+
+        distribution_ratio_ (float):
+            A copy of the parameter in the constructor.
+
+        filtered_clusters_ (list[MultiLabel]):
+            Each element is a tuple of `(cluster_label, class_label)` pairs.
+
+        filtering_threshold_ (float):
+            Actual filtering threshold used in the calculations.
+
+        inter_distribution_ (InterDistribution):
+            Each dict key is a multi-label tuple of
+            shape `((cluster_label1, cluster_label2), class_label)` while the
+            values are the proportion of samples per class.
+
+        intra_distribution_ (IntraDistribution):
+            Each dict key is a multi-label tuple of shape `(cluster_label,
+            class_label)` while the values are the proportion of samples per class.
+
+        labels_ (Labels):
+            Labels of each sample.
+
+        neighbors_ (Neighbors):
+            An array that contains all neighboring pairs. Each row is
+            a unique neighboring pair.
+
+        majority_class_label_ (int):
+            The majority class label.
+
+        n_samples_ (int):
+            The number of samples.
+
+        sparsity_based_ (bool):
+            A copy of the parameter in the constructor.
+
+        unique_class_labels_ (Labels):
+            An array of unique class labels.
+
+        unique_cluster_labels_ (Labels):
+            An array of unique cluster labels.
+
+    Examples:
+        >>> from imblearn_extra.clover.distribution import DensityDistributor
+        >>> from sklearn.datasets import load_iris
+        >>> from sklearn.cluster import KMeans
+        >>> from imblearn.datasets import make_imbalance
+        >>> X, y = make_imbalance(
+        ...     *load_iris(return_X_y=True),
+        ...     sampling_strategy={0:50, 1:40, 2:30},
+        ...     random_state=0
+        ... )
+        >>> labels = KMeans(random_state=0, n_init='auto').fit_predict(X, y)
+        >>> density_distributor = DensityDistributor().fit(X, y, labels)
+        >>> density_distributor.filtered_clusters_
+        [(6, 1), (0, 1), (3, 1), (7, 1), (5, 2), (2, 2), (3, 2), (6, 2), (0, 2)]
+        >>> density_distributor.intra_distribution_
+        {(6, 1): 0.50604609281056... (0, 1): 0.143311766542165...}
+        >>> density_distributor.inter_distribution_
+        {}
+    """
+
+    def __init__(
+        self: Self,
+        filtering_threshold: float | str = 'auto',
+        distances_exponent: float | str = 'auto',
+        sparsity_based: bool = True,
+        distribution_ratio: float = 1.0,
+    ) -> None:
+        self.filtering_threshold = filtering_threshold
+        self.distances_exponent = distances_exponent
+        self.sparsity_based = sparsity_based
+        self.distribution_ratio = distribution_ratio
+
+    def _check_parameters(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        """Check distributor parameters."""
+
+        # Filtering threshold
+        if self.filtering_threshold == 'auto':
+            counts_vals = Counter(y).values()
+            self.filtering_threshold_ = max(counts_vals) / min(counts_vals)
+        else:
+            self.filtering_threshold_ = check_scalar(
+                self.filtering_threshold,
+                'filtering_threshold',
+                (int, float),
+                min_val=0.0,
+            )
+
+        # Distances exponent
+        if self.distances_exponent == 'auto':
+            self.distances_exponent_ = X.shape[1]
+        else:
+            self.distances_exponent_ = check_scalar(
+                self.distances_exponent,
+                'distances_exponent',
+                (int, float),
+                min_val=0.0,
+            )
+
+        # Sparsity based
+        check_scalar(self.sparsity_based, 'sparsity_based', bool)
+        self.sparsity_based_ = self.sparsity_based
+
+        # Distribution ratio
+        check_scalar(
+            self.distribution_ratio,
+            'distribution_ratio',
+            float,
+            min_val=0.0,
+            max_val=1.0,
+        )
+        max_distribution_ratio = 1.0
+        if self.distribution_ratio < max_distribution_ratio and neighbors is None:
+            msg = 'Parameter `distribution_ratio` should be equal to 1.0 when the `neighbors` parameter is `None`.'
+            raise ValueError(msg)
+        self.distribution_ratio_ = self.distribution_ratio
+        return self
+
+    def _identify_filtered_clusters(self: Self, y: Targets) -> Self:
+        """Identify the filtered clusters."""
+        # Generate multi-labels
+        multi_labels = list(zip(self.labels_, y, strict=True))
+
+        # Count multi-labels
+        multi_labels_counts = Counter(multi_labels)
+
+        # Extract multi-labels of the non-majority classes
+        unique_multi_labels = [
+            multi_label for multi_label in multi_labels_counts if multi_label[1] not in self.majority_class_labels_
+        ]
+
+        # Identify filtered clusters
+        self.filtered_clusters_ = []
+        for multi_label in unique_multi_labels:
+            n_minority_samples = multi_labels_counts[multi_label]
+            n_majority_samples = multi_labels_counts[(multi_label[0], self.majority_class_labels_[0])]
+            if n_majority_samples <= n_minority_samples * self.filtering_threshold_:
+                self.filtered_clusters_.append(multi_label)
+
+        return self
+
+    def _calculate_clusters_density(self: Self, X: InputData, y: Targets) -> Self:
+        """Calculate the density of the filtered clusters."""
+        self.clusters_density_ = {}
+
+        # Calculate density
+        finite_densities = []
+        for cluster_label, class_label in self.filtered_clusters_:
+            # Calculate the number of minority class samples in the cluster
+            mask = (self.labels_ == cluster_label) & (y == class_label)
+            n_minority_samples = mask.sum()
+
+            # Calculate density
+            n_minority_pairs = (n_minority_samples - 1) * n_minority_samples if n_minority_samples > 1 else 1
+            mean_distances = euclidean_distances(X[mask]).sum() / n_minority_pairs
+            with catch_warnings():
+                filterwarnings('ignore')
+                density = n_minority_samples / (mean_distances**self.distances_exponent_)
+            if np.isfinite(density):
+                finite_densities.append(density)
+            self.clusters_density_[(cluster_label, class_label)] = density
+
+        # Convert zero and infinite densities
+        min_density = 0.0
+        if min_density in self.clusters_density_.values():
+            self.clusters_density_ = {
+                multi_label: 1.0 for multi_label, density in self.clusters_density_.items() if density == min_density
+            }
+            self.filtered_clusters_ = [
+                multi_label for multi_label in self.filtered_clusters_ if multi_label in self.clusters_density_
+            ]
+        else:
+            max_density = max(finite_densities) if finite_densities else 1.0
+            self.clusters_density_ = {
+                multi_label: (max_density if np.isinf(density) else density)
+                for multi_label, density in self.clusters_density_.items()
+            }
+        return self
+
+    def _intra_distribute(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        """Intra-cluster distribution.
+
+        Distribute the generated samples in each cluster based on their
+        density.
+        """
+
+        # Calculate weights based on density
+        weights = {
+            multi_label: (1 / density if self.sparsity_based_ else density)
+            for multi_label, density in self.clusters_density_.items()
+        }
+
+        # Calculate normalization factors
+        class_labels = {class_label for _, class_label in self.filtered_clusters_}
+        normalization_factors = {class_label: 0.0 for class_label in class_labels}
+        for (_, class_label), weight in weights.items():
+            normalization_factors[class_label] += weight
+
+        # Intra distribution
+        self.intra_distribution_ = {
+            multi_label: (self.distribution_ratio_ * weight / normalization_factors[multi_label[1]])
+            for multi_label, weight in weights.items()
+        }
+
+        return self
+
+    def _inter_distribute(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        """Inter-cluster distribution.
+
+        Distribute the generated samples between clusters based on their
+        density.
+        """
+
+        # Identify filtered neighboring clusters
+        filtered_neighbors = []
+        class_labels = {class_label for _, class_label in self.filtered_clusters_}
+        for pair, class_label in product(self.neighbors_, class_labels):
+            multi_label0 = (pair[0], class_label)
+            multi_label1 = (pair[1], class_label)
+            if multi_label0 in self.filtered_clusters_ and multi_label1 in self.filtered_clusters_:
+                filtered_neighbors.append((multi_label0, multi_label1))
+
+        # Calculate inter-cluster density
+        inter_clusters_density = {
+            multi_labels: (self.clusters_density_[multi_labels[0]] + self.clusters_density_[multi_labels[1]])
+            for multi_labels in filtered_neighbors
+        }
+
+        # Calculate weights based on density
+        weights = {
+            multi_labels: (1 / density if self.sparsity_based_ else density)
+            for multi_labels, density in inter_clusters_density.items()
+        }
+
+        # Calculate normalization factors
+        normalization_factors = {class_label: 0.0 for class_label in class_labels}
+        for multi_labels, weight in weights.items():
+            normalization_factors[multi_labels[0][1]] += weight
+
+        # Inter distribution
+        self.inter_distribution_ = {
+            multi_labels: ((1 - self.distribution_ratio_) * weight / normalization_factors[multi_labels[0][1]])
+            for multi_labels, weight in weights.items()
+        }
+
+        return self
+
+    def _fit(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        # Check distributor parameters
+        self._check_parameters(X, y, neighbors)
+
+        # Identify filtered clusters
+        self._identify_filtered_clusters(y)
+
+        # Calculate density of filtered clusters
+        self._calculate_clusters_density(X, y)
+
+        super()._fit(X, y, labels, neighbors)
+
+        return self
diff --git a/src/imblearn_extra/clover/distribution/base.py b/src/imblearn_extra/clover/distribution/base.py
new file mode 100644
index 0000000..8524648
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/base.py
@@ -0,0 +1,187 @@
+"""Base class for distributors."""
+
+# Author: Georgios Douzas
+#         Joao Fonseca
+# License: MIT
+
+from __future__ import annotations
+
+from collections import Counter
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_array, check_X_y
+from typing_extensions import Self
+
+from .. import InputData, InterDistribution, IntraDistribution, Labels, Neighbors, Targets
+
+
+class BaseDistributor(BaseEstimator):
+    """The base class for distributors.
+
+    A distributor sets the proportion of samples to be generated inside
+    each cluster and between clusters. Warning: This class should not be
+    used directly. Use the derived classes instead.
+    """
+
+    def _intra_distribute(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        return self
+
+    def _inter_distribute(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        return self
+
+    def _validate_fitting(self: Self) -> Self:
+        # Check labels
+        if len(self.labels_) != self.n_samples_:
+            msg = (
+                f'Number of labels should be equal to the number of samples. '
+                f'Got {len(self.labels_)} and {self.n_samples_} instead.'
+            )
+            raise ValueError(msg)
+
+        # Check neighbors
+        if not set(self.labels_).issuperset(self.neighbors_.flatten()):
+            msg = 'Attribute `neighbors_` contains unknown labels.'
+            raise ValueError(msg)
+        unique_neighbors = {tuple(set(pair)) for pair in self.neighbors_}
+        if len(unique_neighbors) < len(self.neighbors_):
+            msg = 'Elements of `neighbors_` attribute are not unique.'
+            raise ValueError(msg)
+
+        # Check distribution
+        proportions = {
+            class_label: 0.0
+            for class_label in self.unique_class_labels_
+            if class_label not in self.majority_class_labels_
+        }
+        for (_, class_label), proportion in self.intra_distribution_.items():
+            proportions[class_label] += proportion
+        for (
+            ((cluster_label1, class_label1), (cluster_label2, class_label2)),
+            proportion,
+        ) in self.inter_distribution_.items():
+            if class_label1 != class_label2:
+                multi_label = (
+                    (cluster_label1, class_label1),
+                    (cluster_label2, class_label2),
+                )
+                msg = (
+                    'Multi-labels for neighboring cluster pairs should '
+                    f'have a common class label. Got {multi_label} instead.'
+                )
+                raise ValueError(msg)
+            proportions[class_label1] += proportion
+        if not all(np.isclose(val, 0) or np.isclose(val, 1) for val in proportions.values()):
+            msg = (
+                'Intra-distribution and inter-distribution sum of proportions for each '
+                f'class label should be either equal to 0 or 1. Got {proportions} instead.'
+            )
+            raise ValueError(msg)
+
+        return self
+
+    def _fit(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> Self:
+        if labels is not None:
+            self._intra_distribute(X, y, labels, neighbors)
+        if neighbors is not None:
+            self._inter_distribute(X, y, labels, neighbors)
+        return self
+
+    def fit(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None = None,
+        neighbors: Neighbors | None = None,
+    ) -> Self:
+        """Generate the intra-label and inter-label distribution.
+
+        Args:
+            X:
+                Matrix containing the data which have to be sampled.
+
+            y:
+                Corresponding label for each sample in X.
+
+            labels:
+                Labels of each sample.
+
+            neighbors:
+                An array that contains all neighboring pairs. Each row is
+                a unique neighboring pair.
+
+        Returns:
+            The object itself.
+        """
+        # Check data
+        X, y = check_X_y(X, y, dtype=None)
+
+        # Set statistics
+        counts = Counter(y)
+        self.majority_class_labels_ = [
+            class_label
+            for class_label, class_label_count in counts.items()
+            if class_label_count == max(counts.values())
+        ]
+        self.unique_cluster_labels_ = np.unique(labels) if labels is not None else np.array(0, dtype=int)
+        self.unique_class_labels_ = np.unique(y)
+        self.n_samples_ = len(X)
+
+        # Set default attributes
+        self.labels_ = np.repeat(0, len(X)) if labels is None else check_array(labels, ensure_2d=False)
+        self.neighbors_ = np.empty((0, 2), dtype=int) if neighbors is None else check_array(neighbors, ensure_2d=False)
+        self.intra_distribution_: IntraDistribution = {
+            (0, class_label): 1.0 for class_label in np.unique(y) if class_label not in self.majority_class_labels_
+        }
+        self.inter_distribution_: InterDistribution = {}
+
+        # Fit distributor
+        self._fit(X, y, labels, neighbors)
+
+        # Validate fitting procedure
+        self._validate_fitting()
+
+        return self
+
+    def fit_distribute(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        labels: Labels | None,
+        neighbors: Neighbors | None,
+    ) -> tuple[IntraDistribution, InterDistribution]:
+        """Return the intra-label and inter-label distribution.
+
+        Args:
+            X:
+                Matrix containing the data which have to be sampled.
+
+            y:
+                Corresponding label for each sample in X.
+
+            labels:
+                Labels of each sample.
+
+            neighbors:
+                An array that contains all neighboring pairs. Each row is
+                a unique neighboring pair.
+
+        Returns:
+            distributions:
+                A tuple with the two distributions.
+        """
+        self.fit(X, y, labels, neighbors)
+        return self.intra_distribution_, self.inter_distribution_
diff --git a/src/imblearn_extra/clover/over_sampling/__init__.py b/src/imblearn_extra/clover/over_sampling/__init__.py
new file mode 100644
index 0000000..c68bffc
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/__init__.py
@@ -0,0 +1,29 @@
+"""This module includes classes for clustering-based oversampling.
+
+A general class for clustering-based oversampling as well as specific
+clustering-based oversamplers are provided.
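+
+A minimal usage sketch of the general pattern (the SMOTE/KMeans combination
+is only an assumed example; any compatible oversampler and clusterer works):
+
+    >>> from imblearn.over_sampling import SMOTE
+    >>> from sklearn.cluster import KMeans
+    >>> from imblearn_extra.clover.over_sampling import ClusterOverSampler
+    >>> clover_smote = ClusterOverSampler(SMOTE(random_state=0), KMeans(n_init='auto'))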
+"""
+
+from ._cluster import (
+    ClusterOverSampler,
+    clone_modify,
+    extract_inter_data,
+    extract_intra_data,
+    generate_in_cluster,
+    modify_nn,
+)
+from ._gsomo import GeometricSOMO
+from ._kmeans_smote import KMeansSMOTE
+from ._somo import SOMO
+
+__all__: list[str] = [
+    'ClusterOverSampler',
+    'KMeansSMOTE',
+    'SOMO',
+    'GeometricSOMO',
+    'modify_nn',
+    'clone_modify',
+    'extract_inter_data',
+    'extract_intra_data',
+    'generate_in_cluster',
+]
diff --git a/src/imblearn_extra/clover/over_sampling/_cluster.py b/src/imblearn_extra/clover/over_sampling/_cluster.py
new file mode 100644
index 0000000..eab21b5
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_cluster.py
@@ -0,0 +1,571 @@
+"""Implementation of the main class for clustering-based oversampling."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+import warnings
+from collections import Counter, OrderedDict
+
+import numpy as np
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling.base import BaseOverSampler
+from imblearn.pipeline import Pipeline
+from imblearn.utils import check_sampling_strategy
+from imblearn.utils._validation import ArraysTransformer
+from joblib import Parallel, delayed
+from sklearn.base import ClusterMixin, TransformerMixin, clone
+from sklearn.exceptions import FitFailedWarning
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import label_binarize
+from sklearn.utils import check_random_state
+from sklearn.utils.multiclass import check_classification_targets
+from typing_extensions import Self
+
+from .. import InputData, InterDistribution, IntraDistribution, Labels, Targets
+from ..distribution import DensityDistributor
+from ..distribution.base import BaseDistributor
+
+
+def modify_nn(n_neighbors: NearestNeighbors | int, n_samples: int) -> NearestNeighbors | int:
+    """Modify the nearest neighbors object.
+
+    Args:
+        n_neighbors:
+            The `NearestNeighbors` object or number.
+
+        n_samples:
+            The number of samples.
+
+    Returns:
+        The modified `NearestNeighbors` object or number.
+    """
+    if isinstance(n_neighbors, NearestNeighbors):
+        n_neighbors = (
+            clone(n_neighbors).set_params(n_neighbors=n_samples - 1)
+            if n_neighbors.n_neighbors >= n_samples
+            else clone(n_neighbors)
+        )
+    elif isinstance(n_neighbors, int) and n_neighbors >= n_samples:
+        n_neighbors = n_samples - 1
+    return n_neighbors
+
+
+def clone_modify(oversampler: BaseOverSampler, class_label: int, y_in_cluster: Targets) -> BaseOverSampler:
+    """Clone and modify attributes of oversampler for corner cases.
+
+    Args:
+        oversampler:
+            The oversampler to modify its attributes.
+
+        class_label:
+            The class label.
+
+        y_in_cluster:
+            The data of the target in the cluster.
+
+    Returns:
+        A cloned oversampler with modified number of nearest neighbors.
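+
+    Examples:
+        A sketch of the single-minority-sample corner case, where
+        neighbors-based oversampling cannot be applied and the clone falls
+        back to random oversampling:
+
+        >>> import numpy as np
+        >>> from imblearn.over_sampling import SMOTE, RandomOverSampler
+        >>> y_in_cluster = np.array([0, 0, 0, 1])
+        >>> isinstance(clone_modify(SMOTE(), 1, y_in_cluster), RandomOverSampler)
+        True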
+    """
+    # Clone oversampler
+    oversampler = clone(oversampler)
+
+    # Case where no attributes need to be modified
+    if isinstance(oversampler, RandomOverSampler):
+        return oversampler
+
+    # Select and modify oversampler
+    n_minority_samples = Counter(y_in_cluster)[class_label]
+    if n_minority_samples == 1:
+        oversampler = RandomOverSampler()
+    else:
+        if hasattr(oversampler, 'k_neighbors'):
+            oversampler.k_neighbors = modify_nn(oversampler.k_neighbors, n_minority_samples)
+        if hasattr(oversampler, 'm_neighbors'):
+            oversampler.m_neighbors = modify_nn(oversampler.m_neighbors, y_in_cluster.size)
+        if hasattr(oversampler, 'n_neighbors'):
+            oversampler.n_neighbors = modify_nn(oversampler.n_neighbors, n_minority_samples)
+    return oversampler
+
+
+def extract_intra_data(
+    X: InputData,
+    y: Targets,
+    cluster_labels: Labels,
+    intra_distribution: IntraDistribution,
+    sampling_strategy: OrderedDict[int, int],
+) -> list[tuple[dict[int, int], InputData, Targets]]:
+    """Extract data for each filtered cluster.
+
+    Args:
+        X:
+            The input data.
+
+        y:
+            The targets.
+
+        cluster_labels:
+            The cluster labels.
+
+        intra_distribution:
+            The intra-clusters distributions.
+
+        sampling_strategy:
+            The sampling strategy to follow.
+
+    Returns:
+        The intra-clusters data.
+    """
+    majority_class_label = Counter(y).most_common()[0][0]
+
+    # Get offsets
+    selected_multi_labels = []
+    classes_labels = {class_label for _, class_label in intra_distribution}
+    distribution_value_tie = 0.5
+    for selected_class_label in classes_labels:
+        intra_distribution_class_label = {
+            (cluster_label, class_label): proportion
+            for (cluster_label, class_label), proportion in intra_distribution.items()
+            if class_label == selected_class_label
+        }
+        selected_multi_label = max(
+            intra_distribution_class_label,
+            key=lambda multi_label: intra_distribution_class_label[multi_label],
+        )
+        if intra_distribution_class_label[selected_multi_label] <= distribution_value_tie:
+            selected_multi_labels.append(selected_multi_label)
+
+    # Get clusters data
+    clusters_data = []
+    for (cluster_label, class_label), proportion in intra_distribution.items():
+        mask = (cluster_labels == cluster_label) & (np.isin(y, [majority_class_label, class_label]))
+        offset = int((cluster_label, class_label) in selected_multi_labels)
+        n_minority_samples = int(round(sampling_strategy[class_label] * proportion)) + offset
+        X_in_cluster, y_in_cluster = X[mask], y[mask]
+        cluster_sampling_strategy = {class_label: n_minority_samples}
+        if n_minority_samples > 0:
+            clusters_data.append((cluster_sampling_strategy, X_in_cluster, y_in_cluster))
+    return clusters_data
+
+
+def extract_inter_data(
+    X: InputData,
+    y: Targets,
+    cluster_labels: Labels,
+    inter_distribution: InterDistribution,
+    sampling_strategy: OrderedDict[int, int],
+    random_state: np.random.RandomState,
+) -> list[tuple[dict[int, int], InputData, Targets]]:
+    """Extract data between filtered clusters.
+
+    Args:
+        X:
+            The input data.
+
+        y:
+            The targets.
+
+        cluster_labels:
+            The cluster labels.
+
+        inter_distribution:
+            The inter-clusters distributions.
+
+        sampling_strategy:
+            The sampling strategy to follow.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+    Returns:
+        The inter-clusters data.
+    """
+    majority_class_label = Counter(y).most_common()[0][0]
+    clusters_data = []
+    for (
+        ((cluster_label1, class_label1), (cluster_label2, class_label2)),
+        proportion,
+    ) in inter_distribution.items():
+        mask1 = (cluster_labels == cluster_label1) & (np.isin(y, [majority_class_label, class_label1]))
+        mask2 = (cluster_labels == cluster_label2) & (np.isin(y, [majority_class_label, class_label2]))
+        X1, X2, y1, y2 = X[mask1], X[mask2], y[mask1], y[mask2]
+        majority_mask1, majority_mask2 = (
+            (y1 == majority_class_label),
+            (y2 == majority_class_label),
+        )
+        n_minority_samples = int(round(sampling_strategy[class_label1] * proportion))
+        for _ in range(n_minority_samples):
+            ind1, ind2 = (
+                random_state.randint(0, (~majority_mask1).sum()),
+                random_state.randint(0, (~majority_mask2).sum()),
+            )
+            X_in_clusters = np.vstack(
+                (
+                    X1[~majority_mask1][ind1].reshape(1, -1),
+                    X2[~majority_mask2][ind2].reshape(1, -1),
+                    X1[majority_mask1],
+                    X2[majority_mask2],
+                ),
+            )
+            y_in_clusters = np.hstack(
+                (
+                    y1[~majority_mask1][ind1],
+                    y2[~majority_mask2][ind2],
+                    y1[majority_mask1],
+                    y2[majority_mask2],
+                ),
+            )
+            clusters_sampling_strategy = {class_label1: 1}
+            clusters_data.append((clusters_sampling_strategy, X_in_clusters, y_in_clusters))
+    return clusters_data
+
+
+def generate_in_cluster(
+    oversampler: BaseOverSampler,
+    transformer: TransformerMixin,
+    cluster_sampling_strategy: dict[int, int],
+    X_in_cluster: InputData,
+    y_in_cluster: Targets,
+) -> tuple[InputData, Targets]:
+    """Generate intra-cluster or inter-cluster new samples.
+
+    Args:
+        oversampler:
+            Oversampler to apply to each selected cluster.
+
+        transformer:
+            Transformer to apply before oversampling.
+
+        cluster_sampling_strategy:
+            The sampling strategy in the cluster.
+
+        X_in_cluster:
+            The input data in the cluster.
+
+        y_in_cluster:
+            The targets in the cluster.
+
+    Returns:
+        X_new:
+            The generated data.
+
+        y_new:
+            The corresponding label of resampled data.
+    """
+
+    # Create oversampler for specific cluster and class
+    class_label = next(iter(cluster_sampling_strategy.keys()))
+    oversampler = clone_modify(oversampler, class_label, y_in_cluster)
+    oversampler.sampling_strategy_ = cluster_sampling_strategy
+    oversampler.n_features_in_ = X_in_cluster.shape[1]
+
+    # Resample cluster and class data
+    X_res, y_res = oversampler._fit_resample(
+        transformer.transform(X_in_cluster) if transformer is not None else X_in_cluster,
+        y_in_cluster,
+    )
+
+    # Filter only new data
+    X_new, y_new = X_res[len(X_in_cluster) :], y_res[len(y_in_cluster) :]
+
+    return X_new, y_new
+
+
+class ClusterOverSampler(BaseOverSampler):
+    """A class that handles clustering-based oversampling.
+
+    Any combination of oversampler, clusterer and distributor can
+    be used.
+
+    Read more in the [user_guide].
+
+    Args:
+        oversampler:
+            Oversampler to apply to each selected cluster.
+
+        clusterer:
+            Clusterer to apply to input space before oversampling.
+
+            - When `None`, it corresponds to a clusterer that assigns
+            a single cluster to all the samples, equivalent to no clustering.
+
+            - When clusterer is given, it applies clustering to the input space. Then
+            oversampling is applied inside each cluster and between clusters.
+
+        distributor:
+            Distributor to distribute the generated samples per cluster label.
+
+            - When `None` and a clusterer is provided then it corresponds to the
+            density distributor. If clusterer is also `None` then the distributor
+            does not affect the over-sampling procedure.
+
+            - When distributor object is provided, it is used to distribute the
+            generated samples to the clusters.
+
+        raise_error:
+            Raise an error when no samples are generated.
+
+            - If `True`, it raises an error when no filtered clusters are
+            identified and therefore no samples are generated.
+
+            - If `False`, it displays a warning.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+            - If `int`, it is the seed used by the random number
+            generator.
+            - If `np.random.RandomState` instance, it is the random number
+            generator.
+            - If `None`, the random number generator is the `RandomState`
+            instance used by `np.random`.
+
+        n_jobs:
+            Number of CPU cores used.
+
+            - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+    Attributes:
+        oversampler_ (imblearn.over_sampling.base.BaseOverSampler):
+            A fitted clone of the `oversampler` parameter.
+
+        clusterer_ (sklearn.base.ClusterMixin):
+            A fitted clone of the `clusterer` parameter or `None` when a
+            clusterer is not given.
+
+        distributor_ (imblearn_extra.clover.distribution.base.BaseDistributor):
+            A fitted clone of the `distributor` parameter or a fitted instance of
+            the `DensityDistributor` when a distributor is not given.
+
+        labels_ (Labels):
+            Cluster labels of each sample.
+
+        neighbors_ (Neighbors):
+            An array that contains all neighboring pairs with each row being
+            a unique neighboring pair. It is `None` when the clusterer does not
+            support this attribute.
+
+        random_state_ (np.random.RandomState):
+            An instance of `np.random.RandomState` class.
+
+        sampling_strategy_ (dict[int, int]):
+            Actual sampling strategy.
+
+    Examples:
+        >>> from collections import Counter
+        >>> from imblearn_extra.clover.over_sampling import ClusterOverSampler
+        >>> from sklearn.datasets import make_classification
+        >>> from sklearn.cluster import KMeans
+        >>> from imblearn.over_sampling import SMOTE
+        >>> X, y = make_classification(random_state=0, n_classes=2, weights=[0.9, 0.1])
+        >>> print('Original dataset shape %s' % Counter(y))
+        Original dataset shape Counter({0: 90, 1: 10})
+        >>> cluster_oversampler = ClusterOverSampler(
+        ...     oversampler=SMOTE(random_state=5),
+        ...     clusterer=KMeans(random_state=10, n_init='auto'))
+        >>> X_res, y_res = cluster_oversampler.fit_resample(X, y)
+        >>> print('Resampled dataset shape %s' % Counter(y_res))
+        Resampled dataset shape Counter({0: 90, 1: 90})
+    """
+
+    def __init__(
+        self: Self,
+        oversampler: BaseOverSampler,
+        clusterer: ClusterMixin | None = None,
+        distributor: BaseDistributor | None = None,
+        raise_error: bool = True,
+        random_state: np.random.RandomState | int | None = None,
+        n_jobs: int | None = None,
+    ) -> None:
+        self.oversampler = oversampler
+        self.clusterer = clusterer
+        self.distributor = distributor
+        self.raise_error = raise_error
+        self.random_state = random_state
+        self.n_jobs = n_jobs
+
+    def fit(self: Self, X: InputData, y: Targets) -> Self:
+        """Check inputs and statistics of the sampler.
+
+        You should use `fit_resample` to generate the synthetic data.
+
+        Args:
+            X:
+                Data array.
+
+            y:
+                Target array.
+
+        Returns:
+            self:
+                Return the instance itself.
+        """
+        X, y, _ = self._check_X_y(X, y)
+        self._check(X, y)
+        return self
+
+    def fit_resample(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        **fit_params: dict[str, str],
+    ) -> tuple[InputData, Targets]:
+        """Resample the dataset.
+
+        Args:
+            X:
+                Matrix containing the data which have to be sampled.
+
+            y:
+                Corresponding label for each sample in X.
+
+            fit_params:
+                Parameters passed to the fit method of the clusterer.
+
+        Returns:
+            X_resampled:
+                The array containing the resampled data.
+
+            y_resampled:
+                The corresponding label of resampled data.
+        """
+        check_classification_targets(y)
+        arrays_transformer = ArraysTransformer(X, y)
+        X, y, binarize_y = self._check_X_y(X, y)
+
+        self._check(X, y)._fit(X, y, **fit_params)
+
+        output = self._fit_resample(X, y)
+
+        y_ = label_binarize(y=output[1], classes=np.unique(y)) if binarize_y else output[1]
+
+        X_, y_ = arrays_transformer.transform(output[0], y_)
+        return (X_, y_)
+
+    def _cluster_sample(
+        self: Self,
+        clusters_data: list[tuple[dict[int, int], InputData, Targets]],
+        X: InputData,
+        y: Targets,
+    ) -> tuple[InputData, Targets] | None:
+        generated_data = Parallel(n_jobs=self.n_jobs)(
+            delayed(generate_in_cluster)(self.oversampler_, self.transformer_, *data) for data in clusters_data
+        )
+        if generated_data:
+            X, y = (np.concatenate(data) for data in zip(*generated_data, strict=True))
+            return X, y
+        return None
+
+    def _intra_sample(self: Self, X: InputData, y: Targets) -> tuple[InputData, Targets] | None:
+        clusters_data = extract_intra_data(
+            X,
+            y,
+            self.labels_,
+            self.distributor_.intra_distribution_,
+            self.sampling_strategy_,
+        )
+        return self._cluster_sample(clusters_data, X, y)
+
+    def _inter_sample(self: Self, X: InputData, y: Targets) -> tuple[InputData, Targets] | None:
+        clusters_data = extract_inter_data(
+            X,
+            y,
+            self.labels_,
+            self.distributor_.inter_distribution_,
+            self.sampling_strategy_,
+            self.random_state_,
+        )
+        return self._cluster_sample(clusters_data, X, y)
+
+    def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+        # Check transformer and oversampler
+        if isinstance(self.oversampler, Pipeline):
+            if self.oversampler.steps[:-1]:
+                self.transformer_ = Pipeline(self.oversampler.steps[:-1]).fit(X)
+            self.oversampler_ = clone(self.oversampler.steps[-1][-1])
+        else:
+            self.oversampler_ = clone(self.oversampler)
+
+        # Check clusterer and distributor
+        if self.clusterer is None and self.distributor is not None:
+            msg = (
+                'Distributor was found but clusterer is set to `None`. '
+                'Either set parameter `distributor` to `None` or use a clusterer.'
+            )
+            raise ValueError(msg)
+        elif self.clusterer is None and self.distributor is None:
+            self.clusterer_ = None
+            self.distributor_ = BaseDistributor()
+        else:
+            self.clusterer_ = clone(self.clusterer)
+            self.distributor_ = DensityDistributor() if self.distributor is None else clone(self.distributor)
+        return self
+
+    def _check_sampling_strategy(self: Self, y: Targets) -> Self:
+        self.sampling_strategy_ = check_sampling_strategy(
+            self.oversampler_.sampling_strategy,
+            y,
+            self._sampling_type,
+        )
+        return self
+
+    def _check(self: Self, X: InputData, y: Targets) -> Self:
+        # Check random state
+        self.random_state_ = check_random_state(self.random_state)
+
+        # Check transformer
+        self.transformer_ = None
+
+        # Check estimators and sampling strategy
+        self._check_estimators(X, y)._check_sampling_strategy(y)
+
+        return self
+
+    def _fit(self: Self, X: InputData, y: Targets, **fit_params: dict[str, str]) -> Self:
+        # Fit clusterer
+        if self.clusterer_ is not None:
+            self.clusterer_.fit(X, y, **fit_params)
+
+        # Extract labels and neighbors
+        self.labels_ = getattr(self.clusterer_, 'labels_', np.zeros(len(X), dtype=int))
+        self.neighbors_ = getattr(self.clusterer_, 'neighbors_', None)
+
+        # Fit distributor
+        self.distributor_.fit(X, y, labels=self.labels_, neighbors=self.neighbors_)
+
+        # Case when no samples are generated
+        if not self.distributor_.intra_distribution_ and not self.distributor_.inter_distribution_:
+            msg = 'No samples were generated. Try to modify the parameters of the clusterer or distributor.'
+
+            # Raise error
+            if self.raise_error:
+                raise ValueError(msg)
+
+            # Display warning
+            else:
+                warnings.warn(msg, FitFailedWarning, stacklevel=1)
+
+        return self
+
+    def _fit_resample(
+        self: Self,
+        X: InputData,
+        y: Targets,
+        **fit_params: dict[str, str],
+    ) -> tuple[InputData, Targets]:
+        # Intracluster oversampling
+        data_intra = self._intra_sample(X, y)
+        if data_intra is not None:
+            X_intra_new, y_intra_new = data_intra
+        else:
+            X_intra_new, y_intra_new = None, None
+        intra_count: Counter = Counter(y_intra_new)
+
+        # Intercluster oversampling
+        data_inter = self._inter_sample(X, y)
+        if data_inter is not None:
+            X_inter_new, y_inter_new = data_inter
+        else:
+            X_inter_new, y_inter_new = None, None
+        inter_count: Counter = Counter(y_inter_new)
+
+        # Set sampling strategy
+        self.sampling_strategy_ = OrderedDict({})
+        for class_label in set(intra_count.keys()).union(inter_count.keys()):
+            self.sampling_strategy_[class_label] = intra_count.get(class_label, 0) + inter_count.get(class_label, 0)
+
+        # Stack resampled data
+        X_resampled_unstacked = [
+            self.transformer_.transform(X) if self.transformer_ is not None else X,
+            X_intra_new,
+            X_inter_new,
+        ]
+        y_resampled_unstacked = [y, y_intra_new, y_inter_new]
+        X_resampled, y_resampled = (
+            np.vstack([X for X in X_resampled_unstacked if X is not None]),
+            np.hstack([y for y in y_resampled_unstacked if y is not None]),
+        )
+
+        return X_resampled, y_resampled
diff --git a/src/imblearn_extra/clover/over_sampling/_gsomo.py b/src/imblearn_extra/clover/over_sampling/_gsomo.py
new file mode 100644
index 0000000..b0553bc
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_gsomo.py
@@ -0,0 +1,263 @@
+"""Includes the implementation of Geometric SOMO (G-SOMO)."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from math import sqrt
+
+import numpy as np
+from sklearn.base import clone
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from ...gsmote import GeometricSMOTE
+from .. import InputData, Targets
+from ..clusterer import SOM
+from ..distribution import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class GeometricSOMO(ClusterOverSampler):
+    """Geometric SOMO algorithm.
+
+    Applies the SOM algorithm to the input space before applying Geometric
+    SMOTE. Read more in the [user_guide].
+
+    Args:
+        sampling_strategy:
+            Sampling information to resample the data set.
+
+            - When `float`, it corresponds to the desired ratio of the number of
+            samples in the minority class over the number of samples in the
+            majority class after resampling. It is only available for binary
+            classification.
+
+            - When `str`, specify the class targeted by the resampling. The
+            number of samples in the different classes will be equalized.
+            Possible choices are:
+                - `'minority'`: resample only the minority class.
+                - `'not minority'`: resample all classes but the minority class.
+                - `'not majority'`: resample all classes but the majority class.
+                - `'all'`: resample all classes.
+                - `'auto'`: equivalent to `'not majority'`.
+
+            - When `dict`, the keys correspond to the targeted classes. The
+            values correspond to the desired number of samples for each targeted
+            class.
+
+            - When callable, function taking `y` and returns a `dict`. The keys
+            correspond to the targeted classes. The values correspond to the
+            desired number of samples for each class.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+            - If `int`, it is the seed used by the random number
+            generator.
+            - If `np.random.RandomState` instance, it is the random number
+            generator.
+            - If `None`, the random number generator is the `RandomState`
+            instance used by `np.random`.
+
+        k_neighbors:
+            Defines the number of nearest neighbors to be used by SMOTE.
+
+            - If `int`, this number is used to construct synthetic
+            samples.
+
+            - If `object`, an estimator that inherits from
+            `sklearn.neighbors.base.KNeighborsMixin` that will be
+            used to find the number of nearest neighbors.
+
+        truncation_factor:
+            The type of truncation. The values should be in the `[-1.0, 1.0]` range.
+
+        deformation_factor:
+            The type of geometry. The values should be in the `[0.0, 1.0]` range.
+
+        selection_strategy:
+            The type of Geometric SMOTE algorithm with the following options:
+            `'combined'`, `'majority'`, `'minority'`.
+
+        som_estimator:
+            Defines the SOM clusterer applied to the input space.
+
+            - If `None`, a `SOM` instance with default parameters is used.
+
+            - If `SOM` object, then it is used with the given parameters.
+
+            - If `int`, the number of clusters to be used.
+
+            - If `float`, the proportion of the number of clusters over the number
+            of samples to be used.
+
+        imbalance_ratio_threshold:
+            The threshold of a filtered cluster. It can be any non-negative number or
+            `'auto'` to be calculated automatically.
+
+            - If `'auto'`, the filtering threshold is calculated from the imbalance
+            ratio of the target for the binary case or the maximum of the target's
+            imbalance ratios for the multiclass case.
+
+            - If `float` then it is manually set to this number.
+
+            Any cluster that has an imbalance ratio smaller than the filtering threshold is
+            identified as a filtered cluster and can be potentially used to generate
+            minority class instances. Higher values increase the number of filtered
+            clusters.
+
+        distances_exponent:
+            The exponent of the mean distance in the density calculation. It can be
+            any non-negative number or `'auto'` to be calculated automatically.
+
+            - If `'auto'` then it is set equal to the number of
+            features. Higher values make the calculation of density more sensitive
+            to the cluster's size i.e. clusters with large mean euclidean distance
+            between samples are penalized.
+
+            - If `float` then it is manually set to this number.
+
+        distribution_ratio:
+            The ratio of intra-cluster to inter-cluster generated samples. It is a
+            number in the `[0.0, 1.0]` range. The default value is `0.8`, a
+            number equal to the proportion of intra-cluster generated samples over
+            the total number of generated samples. As the number decreases, fewer
+            intra-cluster and more inter-cluster samples are generated.
+
+        raise_error:
+            Raise an error when no samples are generated.
+
+            - If `True`, it raises an error when no filtered clusters are
+            identified and therefore no samples are generated.
+
+            - If `False`, it displays a warning.
+
+        n_jobs:
+            Number of CPU cores used.
+
+            - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+    Attributes:
+        oversampler_ (imblearn_extra.gsmote.GeometricSMOTE):
+            A fitted `imblearn_extra.gsmote.GeometricSMOTE` instance.
+
+        clusterer_ (SOM):
+            A fitted `imblearn_extra.clover.clusterer.SOM` instance.
+
+        distributor_ (imblearn_extra.clover.distribution.DensityDistributor):
+            A fitted `imblearn_extra.clover.distribution.DensityDistributor` instance.
+
+        labels_ (Labels):
+            Labels of each sample.
+
+        neighbors_ (Neighbors):
+            An array that contains all neighboring pairs with each row being
+            a unique neighboring pair.
+
+        random_state_ (numpy.random.RandomState):
+            An instance of `numpy.random.RandomState` class.
+
+        sampling_strategy_ (dict[int, int]):
+            Actual sampling strategy.
+
+    Examples:
+        >>> import numpy as np
+        >>> from imblearn_extra.clover.over_sampling import GeometricSOMO  # doctest: +SKIP
+        >>> from sklearn.datasets import make_blobs
+        >>> blobs = [100, 800, 100]
+        >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+        >>> # Add a single 0 sample in the middle blob
+        >>> X = np.concatenate([X, [[0, 0]]])
+        >>> y = np.append(y, 0)
+        >>> # Make this a binary classification problem
+        >>> y = y == 1
+        >>> gsomo = GeometricSOMO(random_state=42)  # doctest: +SKIP
+        >>> X_res, y_res = gsomo.fit_resample(X, y)  # doctest: +SKIP
+        >>> # Find the number of new samples in the middle blob
+        >>> right, left = X_res[:, 0] > -5, X_res[:, 0] < 5  # doctest: +SKIP
+        >>> n_res_in_middle = (right & left).sum()  # doctest: +SKIP
+        >>> print("Samples in the middle blob: %s" % n_res_in_middle)  # doctest: +SKIP
+        Samples in the middle blob: 801
+        >>> unchanged = n_res_in_middle == blobs[1] + 1  # doctest: +SKIP
+        >>> print("Middle blob unchanged: %s" % unchanged)  # doctest: +SKIP
+        Middle blob unchanged: True
+        >>> more_zero_samples = (y_res == 0).sum() > (y == 0).sum()  # doctest: +SKIP
+        >>> print("More 0 samples: %s" % more_zero_samples)  # doctest: +SKIP
+        More 0 samples: True
+    """
+
+    def __init__(
+        self: Self,
+        sampling_strategy: dict[int, int] | str | float | Callable[[Targets], dict[int, int]] = 'auto',
+        random_state: np.random.RandomState | int | None = None,
+        k_neighbors: NearestNeighbors | int = 5,
+        truncation_factor: float = 1.0,
+        deformation_factor: float = 0.0,
+        selection_strategy: str = 'combined',
+        som_estimator: SOM | None = None,
+        imbalance_ratio_threshold: float | str = 'auto',
+        distances_exponent: float | str = 'auto',
+        distribution_ratio: float = 0.8,
+        raise_error: bool = True,
+        n_jobs: int | None = None,
+    ) -> None:
+        self.sampling_strategy = sampling_strategy
+        self.random_state = random_state
+        self.k_neighbors = k_neighbors
+        self.truncation_factor = truncation_factor
+        self.deformation_factor = deformation_factor
+        self.selection_strategy = selection_strategy
+        self.som_estimator = som_estimator
+        self.imbalance_ratio_threshold = imbalance_ratio_threshold
+        self.distances_exponent = distances_exponent
+        self.distribution_ratio = distribution_ratio
+        self.raise_error = raise_error
+        self.n_jobs = n_jobs
+
+    def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+        """Check various estimators."""
+        # Check oversampler
+        self.oversampler_ = GeometricSMOTE(
+            sampling_strategy=self.sampling_strategy,
+            k_neighbors=self.k_neighbors,
+            truncation_factor=self.truncation_factor,
+            deformation_factor=self.deformation_factor,
+            selection_strategy=self.selection_strategy,
+            random_state=self.random_state_,
+            n_jobs=self.n_jobs,
+        )
+
+        if self.som_estimator is None:
+            self.clusterer_ = SOM(random_state=self.random_state_)
+        elif isinstance(self.som_estimator, int):
+            check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
+            n = round(sqrt(self.som_estimator))
+            self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+        elif isinstance(self.som_estimator, float):
+            check_scalar(self.som_estimator, 'som_estimator', float, min_val=0.0, max_val=1.0)
+            n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1))
+            self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+        elif isinstance(self.som_estimator, SOM):
+            self.clusterer_ = clone(self.som_estimator)
+        else:
+            msg = (
+                'Parameter `som_estimator` should be either `None` or the number of clusters or a float '
+                'in the [0.0, 1.0] range equal to the number of clusters over the number of '
+                'samples or an instance of the `SOM` class.'
+            )
+            raise TypeError(msg)
+
+        # Check distributor
+        self.distributor_ = DensityDistributor(
+            filtering_threshold=self.imbalance_ratio_threshold,
+            distances_exponent=self.distances_exponent,
+            distribution_ratio=self.distribution_ratio,
+        )
+
+        return self
diff --git a/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py b/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py
new file mode 100644
index 0000000..f717912
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py
@@ -0,0 +1,239 @@
+"""Includes the implementation of KMeans-SMOTE."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+import numpy as np
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Targets
+from ..distribution._density import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class KMeansSMOTE(ClusterOverSampler):
+    """KMeans-SMOTE algorithm.
+
+    Applies KMeans clustering to the input space before applying SMOTE. Read
+    more in the [user_guide].
+
+    Args:
+        sampling_strategy:
+            Sampling information to resample the data set.
+
+            - When `float`, it corresponds to the desired ratio of the number of
+            samples in the minority class over the number of samples in the
+            majority class after resampling. It is only available for binary
+            classification.
+
+            - When `str`, specify the class targeted by the resampling. The
+            number of samples in the different classes will be equalized.
+            Possible choices are:
+                - `'minority'`: resample only the minority class.
+                - `'not minority'`: resample all classes but the minority class.
+                - `'not majority'`: resample all classes but the majority class.
+                - `'all'`: resample all classes.
+                - `'auto'`: equivalent to `'not majority'`.
+
+            - When `dict`, the keys correspond to the targeted classes. The
+            values correspond to the desired number of samples for each targeted
+            class.
+
+            - When callable, function taking `y` and returns a `dict`. The keys
+            correspond to the targeted classes. The values correspond to the
+            desired number of samples for each class.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+            - If `int`, it is the seed used by the random number
+            generator.
+            - If `np.random.RandomState` instance, it is the random number
+            generator.
+            - If `None`, the random number generator is the `RandomState`
+            instance used by `np.random`.
+
+        k_neighbors:
+            Defines the number of nearest neighbors to be used by SMOTE.
+
+            - If `int`, this number is used to construct synthetic
+            samples.
+
+            - If `object`, an estimator that inherits from
+            `sklearn.neighbors.base.KNeighborsMixin` that will be
+            used to find the number of nearest neighbors.
+
+        kmeans_estimator:
+            Defines the KMeans clusterer applied to the input space.
+
+            - If `None`, `sklearn.cluster.MiniBatchKMeans` is used, which
+            tends to perform better with a large number of samples.
+
+            - If `KMeans` object, then an instance of either
+            `sklearn.cluster.KMeans` or `sklearn.cluster.MiniBatchKMeans`.
+
+            - If `int`, the number of clusters to be used.
+
+            - If `float`, the proportion of the number of clusters over the number
+            of samples to be used.
+
+        imbalance_ratio_threshold:
+            The threshold of a filtered cluster. It can be any non-negative number or
+            `'auto'` to be calculated automatically.
+
+            - If `'auto'`, the filtering threshold is calculated from the imbalance
+            ratio of the target for the binary case or the maximum of the target's
+            imbalance ratios for the multiclass case.
+
+            - If `float` then it is manually set to this number.
+
+            Any cluster that has an imbalance ratio smaller than the filtering threshold is
+            identified as a filtered cluster and can be potentially used to generate
+            minority class instances. Higher values increase the number of filtered
+            clusters.
+
+        distances_exponent:
+            The exponent of the mean distance in the density calculation. It can be
+            any non-negative number or `'auto'` to be calculated automatically.
+
+            - If `'auto'` then it is set equal to the number of
+            features. Higher values make the calculation of density more sensitive
+            to the cluster's size i.e. clusters with large mean euclidean distance
+            between samples are penalized.
+
+            - If `float` then it is manually set to this number.
+
+        raise_error:
+            Raise an error when no samples are generated.
+
+            - If `True`, it raises an error when no filtered clusters are
+            identified and therefore no samples are generated.
+
+            - If `False`, it displays a warning.
+
+        n_jobs:
+            Number of CPU cores used.
+
+            - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+    Attributes:
+        oversampler_ (imblearn.over_sampling.SMOTE):
+            A fitted `imblearn.over_sampling.SMOTE` instance.
+
+        clusterer_ (sklearn.cluster.KMeans | sklearn.cluster.MiniBatchKMeans):
+            A fitted `sklearn.cluster.KMeans` or `sklearn.cluster.MiniBatchKMeans` instance.
+
+        distributor_ (clover.distribution.DensityDistributor):
+            A fitted `clover.distribution.DensityDistributor` instance.
+
+        labels_ (Labels):
+            Cluster labels of each sample.
+
+        neighbors_ (None):
+            It is `None` since KMeans does not support this attribute.
+
+        random_state_ (np.random.RandomState):
+            An instance of `np.random.RandomState` class.
+
+        sampling_strategy_ (dict[int, int]):
+            Actual sampling strategy.
+
+    Examples:
+        >>> import numpy as np
+        >>> from imblearn_extra.clover.over_sampling import KMeansSMOTE
+        >>> from sklearn.datasets import make_blobs
+        >>> blobs = [100, 800, 100]
+        >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+        >>> # Add a single 0 sample in the middle blob
+        >>> X = np.concatenate([X, [[0, 0]]])
+        >>> y = np.append(y, 0)
+        >>> # Make this a binary classification problem
+        >>> y = y == 1
+        >>> kmeans_smote = KMeansSMOTE(random_state=42)
+        >>> X_res, y_res = kmeans_smote.fit_resample(X, y)
+        >>> # Find the number of new samples in the middle blob
+        >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
+        >>> print("Samples in the middle blob: %s" % n_res_in_middle)
+        Samples in the middle blob: 801
+        >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
+        Middle blob unchanged: True
+        >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
+        More 0 samples: True
+    """
+
+    def __init__(
+        self: Self,
+        sampling_strategy: dict[int, int] | str = 'auto',
+        random_state: np.random.RandomState | int | None = None,
+        k_neighbors: NearestNeighbors | int = 5,
+        kmeans_estimator: KMeans | None = None,
+        imbalance_ratio_threshold: float | str = 'auto',
+        distances_exponent: float | str = 'auto',
+        raise_error: bool = True,
+        n_jobs: int | None = None,
+    ) -> None:
+        self.sampling_strategy = sampling_strategy
+        self.random_state = random_state
+        self.k_neighbors = k_neighbors
+        self.kmeans_estimator = kmeans_estimator
+        self.imbalance_ratio_threshold = imbalance_ratio_threshold
+        self.distances_exponent = distances_exponent
+        self.raise_error = raise_error
+        self.n_jobs = n_jobs
+
+    def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+        """Check various estimators."""
+        # Check oversampler
+        self.oversampler_ = SMOTE(
+            sampling_strategy=self.sampling_strategy,
+            k_neighbors=self.k_neighbors,
+            random_state=self.random_state_,
+            n_jobs=self.n_jobs,
+        )
+
+        # Check clusterer
+        if self.kmeans_estimator is None:
+            self.clusterer_ = MiniBatchKMeans(random_state=self.random_state_, n_init='auto')
+        elif isinstance(self.kmeans_estimator, int):
+            check_scalar(self.kmeans_estimator, 'kmeans_estimator', int, min_val=1)
+            self.clusterer_ = MiniBatchKMeans(
+                n_clusters=self.kmeans_estimator,
+                random_state=self.random_state_,
+                n_init='auto',
+            )
+        elif isinstance(self.kmeans_estimator, float):
+            check_scalar(
+                self.kmeans_estimator,
+                'kmeans_estimator',
+                float,
+                min_val=0.0,
+                max_val=1.0,
+            )
+            n_clusters = round((X.shape[0] - 1) * self.kmeans_estimator + 1)
+            # Use the fitted `random_state_` attribute, consistently with the other branches
+            self.clusterer_ = MiniBatchKMeans(n_clusters=n_clusters, random_state=self.random_state_, n_init='auto')
+        elif isinstance(self.kmeans_estimator, KMeans | MiniBatchKMeans):
+            self.clusterer_ = clone(self.kmeans_estimator)
+        else:
+            msg = (
+                'Parameter `kmeans_estimator` should be either `None` or the number of clusters '
+                'or a float in the [0.0, 1.0] range equal to the number of clusters over the number '
+                'of samples or an instance of either `KMeans` or `MiniBatchKMeans` class.'
+            )
+            raise TypeError(msg)
+
+        # Check distributor
+        self.distributor_ = DensityDistributor(
+            filtering_threshold=self.imbalance_ratio_threshold,
+            distances_exponent=self.distances_exponent,
+        )
+
+        return self
diff --git a/src/imblearn_extra/clover/over_sampling/_somo.py b/src/imblearn_extra/clover/over_sampling/_somo.py
new file mode 100644
index 0000000..81057ca
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_somo.py
@@ -0,0 +1,215 @@
+"""Includes the implementation of SOMO."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+from math import sqrt
+
+import numpy as np
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Targets
+from ..clusterer import SOM
+from ..distribution._density import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class SOMO(ClusterOverSampler):
+    """SOMO algorithm.
+
+    Applies the SOM algorithm to the input space before applying SMOTE. Read
+    more in the [user_guide].
+
+    Args:
+        sampling_strategy:
+            Sampling information to resample the data set.
+
+            - When `float`, it corresponds to the desired ratio of the number of
+            samples in the minority class over the number of samples in the
+            majority class after resampling. It is only available for binary
+            classification.
+
+            - When `str`, specify the class targeted by the resampling. The
+            number of samples in the different classes will be equalized.
+            Possible choices are:
+
+                - `'minority'`: resample only the minority class.
+                - `'not minority'`: resample all classes but the minority class.
+                - `'not majority'`: resample all classes but the majority class.
+                - `'all'`: resample all classes.
+                - `'auto'`: equivalent to `'not majority'`.
+
+            - When `dict`, the keys correspond to the targeted classes. The
+            values correspond to the desired number of samples for each targeted
+            class.
+
+            - When callable, a function taking `y` and returning a `dict`. The keys
+            correspond to the targeted classes. The values correspond to the
+            desired number of samples for each class.
+
+        random_state:
+            Control the randomization of the algorithm.
+
+            - If `int`, it is the seed used by the random number
+            generator.
+            - If `np.random.RandomState` instance, it is the random number
+            generator.
+            - If `None`, the random number generator is the `RandomState`
+            instance used by `np.random`.
+
+        k_neighbors:
+            Defines the number of nearest neighbors to be used by SMOTE.
+
+            - If `int`, this number is used to construct synthetic
+            samples.
+
+            - If `object`, an estimator that inherits from
+            `sklearn.neighbors.base.KNeighborsMixin` that will be
+            used to find the number of nearest neighbors.
+
+        som_estimator:
+            Defines the SOM clusterer applied to the input space.
+
+            - If `None`, a `clover.clusterer.SOM` instance with the default
+            parameters is used.
+
+            - If a SOM object, it is a `clover.clusterer.SOM` instance.
+
+            - If `int`, the number of clusters to be used.
+
+            - If `float`, the proportion of the number of clusters over the number
+            of samples to be used.
+
+        distribution_ratio:
+            The ratio of intra-cluster to inter-cluster generated samples. It is a
+            number in the `[0.0, 1.0]` range equal to the proportion of intra-cluster
+            generated samples over the total number of generated samples, with a
+            default value of `0.8`.
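+            For example, the default `distribution_ratio=0.8` assigns roughly 80%
+            of the generated samples to intra-cluster generation and the remaining
+            20% to inter-cluster generation.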
+            As the number decreases, less intra-cluster and more inter-cluster
+            samples are generated.
+
+        raise_error:
+            Raise an error when no samples are generated.
+
+            - If `True`, it raises an error when no filtered clusters are
+            identified and therefore no samples are generated.
+
+            - If `False`, it displays a warning.
+
+        n_jobs:
+            Number of CPU cores used.
+
+            - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, all processors are used.
+
+    Attributes:
+        oversampler_ (imblearn.over_sampling.SMOTE):
+            A fitted `imblearn.over_sampling.SMOTE` instance.
+
+        clusterer_ (clover.clusterer.SOM):
+            A fitted `clover.clusterer.SOM` instance.
+
+        distributor_ (clover.distribution.DensityDistributor):
+            A fitted `clover.distribution.DensityDistributor` instance.
+
+        labels_ (Labels):
+            Cluster labels of each sample.
+
+        neighbors_ (Neighbors):
+            An array that contains all neighboring pairs with each row being
+            a unique neighboring pair.
+
+        random_state_ (np.random.RandomState):
+            An instance of `np.random.RandomState` class.
+
+        sampling_strategy_ (dict[int, int]):
+            Actual sampling strategy.
+
+    Examples:
+        >>> import numpy as np
+        >>> from imblearn_extra.clover.over_sampling import SOMO  # doctest: +SKIP
+        >>> from sklearn.datasets import make_blobs
+        >>> blobs = [100, 800, 100]
+        >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+        >>> # Add a single 0 sample in the middle blob
+        >>> X = np.concatenate([X, [[0, 0]]])
+        >>> y = np.append(y, 0)
+        >>> # Make this a binary classification problem
+        >>> y = y == 1
+        >>> somo = SOMO(random_state=42)  # doctest: +SKIP
+        >>> X_res, y_res = somo.fit_resample(X, y)  # doctest: +SKIP
+        >>> # Find the number of new samples in the middle blob
+        >>> right, left = X_res[:, 0] > -5, X_res[:, 0] < 5  # doctest: +SKIP
+        >>> n_res_in_middle = (right & left).sum()  # doctest: +SKIP
+        >>> print("Samples in the middle blob: %s" % n_res_in_middle)  # doctest: +SKIP
+        Samples in the middle blob: 801
+        >>> unchanged = n_res_in_middle == blobs[1] + 1  # doctest: +SKIP
+        >>> print("Middle blob unchanged: %s" % unchanged)  # doctest: +SKIP
+        Middle blob unchanged: True
+        >>> more_zero_samples = (y_res == 0).sum() > (y == 0).sum()  # doctest: +SKIP
+        >>> print("More 0 samples: %s" % more_zero_samples)  # doctest: +SKIP
+        More 0 samples: True
+    """
+
+    def __init__(
+        self: Self,
+        sampling_strategy: dict[int, int] | str = 'auto',
+        random_state: np.random.RandomState | int | None = None,
+        k_neighbors: NearestNeighbors | int = 5,
+        som_estimator: SOM | None = None,
+        distribution_ratio: float = 0.8,
+        raise_error: bool = True,
+        n_jobs: int | None = None,
+    ) -> None:
+        self.sampling_strategy = sampling_strategy
+        self.random_state = random_state
+        self.k_neighbors = k_neighbors
+        self.som_estimator = som_estimator
+        self.distribution_ratio = distribution_ratio
+        self.raise_error = raise_error
+        self.n_jobs = n_jobs
+
+    def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+        """Check various estimators."""
+        # Check oversampler
+        self.oversampler_ = SMOTE(
+            sampling_strategy=self.sampling_strategy,
+            k_neighbors=self.k_neighbors,
+            random_state=self.random_state_,
+            n_jobs=self.n_jobs,
+        )
+
+        # Check clusterer and number of clusters
+        if self.som_estimator is None:
+            self.clusterer_ = SOM(random_state=self.random_state_)
+        elif isinstance(self.som_estimator, int):
+            check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
+            n = round(sqrt(self.som_estimator))
+            self.clusterer_ = SOM(n_columns=n, n_rows=n,
random_state=self.random_state_) + elif isinstance(self.som_estimator, float): + check_scalar(self.som_estimator, 'som_estimator', float, min_val=0, max_val=1) + n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1)) + self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_) + elif isinstance(self.som_estimator, SOM): + self.clusterer_ = clone(self.som_estimator) + else: + msg = ( + 'Parameter `som_estimator` should be either `None` or the number of ' + 'clusters or a float in the [0.0, 1.0] range equal to the number of ' + 'clusters over the number of samples or an instance of the `SOM` class.' + ) + raise TypeError(msg) + + # Check distributor + self.distributor_ = DensityDistributor( + distribution_ratio=self.distribution_ratio, + filtering_threshold=1, + distances_exponent=2, + ) + + return self diff --git a/src/gsmote/__init__.py b/src/imblearn_extra/gsmote/__init__.py similarity index 100% rename from src/gsmote/__init__.py rename to src/imblearn_extra/gsmote/__init__.py diff --git a/src/gsmote/geometric_smote.py b/src/imblearn_extra/gsmote/geometric_smote.py similarity index 98% rename from src/gsmote/geometric_smote.py rename to src/imblearn_extra/gsmote/geometric_smote.py index ad570ef..e729c56 100644 --- a/src/gsmote/geometric_smote.py +++ b/src/imblearn_extra/gsmote/geometric_smote.py @@ -6,6 +6,7 @@ import math from collections import Counter +from collections.abc import Callable import numpy as np from imblearn.over_sampling.base import BaseOverSampler @@ -93,6 +94,7 @@ def populate_categorical_features( for start_idx, end_idx in zip( np.cumsum(categories_size)[:-1], np.cumsum(categories_size)[1:], + strict=False, ): col_maxs = neighbors[:, start_idx:end_idx].sum(axis=0) is_max = np.isclose(col_maxs, col_maxs.max(axis=0)) @@ -214,7 +216,7 @@ class GeometricSMOTE(BaseOverSampler): def __init__( self: Self, - sampling_strategy: dict[int, int] | str = 'auto', + sampling_strategy: dict[int, int] | str | float | Callable = 'auto', k_neighbors: NearestNeighbors | int = 5, truncation_factor: float = 1.0, deformation_factor: float = 0.0, @@ -386,7 +388,7 @@ def _make_geometric_samples( # noqa: C901 # Generate new samples X_new = np.zeros((n_samples, X.shape[1])) all_neighbors = [] - for ind, (row, col) in enumerate(zip(rows, cols)): + for ind, (row, col) in enumerate(zip(rows, cols, strict=False)): # Define center point center = X_pos[row] diff --git a/src/gsmote/py.typed b/src/imblearn_extra/py.typed similarity index 100% rename from src/gsmote/py.typed rename to src/imblearn_extra/py.typed diff --git a/tests/__init__.py b/tests/__init__.py index 15928cd..6084b50 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,4 +1,4 @@ -"""Tests suite for `gsmote`.""" +"""Tests suite for `imblearn_extra`.""" from pathlib import Path diff --git a/tests/clover/__init__.py b/tests/clover/__init__.py new file mode 100644 index 0000000..a0384c7 --- /dev/null +++ b/tests/clover/__init__.py @@ -0,0 +1 @@ +"""Tests for clustering-based oversampling.""" diff --git a/tests/clover/clusterer/__init__.py b/tests/clover/clusterer/__init__.py new file mode 100644 index 0000000..3acbc95 --- /dev/null +++ b/tests/clover/clusterer/__init__.py @@ -0,0 +1 @@ +"""Tests for clusterer classes.""" diff --git a/tests/clover/clusterer/test_som.py b/tests/clover/clusterer/test_som.py new file mode 100644 index 0000000..5dfe0f4 --- /dev/null +++ b/tests/clover/clusterer/test_som.py @@ -0,0 +1,74 @@ +"""Test the _som module.""" + +import numpy as np +from imblearn_extra.clover.clusterer 
import SOM, extract_topological_neighbors, generate_labels_mapping +from sklearn.datasets import make_classification + +RANDOM_STATE = 5 +X, _ = make_classification(random_state=RANDOM_STATE, n_samples=1000) + + +def test_generate_labels_mapping(): + """Test the generation of the labels mapping.""" + grid_labels = [(1, 1), (0, 0), (0, 1), (1, 0), (1, 1), (1, 0), (0, 1)] + labels_mapping = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3} + assert generate_labels_mapping(grid_labels) == labels_mapping + + +def test_return_topological_neighbors_rectangular(): + """Test the topological neighbors of a neuron for rectangular grid type.""" + som = SOM(random_state=RANDOM_STATE).fit(X) + labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]}) + assert extract_topological_neighbors(0, 0, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [ + (1, 0), + (0, 1), + ] + assert extract_topological_neighbors(1, 1, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [ + (0, 1), + (2, 1), + (1, 0), + (1, 2), + ] + + +def test_return_topological_neighbors_hexagonal(): + """Test the topological neighbors of a neuron for hexagonal grid type.""" + som = SOM(random_state=RANDOM_STATE, topology='hexagonal').fit(X) + labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]}) + assert extract_topological_neighbors(0, 0, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [ + (1, 0), + (0, 1), + ] + assert extract_topological_neighbors(1, 1, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [ + (0, 1), + (2, 1), + (1, 0), + (1, 2), + (2, 2), + (2, 0), + ] + + +def test_no_fit(): + """Test the SOM initialization.""" + som = SOM(random_state=RANDOM_STATE) + assert not hasattr(som, 'labels_') + assert not hasattr(som, 'neighbors_') + assert not hasattr(som, 'algorithm_') + assert not hasattr(som, 'n_columns_') + assert not hasattr(som, 'n_rows_') + assert not hasattr(som, 'labels_mapping_') + + +def test_fit(): + """Test the SOM fitting process.""" + n_rows = 5 + n_columns = 3 + som = SOM(n_rows=n_rows, n_columns=n_columns, random_state=RANDOM_STATE) + som.fit(X) + assert np.array_equal(np.unique(som.labels_), np.arange(0, n_rows * n_columns)) + assert som.n_rows_ == n_rows + assert som.n_columns_ == n_columns + assert hasattr(som, 'neighbors_') + assert hasattr(som, 'algorithm_') + assert hasattr(som, 'labels_mapping_') diff --git a/tests/clover/distribution/__init__.py b/tests/clover/distribution/__init__.py new file mode 100644 index 0000000..96ac577 --- /dev/null +++ b/tests/clover/distribution/__init__.py @@ -0,0 +1 @@ +"""Tests for distribution classes.""" diff --git a/tests/clover/distribution/test_base.py b/tests/clover/distribution/test_base.py new file mode 100644 index 0000000..1e71f3e --- /dev/null +++ b/tests/clover/distribution/test_base.py @@ -0,0 +1,25 @@ +"""Test the _base module.""" + +import numpy as np +import pytest +from imblearn_extra.clover.distribution.base import BaseDistributor +from sklearn.datasets import make_classification + + +@pytest.mark.parametrize(("n_samples", "n_classes", "weights"), [(20, 2, [0.8, 0.2]), (10, 3, [0.6, 0.2, 0.2])]) +def test_fit(n_samples, n_classes, weights): + """Test fit method.""" + X, y = make_classification( + random_state=0, + n_samples=n_samples, + n_classes=n_classes, + weights=weights, + n_informative=5, + ) + distributor = BaseDistributor().fit(X, y) + assert len(distributor.majority_class_labels_) == 1 + assert 
distributor.majority_class_labels_[0] == 0 + np.testing.assert_array_equal(distributor.labels_, np.repeat(0, n_samples)) + np.testing.assert_array_equal(distributor.neighbors_, np.empty((0, 2))) + assert distributor.intra_distribution_ == {(0, class_label): 1.0 for class_label in range(1, n_classes)} + assert distributor.inter_distribution_ == {} diff --git a/tests/clover/distribution/test_density.py b/tests/clover/distribution/test_density.py new file mode 100644 index 0000000..a6f448c --- /dev/null +++ b/tests/clover/distribution/test_density.py @@ -0,0 +1,348 @@ +"""Test the _density module.""" + +import numpy as np +import pytest +from imblearn_extra.clover.distribution._density import DensityDistributor +from sklearn.base import clone + +X = np.array( + [ + [1.0, 1.0], + [1.0, 2.0], + [1.5, 1.5], + [-1.0, 1.0], + [-1.0, 1.5], + [-1.0, -1.0], + [2.0, -1.0], + [2.5, -1.0], + [2.5, -1.5], + [2.0, -1.5], + [2.0, -2.0], + [2.0, -2.5], + [3.0, -1.0], + [2.0, -1.0], + [4.0, -1.0], + ], +) +y_bin = np.array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1]) +y_multi = np.array([0, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2]) +y_partial_tie = np.array([0, 1, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2]) +y_full_tie = np.array([0, 1, 2, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 1, 2]) +LABELS = np.array([0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4]) +NEIGHBORS_BIN = np.array([(0, 1), (0, 2), (0, 3), (4, 2), (2, 3)]) +NEIGHBORS_MULTI = np.array([(0, 1), (1, 4), (2, 3)]) +DISTRIBUTOR = DensityDistributor(filtering_threshold=0.6, distances_exponent=1) + + +def test_filtered_clusters_binary(): + """Test the identification of filtered clusters. + + Binary case. + """ + distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS) + assert distributor.filtered_clusters_ == [(0, 1), (2, 1), (4, 1)] + + +def test_filtered_clusters_multiclass(): + """Test the identification of filtered clusters. + + Multiclass case. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS) + assert distributor.filtered_clusters_ == [ + (0, 1), + (0, 2), + (1, 1), + (1, 2), + (4, 1), + (4, 2), + ] + + +def test_filtered_clusters_multiclass_partial_tie(): + """Test the identification of filtered clusters. + + Multiclass case with partial tie. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS) + assert distributor.filtered_clusters_ == [(1, 2), (4, 2)] + + +def test_filtered_clusters_multiclass_full_tie(): + """Test the identification of filtered clusters. + + Multiclass case with full tie. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_full_tie, LABELS) + assert distributor.filtered_clusters_ == [] + + +def test_clusters_density_binary(): + """Test the filtered clusters density. + + Binary case. + """ + distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS) + assert distributor.clusters_density_ == {(0, 1): 2.0, (2, 1): 2.25, (4, 1): 2.25} + + +def test_clusters_density_multiclass(): + """Test the filtered clusters density. + + Multiclass case. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS) + assert distributor.clusters_density_ == { + (0, 1): 2.0, + (0, 2): 2.0, + (1, 1): 2.0, + (1, 2): 2.0, + (4, 1): 2.0, + (4, 2): 2.0, + } + + +def test_clusters_density_multiclass_partial_tie(): + """Test filtered clusters density. + + Multiclass case with partial tie. 
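+    Clusters where classes tie are expected to be excluded, so the density
+    mapping keeps only the unambiguous (cluster label, class label) pairs.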
+ """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS) + assert distributor.clusters_density_ == { + (1, 2): 4.0, + (4, 2): 4.0, + } + + +def test_clusters_density_multiclass_full_tie(): + """Test filtered clusters density. + + Multiclass case with full tie. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_full_tie, LABELS) + assert distributor.clusters_density_ == {} + + +def test_clusters_density_no_filtered(): + """Test filter clusters density. + + No filtered clusters case. + """ + X = np.arange(0.0, 5.0).reshape(-1, 1) + y = np.array([0, 0, 0, 1, 1]) + labels = np.array([-1, -1, -1, -1, -1]) + distributor = clone(DISTRIBUTOR).set_params().fit(X, y, labels) + assert distributor.clusters_density_ == {} + + +def test_raise_error_filtering_threshold(): + """Test raise error for filtering threshold. + + Value and type error cases. + """ + with pytest.raises(ValueError, match='filtering_threshold == -1.0, must be >= 0.0'): + clone(DISTRIBUTOR).set_params(filtering_threshold=-1.0).fit(X, y_bin, LABELS) + with pytest.raises(TypeError, match='filtering_threshold must be an instance of {int, float}, not NoneType'): + clone(DISTRIBUTOR).set_params(filtering_threshold=None).fit(X, y_bin, LABELS) + with pytest.raises(TypeError, match='filtering_threshold must be an instance of {int, float}, not str'): + clone(DISTRIBUTOR).set_params(filtering_threshold='value').fit(X, y_bin, LABELS) + + +def test_raise_error_distances_exponent(): + """Test raise error for distances exponent. + + Value and type error cases. + """ + with pytest.raises(ValueError, match='distances_exponent == -1.0, must be >= 0.0'): + clone(DISTRIBUTOR).set_params(distances_exponent=-1.0).fit(X, y_bin, LABELS) + with pytest.raises(TypeError, match='distances_exponent must be an instance of {int, float}, not None'): + clone(DISTRIBUTOR).set_params(distances_exponent=None).fit(X, y_bin, LABELS) + with pytest.raises(TypeError, match='distances_exponent must be an instance of {int, float}, not str'): + clone(DISTRIBUTOR).set_params(distances_exponent='value').fit(X, y_bin, LABELS) + + +def test_raise_error_sparsity_based(): + """Test raise error for sparsity based. + + Type error case. + """ + with pytest.raises(TypeError, match='sparsity_based must be an instance of bool, not NoneType'): + clone(DISTRIBUTOR).set_params(sparsity_based=None).fit(X, y_bin, LABELS) + + +def test_raise_error_distribution_ratio(): + """Test raise error for distribution ratio. + + Type error case. + """ + with pytest.raises(ValueError, match='distribution_ratio == -1.0, must be >= 0.0'): + clone(DISTRIBUTOR).set_params(distribution_ratio=-1.0).fit(X, y_bin, LABELS) + with pytest.raises(ValueError, match='distribution_ratio == 2.0, must be <= 1.0'): + clone(DISTRIBUTOR).set_params(distribution_ratio=2.0).fit(X, y_bin, LABELS) + with pytest.raises(TypeError, match='distribution_ratio must be an instance of float, not str'): + clone(DISTRIBUTOR).set_params(distribution_ratio='value').fit(X, y_bin, LABELS) + + +def test_raise_error_no_neighbors_distribution_ratio(): + """Test distribution ratio. + + No neighbors value error case. + """ + with pytest.raises( + ValueError, + match=('Parameter `distribution_ratio` should be equal to 1.0, when `neighbors` parameter is `None`.'), + ): + clone(DISTRIBUTOR).set_params(distribution_ratio=0.5).fit(X, y_bin, LABELS, neighbors=None) + + +def test_fit_default(): + """Test fit method. + + Default initialization case. 
+ """ + distributor = clone(DISTRIBUTOR).fit(X, y_bin, None, None) + assert distributor.majority_class_labels_ == [0] + assert hasattr(distributor, 'filtered_clusters_') + assert hasattr(distributor, 'clusters_density_') + np.testing.assert_array_equal(distributor.labels_, np.repeat(0, len(X))) + np.testing.assert_array_equal(distributor.neighbors_, np.empty((0, 2))) + assert distributor.intra_distribution_ == {(0, 1): 1.0} + assert distributor.inter_distribution_ == {} + + +def test_fit_binary_intra(): + """Test fit method. + + Binary and intra-cluster generation case. + """ + distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 9.0 / 25.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(2, 1)], 8.0 / 25.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 8.0 / 25.0) + + +def test_fit_multiclass_intra(): + """Test fit method. + + Multiclass and intra-cluster generation case. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 1.0 / 3.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 1)], 1.0 / 3.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 1.0 / 3.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 2)], 1.0 / 3.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 3.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 3.0) + + +def test_fit_multiclass_intra_partial_tie(): + """Test fit method. + + Multiclass intra-cluster generation and partial tie case. + """ + distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 0.5) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 0.5) + + +def test_fit_binary_inter(): + """Test fit method. + + Binary and inter-cluster generation case. + """ + distributor = clone(DISTRIBUTOR).set_params(distribution_ratio=0.0).fit(X, y_bin, LABELS, NEIGHBORS_BIN) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_BIN) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (2, 1))], 18.0 / 35.0) + np.testing.assert_almost_equal(distributor.inter_distribution_[((4, 1), (2, 1))], 17.0 / 35.0) + + +def test_fit_multiclass_inter(): + """Test fit method. + + Multiclass and inter-cluster generation case. + """ + distributor = ( + clone(DISTRIBUTOR) + .set_params(distribution_ratio=0.0, filtering_threshold=1.0) + .fit(X, y_multi, LABELS, NEIGHBORS_MULTI) + ) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (1, 1))], 0.5) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 1), (4, 1))], 0.5) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 2), (1, 2))], 0.5) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.5) + + +def test_fit_multiclass_inter_partial_tie(): + """Test fit method. + + Multiclass, intra-cluster generation and partial tie case. 
+ """ + distributor = ( + clone(DISTRIBUTOR) + .set_params(distribution_ratio=0.0, filtering_threshold=1.0) + .fit(X, y_partial_tie, LABELS, NEIGHBORS_MULTI) + ) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 1) + + +def test_fit_binary_intra_inter(): + """Test fit method. + + Binary, intra-cluster generation and inter-cluster generation case. + """ + distributor = clone(DISTRIBUTOR).set_params(distribution_ratio=0.5).fit(X, y_bin, LABELS, NEIGHBORS_BIN) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_BIN) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 9.0 / 50.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(2, 1)], 8.0 / 50.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 8.0 / 50.0) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (2, 1))], 18.0 / 70.0) + np.testing.assert_almost_equal(distributor.inter_distribution_[((4, 1), (2, 1))], 17.0 / 70.0) + + +def test_fit_multiclass_intra_inter(): + """Test fit method. + + Multiclass, intra-cluster generation and inter-cluster generation + case. + """ + distributor = ( + clone(DISTRIBUTOR) + .set_params(distribution_ratio=0.5, filtering_threshold=1.0) + .fit(X, y_multi, LABELS, NEIGHBORS_MULTI) + ) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 1)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 2)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 6.0) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (1, 1))], 0.25) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 1), (4, 1))], 0.25) + np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 2), (1, 2))], 0.25) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.25) + + +def test_fit_multiclass_intra_inter_partial_tie(): + """Test fit method. + + Multiclass, intra-cluster generation, inter-cluster generation case + and partial tie case. 
+ """ + distributor = ( + clone(DISTRIBUTOR) + .set_params(distribution_ratio=0.5, filtering_threshold=1.0) + .fit(X, y_partial_tie, LABELS, NEIGHBORS_MULTI) + ) + np.testing.assert_equal(distributor.labels_, LABELS) + np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI) + np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 4.0) + np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 4.0) + np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.5) diff --git a/tests/clover/over_sampling/__init__.py b/tests/clover/over_sampling/__init__.py new file mode 100644 index 0000000..238038c --- /dev/null +++ b/tests/clover/over_sampling/__init__.py @@ -0,0 +1 @@ +"""Tests for clustering-based over-samplers.""" diff --git a/tests/clover/over_sampling/test_cluster.py b/tests/clover/over_sampling/test_cluster.py new file mode 100644 index 0000000..5442829 --- /dev/null +++ b/tests/clover/over_sampling/test_cluster.py @@ -0,0 +1,326 @@ +"""Test the _cluster module.""" + +from collections import Counter, OrderedDict + +import numpy as np +import pytest +from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, RandomOverSampler +from imblearn_extra.clover.distribution import DensityDistributor +from imblearn_extra.clover.over_sampling import ( + ClusterOverSampler, + clone_modify, + extract_inter_data, + extract_intra_data, + generate_in_cluster, + modify_nn, +) +from sklearn.base import clone +from sklearn.cluster import KMeans +from sklearn.datasets import make_classification +from sklearn.exceptions import FitFailedWarning +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state + +RANDOM_STATE = 1 +X, y = make_classification( + random_state=RANDOM_STATE, + n_classes=3, + n_samples=5000, + n_features=10, + n_clusters_per_class=2, + weights=[0.2, 0.5, 0.3], + n_informative=5, +) +CLUSTERER = KMeans(n_clusters=5, n_init=1, random_state=RANDOM_STATE) +OVERSAMPLERS = [ + RandomOverSampler(random_state=RANDOM_STATE), + SMOTE(random_state=RANDOM_STATE), + BorderlineSMOTE(random_state=RANDOM_STATE), + SVMSMOTE(random_state=RANDOM_STATE), +] +CLUSTER_OVERSAMPLERS = [ + ClusterOverSampler(RandomOverSampler(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE), + ClusterOverSampler(SMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE), + ClusterOverSampler(BorderlineSMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE), + ClusterOverSampler(SVMSMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE), +] + + +def test_modify_nn_object(): + """Test modification of nearest neighbors. + + Object case. + """ + n_neighbors = 2 + assert modify_nn(NearestNeighbors(n_neighbors=5), 3).n_neighbors == n_neighbors + assert modify_nn(NearestNeighbors(n_neighbors=3), 3).n_neighbors == n_neighbors + assert modify_nn(NearestNeighbors(n_neighbors=2), 5).n_neighbors == n_neighbors + + +def test_modify_nn_int(): + """Test modification of nearest neighbors. + + Integer case. + """ + n_neighbors = 2 + assert modify_nn(5, 3) == n_neighbors + assert modify_nn(3, 3) == n_neighbors + assert modify_nn(2, 5) == n_neighbors + + +def test_clone_modify_ros(): + """Test cloning and modification of oversampler. + + Random oversampler case. 
+ """ + cloned_oversampler = clone_modify(OVERSAMPLERS[0], None, None) + assert isinstance(cloned_oversampler, RandomOverSampler) + + +@pytest.mark.parametrize( + 'oversampler', + [ovs for ovs in OVERSAMPLERS if not isinstance(ovs, RandomOverSampler)], +) +def test_clone_modify_single_min_sample(oversampler): + """Test cloning and modification of oversampler. + + One minority class sample case. + """ + class_label = 1 + y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2]) + cloned_oversampler = clone_modify(oversampler, class_label, y_in_cluster) + assert isinstance(cloned_oversampler, RandomOverSampler) + + +@pytest.mark.parametrize( + 'oversampler', + [ovs for ovs in OVERSAMPLERS if not isinstance(ovs, RandomOverSampler)], +) +def test_clone_modify_neighbors(oversampler): + """Test cloning and modification of oversampler. + + Neighbors based oversamplers case. + """ + class_label = 2 + y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2]) + n_minority_samples = Counter(y_in_cluster)[class_label] + cloned_oversampler = clone_modify(oversampler, class_label, y_in_cluster) + assert isinstance(cloned_oversampler, oversampler.__class__) + if hasattr(cloned_oversampler, 'k_neighbors'): + assert cloned_oversampler.k_neighbors == n_minority_samples - 1 + if hasattr(cloned_oversampler, 'm_neighbors'): + assert cloned_oversampler.m_neighbors in (y_in_cluster.size - 1, 'deprecated') + if hasattr(cloned_oversampler, 'n_neighbors'): + assert cloned_oversampler.n_neighbors in (n_minority_samples - 1, 'deprecated') + + +def test_extract_intra_data(): + """Test extraction of in the clusters data. + + Multiclass case. + """ + X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]).reshape(-1, 1) + y = np.array([0, 0, 0, 0, 1, 2, 2, 2, 0]) + cluster_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2]) + intra_distribution = {(1, 1): 1.0, (1, 2): 0.8, (2, 2): 0.2} + sampling_strategy = OrderedDict({1: 4, 2: 2}) + clusters_data = extract_intra_data(X, y, cluster_labels, intra_distribution, sampling_strategy) + cluster_sampling_strategies, Xs, ys = zip(*clusters_data, strict=True) + assert cluster_sampling_strategies == ({1: 4}, {2: 2}) + assert [X.tolist() for X in Xs] == [[[4.0], [5.0]], [[4.0], [6.0]]] + assert [y.tolist() for y in ys] == [[0, 1], [0, 2]] + + +def test_extract_inter_data(): + """Test extraction of between clusters data. + + Multiclass case. 
+ """ + X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).reshape(-1, 1) + y = np.array([1, 0, 0, 0, 1, 2, 2, 2, 0, 0, 1, 0]) + cluster_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2]) + inter_distribution = { + ((0, 1), (1, 1)): 0.5, + ((1, 1), (2, 1)): 0.5, + ((1, 2), (2, 2)): 1.0, + } + sampling_strategy = OrderedDict({1: 3, 2: 3}) + random_state = check_random_state(RANDOM_STATE) + clusters_data = extract_inter_data(X, y, cluster_labels, inter_distribution, sampling_strategy, random_state) + cluster_sampling_strategies, Xs, ys = zip(*clusters_data, strict=True) + assert cluster_sampling_strategies == ( + {1: 1}, + {1: 1}, + {1: 1}, + {1: 1}, + {2: 1}, + {2: 1}, + {2: 1}, + ) + assert [X.tolist() for X in Xs] == 2 * [[[1.0], [5.0], [2.0], [3.0], [4.0]]] + 2 * [ + [[5.0], [11.0], [4.0], [9.0], [10.0], [12.0]], + ] + 2 * [[[6.0], [8.0], [4.0], [9.0], [10.0], [12.0]]] + [[[6.0], [7.0], [4.0], [9.0], [10.0], [12.0]]] + assert [y.tolist() for y in ys] == 2 * [[1, 1, 0, 0, 0]] + 2 * [[1, 1, 0, 0, 0, 0]] + 3 * [[2, 2, 0, 0, 0, 0]] + + +@pytest.mark.parametrize('oversampler', OVERSAMPLERS) +def test_generate_in_cluster(oversampler): + """Test generation in the clusters samples. + + Multiclass case. + """ + oversampler = clone(oversampler) + + X_in_cluster = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).reshape(-1, 1) + y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2]) + + # First class + cluster_sampling_strategy = {1: 5} + (class_label,) = cluster_sampling_strategy + X_new, y_new = generate_in_cluster(oversampler, None, cluster_sampling_strategy, X_in_cluster, y_in_cluster) + assert len(X_new) == len(y_new) <= cluster_sampling_strategy[1] + np.testing.assert_array_equal(np.unique(X_new), np.array([5.0])) + assert Counter(y_new)[class_label] == cluster_sampling_strategy[1] + + # Second class + cluster_sampling_strategy = {2: 3} + (class_label,) = cluster_sampling_strategy + X_new, y_new = generate_in_cluster(oversampler, None, cluster_sampling_strategy, X_in_cluster, y_in_cluster) + assert len(X_new) == len(y_new) <= cluster_sampling_strategy[2] + assert Counter(y_new)[class_label] <= cluster_sampling_strategy[2] + + +@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS) +def test_fit(oversampler): + """Test fit method. + + Multiclass case. + """ + oversampler = clone(oversampler).fit(X, y) + y_count = Counter(y) + assert hasattr(oversampler, 'sampling_strategy_') + assert hasattr(oversampler, 'oversampler_') + assert hasattr(oversampler, 'clusterer_') + assert hasattr(oversampler, 'distributor_') + assert hasattr(oversampler, 'random_state_') + assert oversampler.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]}) + + +@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS) +def test_fit_resample(oversampler): + """Test fit and resample method. + + Multiclass case. 
+ """ + oversampler = clone(oversampler) + oversampler.fit_resample(X, y) + assert hasattr(oversampler, 'sampling_strategy_') + assert hasattr(oversampler, 'oversampler_') + assert hasattr(oversampler, 'clusterer_') + assert hasattr(oversampler, 'distributor_') + assert hasattr(oversampler, 'random_state_') + assert hasattr(oversampler.distributor_, 'intra_distribution_') + assert hasattr(oversampler.distributor_, 'inter_distribution_') + + +@pytest.mark.parametrize( + ("X", "y", "oversampler"), + [ + ( + np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0), (4.0, 4.0)]), + np.array([0, 0, 1, 1, 1]), + ClusterOverSampler(oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE)), + ), + ( + np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0), (4.0, 4.0)]), + np.array([0, 0, 1, 1, 1]), + ClusterOverSampler( + oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE), + clusterer=CLUSTERER.set_params(n_clusters=3), + random_state=RANDOM_STATE, + ), + ), + ], +) +def test_fit_resample_intra_corner_cases(X, y, oversampler): + """Test fit and resample method. + + Corner cases. + """ + X_res, y_res = oversampler.fit_resample(X, y) + y_count = Counter(y_res) + assert y_count[0] == y_count[1] + assert X.item(0, 0) <= X_res.item(-1, 0) <= X.item(1, 0) + assert X.item(0, 1) <= X_res.item(-1, 1) <= X.item(1, 1) + + +@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS) +def test_raise_error_fit_resample(oversampler): + """Test raise of error. + + No samples are generated case. + """ + oversampler = clone(oversampler) + oversampler.set_params( + clusterer=CLUSTERER.set_params(n_clusters=2), + distributor=DensityDistributor(filtering_threshold=0.1), + ) + with pytest.raises( + ValueError, + match='No samples were generated. Try to modify the parameters of the clusterer or distributor.', + ): + oversampler.fit_resample(X, y) + + +@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS) +def test_display_warning_fit_resample(oversampler): + """Test display warning. + + No samples are generated case. + """ + oversampler = clone(oversampler) + oversampler.set_params( + clusterer=CLUSTERER.set_params(n_clusters=2), + distributor=DensityDistributor(filtering_threshold=0.1), + raise_error=False, + ) + with pytest.warns( + FitFailedWarning, + match='No samples were generated. Try to modify the parameters of the clusterer or distributor.', + ): + oversampler.fit_resample(X, y) + + +@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS) +def test_two_majority_classes(oversampler): + """Test fit and resample method. + + Two majority classes case. 
+ """ + oversampler = clone(oversampler) + + label_mapper = { + 0: 13, + 1: 1, + 2: 5, + 3: 7, + 4: 3, + 5: 10, + 6: 6, + 7: 8, + 8: 9, + 9: 11, + 10: 4, + } + + X, y = make_classification( + n_samples=19 * len(label_mapper), + n_classes=len(label_mapper), + n_informative=30, + n_features=145, + random_state=42, + ) + + y = np.array([label_mapper[i] for i in y]) + + oversampler.fit_resample(X, y) diff --git a/tests/clover/over_sampling/test_gsomo.py b/tests/clover/over_sampling/test_gsomo.py new file mode 100644 index 0000000..b21e60c --- /dev/null +++ b/tests/clover/over_sampling/test_gsomo.py @@ -0,0 +1,180 @@ +"""Test the _gsomo module.""" + +from collections import Counter, OrderedDict +from math import sqrt + +import pytest +from imblearn_extra.clover.clusterer import SOM +from imblearn_extra.clover.distribution import DensityDistributor +from imblearn_extra.clover.over_sampling import GeometricSOMO +from imblearn_extra.gsmote import GeometricSMOTE +from sklearn.base import clone +from sklearn.cluster import AgglomerativeClustering +from sklearn.datasets import make_classification + +RANDOM_STATE = 11 +X, y = make_classification( + random_state=RANDOM_STATE, + n_classes=3, + n_samples=5000, + n_features=10, + n_clusters_per_class=2, + weights=[0.3, 0.45, 0.25], + n_informative=5, +) +GSOMO_OVERSAMPLER = GeometricSOMO(random_state=RANDOM_STATE) + + +@pytest.mark.parametrize( + ('k_neighbors', 'imbalance_ratio_threshold', 'distances_exponent', 'distribution_ratio'), + [(3, 2.0, 'auto', 0.3), (5, 1.5, 8, 0.5), (8, 'auto', 10, 0.8)], +) +def test_fit(k_neighbors, imbalance_ratio_threshold, distances_exponent, distribution_ratio): + """Test fit method. + + Multiple cases. + """ + # Fit oversampler + params = { + 'k_neighbors': k_neighbors, + 'distribution_ratio': distribution_ratio, + 'distances_exponent': distances_exponent, + 'imbalance_ratio_threshold': imbalance_ratio_threshold, + } + gsomo = clone(GSOMO_OVERSAMPLER).set_params(**params).fit(X, y) + y_count = Counter(y) + + # Assert random state + assert hasattr(gsomo, 'random_state_') + + # Assert oversampler + assert isinstance(gsomo.oversampler_, GeometricSMOTE) + assert gsomo.oversampler_.k_neighbors == gsomo.k_neighbors == k_neighbors + assert gsomo.oversampler_.truncation_factor == gsomo.truncation_factor + assert gsomo.oversampler_.deformation_factor == gsomo.deformation_factor + assert gsomo.oversampler_.selection_strategy == gsomo.selection_strategy + + # Assert clusterer + assert isinstance(gsomo.clusterer_, SOM) + + # Assert distributor + assert isinstance(gsomo.distributor_, DensityDistributor) + assert gsomo.distributor_.filtering_threshold == gsomo.imbalance_ratio_threshold == imbalance_ratio_threshold + assert gsomo.distributor_.distances_exponent == gsomo.distances_exponent == distances_exponent + assert gsomo.distributor_.distribution_ratio == gsomo.distribution_ratio == distribution_ratio + + # Assert sampling strategy + assert gsomo.oversampler_.sampling_strategy == gsomo.sampling_strategy + assert gsomo.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]}) + + +def test_fit_default(): + """Test fit method. + + Default case. 
+ """ + # Fit oversampler + gsomo = clone(GSOMO_OVERSAMPLER).fit(X, y) + + # Create SOM instance with default parameters + som = SOM() + + # Assert clusterer + assert isinstance(gsomo.clusterer_, SOM) + assert gsomo.clusterer_.n_rows == som.n_rows + assert gsomo.clusterer_.n_columns == som.n_columns + + +@pytest.mark.parametrize('n_clusters', [5, 6, 12]) +def test_fit_number_of_clusters(n_clusters): + """Test clusterer of fit method. + + Number of clusters case. + """ + # Fit oversampler + gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=n_clusters).fit(X, y) + + # Assert clusterer + assert isinstance(gsomo.clusterer_, SOM) + assert gsomo.clusterer_.n_rows == round(sqrt(gsomo.som_estimator)) + assert gsomo.clusterer_.n_columns == round(sqrt(gsomo.som_estimator)) + + +@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0]) +def test_fit_proportion_of_samples(proportion): + """Test clusterer of fit method. + + Proportion of samples case. + """ + # Fit oversampler + gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=proportion).fit(X, y) + + # Assert clusterer + assert isinstance(gsomo.clusterer_, SOM) + assert gsomo.clusterer_.n_rows == round(sqrt((X.shape[0] - 1) * gsomo.som_estimator + 1)) + assert gsomo.clusterer_.n_columns == round(sqrt((X.shape[0] - 1) * gsomo.som_estimator + 1)) + + +def test_som_estimator(): + """Test clusterer of fit method. + + Clusterer case. + """ + # Fit oversampler + gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=SOM()).fit(X, y) + + # Define som estimator + som = SOM() + + # Assert clusterer + assert isinstance(gsomo.clusterer_, type(som)) + assert gsomo.clusterer_.n_rows == som.n_rows + assert gsomo.clusterer_.n_columns == som.n_columns + + +@pytest.mark.parametrize('som_estimator', [-3, 0]) +def test_raise_value_error_fit_integer(som_estimator): + """Test fit method. + + Integer values as estimators error case. + """ + with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be >= 1.'): + clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +@pytest.mark.parametrize('som_estimator', [-1.5, 2.0]) +def test_raise_value_error_fit_float(som_estimator): + """Test fit method. + + Float values as estimators error case. + """ + with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be'): + clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +@pytest.mark.parametrize('som_estimator', [AgglomerativeClustering, [3, 5]]) +def test_raise_type_error_fit(som_estimator): + """Test fit method. + + Not SOMO clusterer error case. + """ + with pytest.raises(TypeError, match='Parameter `som_estimator` should be'): + clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +def test_fit_resample(): + """Test fit and resample method. + + Default case. 
+ """ + # Fit oversampler + gsomo = clone(GSOMO_OVERSAMPLER) + _, y_res = gsomo.fit_resample(X, y) + + # Assert clusterer is fitted + assert hasattr(gsomo.clusterer_, 'labels_') + assert hasattr(gsomo.clusterer_, 'neighbors_') + + # Assert distributor is fitted + assert hasattr(gsomo.distributor_, 'intra_distribution_') + assert hasattr(gsomo.distributor_, 'inter_distribution_') diff --git a/tests/clover/over_sampling/test_kmeans_smote.py b/tests/clover/over_sampling/test_kmeans_smote.py new file mode 100644 index 0000000..cc862a3 --- /dev/null +++ b/tests/clover/over_sampling/test_kmeans_smote.py @@ -0,0 +1,168 @@ +"""Test the _kmeans_smote module.""" + +from collections import Counter, OrderedDict + +import pytest +from imblearn.over_sampling import SMOTE +from imblearn_extra.clover.distribution import DensityDistributor +from imblearn_extra.clover.over_sampling import KMeansSMOTE +from sklearn.base import clone +from sklearn.cluster import AgglomerativeClustering, KMeans, MiniBatchKMeans +from sklearn.datasets import make_classification + +RANDOM_STATE = 1 +X, y = make_classification( + random_state=RANDOM_STATE, + n_classes=3, + n_samples=5000, + n_features=10, + n_clusters_per_class=2, + weights=[0.25, 0.45, 0.3], + n_informative=5, +) +KMEANS_SMOTE_OVERSAMPLER = KMeansSMOTE(random_state=RANDOM_STATE) + + +@pytest.mark.parametrize( + ('k_neighbors', 'imbalance_ratio_threshold', 'distances_exponent'), + [(3, 2.0, 'auto'), (5, 1.5, 8), (8, 'auto', 10)], +) +def test_fit(k_neighbors, imbalance_ratio_threshold, distances_exponent): + """Test fit method. + + Multiple cases. + """ + # Fit oversampler + params = { + 'k_neighbors': k_neighbors, + 'imbalance_ratio_threshold': imbalance_ratio_threshold, + 'distances_exponent': distances_exponent, + } + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(**params).fit(X, y) + y_count = Counter(y) + + # Assert random state + assert hasattr(kmeans_smote, 'random_state_') + + # Assert oversampler + assert isinstance(kmeans_smote.oversampler_, SMOTE) + assert kmeans_smote.oversampler_.k_neighbors == kmeans_smote.k_neighbors == k_neighbors + + # Assert clusterer + assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans) + + # Assert distributor + assert isinstance(kmeans_smote.distributor_, DensityDistributor) + assert ( + kmeans_smote.distributor_.filtering_threshold + == kmeans_smote.imbalance_ratio_threshold + == imbalance_ratio_threshold + ) + assert kmeans_smote.distributor_.distances_exponent == kmeans_smote.distances_exponent == distances_exponent + + # Assert sampling strategy + assert kmeans_smote.oversampler_.sampling_strategy == kmeans_smote.sampling_strategy + assert kmeans_smote.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]}) + + +def test_fit_default(): + """Test fit method. + + Default case. + """ + # Fit oversampler + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).fit(X, y) + + # Assert clusterer + assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans) + assert kmeans_smote.clusterer_.n_clusters == MiniBatchKMeans().n_clusters + + +@pytest.mark.parametrize('n_clusters', [5, 6, 12]) +def test_fit_number_of_clusters(n_clusters): + """Test fit method. + + Number of clusters case. 
+ """ + # Fit oversampler + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=n_clusters).fit(X, y) + + # Assert clusterer + assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans) + assert kmeans_smote.clusterer_.n_clusters == n_clusters + + +@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0]) +def test_fit_proportion_of_samples(proportion): + """Test fit method. + + Proportion of samples case. + """ + # Fit oversampler + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=proportion).fit(X, y) + + # Assert clusterer + assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans) + assert kmeans_smote.clusterer_.n_clusters == round((len(X) - 1) * proportion + 1) + + +@pytest.mark.parametrize('kmeans_estimator', [KMeans(), MiniBatchKMeans()]) +def test_fit_kmeans_estimator(kmeans_estimator): + """Test fit method. + + KMeans estimator case. + """ + # Fit oversampler + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y) + + # Assert clusterer + assert isinstance(kmeans_smote.clusterer_, type(kmeans_estimator)) + assert kmeans_smote.clusterer_.n_clusters == kmeans_estimator.n_clusters + + +@pytest.mark.parametrize('kmeans_estimator', [-3, 0]) +def test_raise_value_error_fit_integer(kmeans_estimator): + """Test fit method. + + Integer values as estimators error case. + """ + with pytest.raises(ValueError, match=f'kmeans_estimator == {kmeans_estimator}, must be >= 1.'): + clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y) + + +@pytest.mark.parametrize('kmeans_estimator', [-1.5, 2.0]) +def test_raise_value_error_fit_float(kmeans_estimator): + """Test fit method. + + Float values as estimators error case. + """ + with pytest.raises(ValueError, match=f'kmeans_estimator == {kmeans_estimator}, must be'): + clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y) + + +@pytest.mark.parametrize('kmeans_estimator', [AgglomerativeClustering(), [3, 5]]) +def test_raise_type_error_fit(kmeans_estimator): + """Test fit method. + + Not KMeans clusterer error case. + """ + with pytest.raises(TypeError, match='Parameter `kmeans_estimator` should be'): + clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y) + + +def test_fit_resample(): + """Test fit and resample method. + + Default case. 
+ """ + # Fit oversampler + kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER) + _, y_res = kmeans_smote.fit_resample(X, y) + + # Assert clusterer is fitted + assert hasattr(kmeans_smote.clusterer_, 'labels_') + assert not hasattr(kmeans_smote.clusterer_, 'neighbors_') + + # Assert distributor is fitted + assert hasattr(kmeans_smote.distributor_, 'intra_distribution_') + assert hasattr(kmeans_smote.distributor_, 'inter_distribution_') diff --git a/tests/clover/over_sampling/test_somo.py b/tests/clover/over_sampling/test_somo.py new file mode 100644 index 0000000..c35c255 --- /dev/null +++ b/tests/clover/over_sampling/test_somo.py @@ -0,0 +1,174 @@ +"""Test the _somo module.""" + +from collections import Counter, OrderedDict +from math import sqrt + +import pytest +from imblearn.over_sampling import SMOTE +from imblearn_extra.clover.clusterer import SOM +from imblearn_extra.clover.distribution import DensityDistributor +from imblearn_extra.clover.over_sampling import SOMO +from sklearn.base import clone +from sklearn.cluster import AgglomerativeClustering +from sklearn.datasets import make_classification + +RANDOM_STATE = 2 +X, y = make_classification( + random_state=RANDOM_STATE, + n_classes=3, + n_samples=5000, + n_features=10, + n_clusters_per_class=2, + weights=[0.25, 0.45, 0.3], + n_informative=5, +) +SOMO_OVERSAMPLER = SOMO(random_state=RANDOM_STATE) + + +@pytest.mark.parametrize( + ('k_neighbors', 'distribution_ratio'), + [(3, 0.2), (5, 0.5), (8, 0.6)], +) +def test_fit(k_neighbors, distribution_ratio): + """Test fit method. + + Multiple cases. + """ + # Fit oversampler + params = {'k_neighbors': k_neighbors, 'distribution_ratio': distribution_ratio} + somo = clone(SOMO_OVERSAMPLER).set_params(**params).fit(X, y) + y_count = Counter(y) + + # Assert random state + assert hasattr(somo, 'random_state_') + + # Assert oversampler + assert isinstance(somo.oversampler_, SMOTE) + assert somo.oversampler_.k_neighbors == somo.k_neighbors == k_neighbors + + # Assert clusterer + assert isinstance(somo.clusterer_, SOM) + + # Assert distributor + filtering_threshold = 1.0 + distances_exponent = 2 + assert isinstance(somo.distributor_, DensityDistributor) + assert somo.distributor_.filtering_threshold == filtering_threshold + assert somo.distributor_.distances_exponent == distances_exponent + assert somo.distributor_.distribution_ratio == somo.distribution_ratio == distribution_ratio + + # Assert sampling strategy + assert somo.oversampler_.sampling_strategy == somo.sampling_strategy + assert somo.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]}) + + +def test_fit_default(): + """Test fit method. + + Default case. + """ + # Fit oversampler + somo = clone(SOMO_OVERSAMPLER).fit(X, y) + + # Create SOM instance with default parameters + som = SOM() + + # Assert clusterer + assert isinstance(somo.clusterer_, SOM) + assert somo.clusterer_.n_rows == som.n_rows + assert somo.clusterer_.n_columns == som.n_columns + + +@pytest.mark.parametrize('n_clusters', [5, 6, 12]) +def test_fit_number_of_clusters(n_clusters): + """Test fit method. + + Number of clusters case. 
+ """ + # Fit oversampler + somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=n_clusters).fit(X, y) + + # Assert clusterer + assert isinstance(somo.clusterer_, SOM) + assert somo.clusterer_.n_rows == round(sqrt(somo.som_estimator)) + assert somo.clusterer_.n_columns == round(sqrt(somo.som_estimator)) + + +@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0]) +def test_fit_proportion_of_samples(proportion): + """Test fit method. + + Proportion of samples case. + """ + # Fit oversampler + somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=proportion).fit(X, y) + + # Assert clusterer + assert isinstance(somo.clusterer_, SOM) + assert somo.clusterer_.n_rows == round(sqrt((X.shape[0] - 1) * somo.som_estimator + 1)) + assert somo.clusterer_.n_columns == round(sqrt((X.shape[0] - 1) * somo.som_estimator + 1)) + + +def test_fit_som_estimator(): + """Test fit method. + + SOM estimator case. + """ + # Fit oversampler + somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=SOM()).fit(X, y) + + # Define som estimator + som = SOM() + + # Assert clusterer + assert isinstance(somo.clusterer_, type(som)) + assert somo.clusterer_.n_rows == som.n_rows + assert somo.clusterer_.n_columns == som.n_columns + + +@pytest.mark.parametrize('som_estimator', [-3, 0]) +def test_raise_value_error_fit_integer(som_estimator): + """Test fit method. + + Integer values as estimators error case. + """ + with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be >= 1.'): + clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +@pytest.mark.parametrize('som_estimator', [-1.5, 2.0]) +def test_raise_value_error_fit_float(som_estimator): + """Test fit method. + + Float values as estimators error case. + """ + with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be'): + clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +@pytest.mark.parametrize('som_estimator', [AgglomerativeClustering(), [3, 5]]) +def test_raise_type_error_fit(som_estimator): + """Test fit method. + + Not SOM clusterer error case. + """ + with pytest.raises(TypeError, match='Parameter `som_estimator` should be'): + clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y) + + +def test_fit_resample(): + """Test fit and resample method. + + Default case. 
+ """ + # Fit oversampler + somo = clone(SOMO_OVERSAMPLER) + _, y_res = somo.fit_resample(X, y) + + # Assert clusterer is fitted + assert hasattr(somo.clusterer_, 'labels_') + assert hasattr(somo.clusterer_, 'neighbors_') + + # Assert distributor is fitted + assert hasattr(somo.distributor_, 'intra_distribution_') + assert hasattr(somo.distributor_, 'inter_distribution_') diff --git a/tests/gsmote/__init__.py b/tests/gsmote/__init__.py new file mode 100644 index 0000000..15928cd --- /dev/null +++ b/tests/gsmote/__init__.py @@ -0,0 +1,7 @@ +"""Tests suite for `gsmote`.""" + +from pathlib import Path + +TESTS_DIR = Path(__file__).parent +TMP_DIR = TESTS_DIR / 'tmp' +FIXTURES_DIR = TESTS_DIR / 'fixtures' diff --git a/tests/test_geometric_smote.py b/tests/gsmote/test_geometric_smote.py similarity index 99% rename from tests/test_geometric_smote.py rename to tests/gsmote/test_geometric_smote.py index 16ccdc3..2f74a0f 100644 --- a/tests/test_geometric_smote.py +++ b/tests/gsmote/test_geometric_smote.py @@ -4,7 +4,7 @@ import numpy as np import pytest -from gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample +from imblearn_extra.gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample from numpy.linalg import norm from scipy import sparse from sklearn.datasets import make_classification @@ -123,7 +123,7 @@ def test_make_geometric_sample_half_hypersphere(surface_point, deformation_facto ('center', 'surface_point', 'truncation_factor'), [ (center, surface_point, truncation_factor) - for center, surface_point in zip(CENTERS, SURFACE_POINTS) + for center, surface_point in zip(CENTERS, SURFACE_POINTS, strict=False) for truncation_factor in TRUNCATION_FACTORS ], ) From 8c25d2a623a7976dea86e6e07db5cc1f45a179e4 Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 11:53:33 +0300 Subject: [PATCH 2/8] chore: Sort imports --- docs/examples/applications/plot_mnist_example.py | 3 ++- docs/examples/plot_cluster_oversamplers.py | 3 ++- docs/examples/plot_gsmote_data_generation.py | 3 ++- docs/examples/plot_gsmote_validation_curves.py | 3 ++- docs/examples/plot_kmeans_smote.py | 3 ++- tests/clover/clusterer/test_som.py | 3 ++- tests/clover/distribution/test_base.py | 3 ++- tests/clover/distribution/test_density.py | 3 ++- tests/clover/over_sampling/test_cluster.py | 13 +++++++------ tests/clover/over_sampling/test_gsomo.py | 7 ++++--- tests/clover/over_sampling/test_kmeans_smote.py | 5 +++-- tests/clover/over_sampling/test_somo.py | 7 ++++--- tests/gsmote/test_geometric_smote.py | 3 ++- 13 files changed, 36 insertions(+), 23 deletions(-) diff --git a/docs/examples/applications/plot_mnist_example.py b/docs/examples/applications/plot_mnist_example.py index c86c0e6..036b58a 100644 --- a/docs/examples/applications/plot_mnist_example.py +++ b/docs/examples/applications/plot_mnist_example.py @@ -17,11 +17,12 @@ from imblearn.datasets import make_imbalance from imblearn.over_sampling import SMOTE from imblearn.pipeline import Pipeline -from imblearn_extra.gsmote import GeometricSMOTE from sklearn.datasets import fetch_openml from sklearn.ensemble import RandomForestClassifier from sklearn.model_selection import train_test_split +from imblearn_extra.gsmote import GeometricSMOTE + def plot_mnist_samples(X, y, title=None, n_subplots=None): if n_subplots is None: diff --git a/docs/examples/plot_cluster_oversamplers.py b/docs/examples/plot_cluster_oversamplers.py index 0bb62fa..fec8b28 100644 --- a/docs/examples/plot_cluster_oversamplers.py +++ 
b/docs/examples/plot_cluster_oversamplers.py @@ -15,7 +15,6 @@ import pandas as pd from imblearn.over_sampling import SMOTE, BorderlineSMOTE, RandomOverSampler from imblearn.pipeline import make_pipeline -from imblearn_extra.clover.over_sampling import ClusterOverSampler from sklearn.base import clone from sklearn.cluster import AgglomerativeClustering, KMeans from sklearn.datasets import make_classification @@ -23,6 +22,8 @@ from sklearn.metrics import f1_score from sklearn.model_selection import train_test_split +from imblearn_extra.clover.over_sampling import ClusterOverSampler + RANDOM_STATE = 0 OVERSAMPLERS = [ RandomOverSampler(random_state=RANDOM_STATE), diff --git a/docs/examples/plot_gsmote_data_generation.py b/docs/examples/plot_gsmote_data_generation.py index 5f3fb4a..2087a64 100644 --- a/docs/examples/plot_gsmote_data_generation.py +++ b/docs/examples/plot_gsmote_data_generation.py @@ -13,9 +13,10 @@ import matplotlib.pyplot as plt import numpy as np from imblearn.over_sampling import SMOTE -from imblearn_extra.gsmote import GeometricSMOTE from sklearn.datasets import make_blobs +from imblearn_extra.gsmote import GeometricSMOTE + XLIM, YLIM = [-3.0, 3.0], [0.0, 4.0] RANDOM_STATE = 5 diff --git a/docs/examples/plot_gsmote_validation_curves.py b/docs/examples/plot_gsmote_validation_curves.py index 8cc6f27..2afd5ca 100644 --- a/docs/examples/plot_gsmote_validation_curves.py +++ b/docs/examples/plot_gsmote_validation_curves.py @@ -14,13 +14,14 @@ import numpy as np from imblearn.metrics import geometric_mean_score from imblearn.pipeline import make_pipeline -from imblearn_extra.gsmote import GeometricSMOTE from sklearn.datasets import make_classification from sklearn.linear_model import LogisticRegression from sklearn.metrics import make_scorer from sklearn.model_selection import validation_curve from sklearn.tree import DecisionTreeClassifier +from imblearn_extra.gsmote import GeometricSMOTE + RANDOM_STATE = 10 SCORER = make_scorer(geometric_mean_score) diff --git a/docs/examples/plot_kmeans_smote.py b/docs/examples/plot_kmeans_smote.py index 6f96061..f06edc9 100644 --- a/docs/examples/plot_kmeans_smote.py +++ b/docs/examples/plot_kmeans_smote.py @@ -13,13 +13,14 @@ import pandas as pd from imblearn.over_sampling import SMOTE from imblearn.pipeline import make_pipeline -from imblearn_extra.clover.over_sampling import KMeansSMOTE from sklearn.base import clone from sklearn.datasets import make_classification from sklearn.ensemble import GradientBoostingClassifier from sklearn.metrics import f1_score from sklearn.model_selection import train_test_split +from imblearn_extra.clover.over_sampling import KMeansSMOTE + RANDOM_STATE = 2 OVERSAMPLERS = [ SMOTE(random_state=RANDOM_STATE), diff --git a/tests/clover/clusterer/test_som.py b/tests/clover/clusterer/test_som.py index 5dfe0f4..e5055ed 100644 --- a/tests/clover/clusterer/test_som.py +++ b/tests/clover/clusterer/test_som.py @@ -1,9 +1,10 @@ """Test the _som module.""" import numpy as np -from imblearn_extra.clover.clusterer import SOM, extract_topological_neighbors, generate_labels_mapping from sklearn.datasets import make_classification +from imblearn_extra.clover.clusterer import SOM, extract_topological_neighbors, generate_labels_mapping + RANDOM_STATE = 5 X, _ = make_classification(random_state=RANDOM_STATE, n_samples=1000) diff --git a/tests/clover/distribution/test_base.py b/tests/clover/distribution/test_base.py index 1e71f3e..c748860 100644 --- a/tests/clover/distribution/test_base.py +++ 
b/tests/clover/distribution/test_base.py @@ -2,9 +2,10 @@ import numpy as np import pytest -from imblearn_extra.clover.distribution.base import BaseDistributor from sklearn.datasets import make_classification +from imblearn_extra.clover.distribution.base import BaseDistributor + @pytest.mark.parametrize(("n_samples", "n_classes", "weights"), [(20, 2, [0.8, 0.2]), (10, 3, [0.6, 0.2, 0.2])]) def test_fit(n_samples, n_classes, weights): diff --git a/tests/clover/distribution/test_density.py b/tests/clover/distribution/test_density.py index a6f448c..d2c4277 100644 --- a/tests/clover/distribution/test_density.py +++ b/tests/clover/distribution/test_density.py @@ -2,9 +2,10 @@ import numpy as np import pytest -from imblearn_extra.clover.distribution._density import DensityDistributor from sklearn.base import clone +from imblearn_extra.clover.distribution._density import DensityDistributor + X = np.array( [ [1.0, 1.0], diff --git a/tests/clover/over_sampling/test_cluster.py b/tests/clover/over_sampling/test_cluster.py index 5442829..d5d3477 100644 --- a/tests/clover/over_sampling/test_cluster.py +++ b/tests/clover/over_sampling/test_cluster.py @@ -5,6 +5,13 @@ import numpy as np import pytest from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, RandomOverSampler +from sklearn.base import clone +from sklearn.cluster import KMeans +from sklearn.datasets import make_classification +from sklearn.exceptions import FitFailedWarning +from sklearn.neighbors import NearestNeighbors +from sklearn.utils import check_random_state + from imblearn_extra.clover.distribution import DensityDistributor from imblearn_extra.clover.over_sampling import ( ClusterOverSampler, @@ -14,12 +21,6 @@ generate_in_cluster, modify_nn, ) -from sklearn.base import clone -from sklearn.cluster import KMeans -from sklearn.datasets import make_classification -from sklearn.exceptions import FitFailedWarning -from sklearn.neighbors import NearestNeighbors -from sklearn.utils import check_random_state RANDOM_STATE = 1 X, y = make_classification( diff --git a/tests/clover/over_sampling/test_gsomo.py b/tests/clover/over_sampling/test_gsomo.py index b21e60c..db50a20 100644 --- a/tests/clover/over_sampling/test_gsomo.py +++ b/tests/clover/over_sampling/test_gsomo.py @@ -4,13 +4,14 @@ from math import sqrt import pytest +from sklearn.base import clone +from sklearn.cluster import AgglomerativeClustering +from sklearn.datasets import make_classification + from imblearn_extra.clover.clusterer import SOM from imblearn_extra.clover.distribution import DensityDistributor from imblearn_extra.clover.over_sampling import GeometricSOMO from imblearn_extra.gsmote import GeometricSMOTE -from sklearn.base import clone -from sklearn.cluster import AgglomerativeClustering -from sklearn.datasets import make_classification RANDOM_STATE = 11 X, y = make_classification( diff --git a/tests/clover/over_sampling/test_kmeans_smote.py b/tests/clover/over_sampling/test_kmeans_smote.py index cc862a3..ad2398b 100644 --- a/tests/clover/over_sampling/test_kmeans_smote.py +++ b/tests/clover/over_sampling/test_kmeans_smote.py @@ -4,12 +4,13 @@ import pytest from imblearn.over_sampling import SMOTE -from imblearn_extra.clover.distribution import DensityDistributor -from imblearn_extra.clover.over_sampling import KMeansSMOTE from sklearn.base import clone from sklearn.cluster import AgglomerativeClustering, KMeans, MiniBatchKMeans from sklearn.datasets import make_classification +from imblearn_extra.clover.distribution import DensityDistributor +from 
imblearn_extra.clover.over_sampling import KMeansSMOTE + RANDOM_STATE = 1 X, y = make_classification( random_state=RANDOM_STATE, diff --git a/tests/clover/over_sampling/test_somo.py b/tests/clover/over_sampling/test_somo.py index c35c255..adedf43 100644 --- a/tests/clover/over_sampling/test_somo.py +++ b/tests/clover/over_sampling/test_somo.py @@ -5,13 +5,14 @@ import pytest from imblearn.over_sampling import SMOTE -from imblearn_extra.clover.clusterer import SOM -from imblearn_extra.clover.distribution import DensityDistributor -from imblearn_extra.clover.over_sampling import SOMO from sklearn.base import clone from sklearn.cluster import AgglomerativeClustering from sklearn.datasets import make_classification +from imblearn_extra.clover.clusterer import SOM +from imblearn_extra.clover.distribution import DensityDistributor +from imblearn_extra.clover.over_sampling import SOMO + RANDOM_STATE = 2 X, y = make_classification( random_state=RANDOM_STATE, diff --git a/tests/gsmote/test_geometric_smote.py b/tests/gsmote/test_geometric_smote.py index 2f74a0f..5ae28ad 100644 --- a/tests/gsmote/test_geometric_smote.py +++ b/tests/gsmote/test_geometric_smote.py @@ -4,12 +4,13 @@ import numpy as np import pytest -from imblearn_extra.gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample from numpy.linalg import norm from scipy import sparse from sklearn.datasets import make_classification from sklearn.utils import check_random_state +from imblearn_extra.gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample + RND_SEED = 0 RANDOM_STATE = check_random_state(RND_SEED) CENTERS = [ From f4c9df4caf959a9b0df2ed99dfdfa0526396150c Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 12:13:52 +0300 Subject: [PATCH 3/8] fix: Adjust SOM parameters to MiniSom latest version --- src/imblearn_extra/clover/clusterer/_som.py | 11 +++++++---- tests/clover/clusterer/test_som.py | 9 ++++----- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/imblearn_extra/clover/clusterer/_som.py b/src/imblearn_extra/clover/clusterer/_som.py index f5afc11..b4a9c71 100644 --- a/src/imblearn_extra/clover/clusterer/_som.py +++ b/src/imblearn_extra/clover/clusterer/_som.py @@ -9,7 +9,7 @@ import numpy as np import numpy.typing as npt -from minisom import MiniSom, asymptotic_decay +from minisom import MiniSom from sklearn.base import BaseEstimator, ClusterMixin from sklearn.preprocessing import minmax_scale from sklearn.utils import check_array, check_random_state @@ -85,6 +85,8 @@ class SOM(BaseEstimator, ClusterMixin): decay_function: Function that reduces learning_rate and sigma at each iteration. + Possible values: 'inverse_decay_to_zero', 'linear_decay_to_zero', + 'asymptotic_decay' or callable. neighborhood_function: Function that weights the neighborhood of a position in the map. @@ -94,8 +96,9 @@ class SOM(BaseEstimator, ClusterMixin): Topology of the map. Possible values: 'rectangular', 'hexagonal'. activation_distance: - Distance used to activate the map. Possible values: 'euclidean', - 'cosine', 'manhattan', 'chebyshev'. + Distance used to activate the map. + Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev' + or callable. random_state: Control the randomization of the algorithm. 
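
NOTE: a minimal usage sketch of the adjusted API, illustrative only and not
part of the patch. It assumes only the names visible in this diff (the `SOM`
wrapper with its `n_rows`, `n_columns`, `decay_function` and `random_state`
parameters, plus the fitted `labels_` attribute) and a MiniSom release that
resolves decay schedules from the string names documented above:

    from sklearn.datasets import make_classification

    from imblearn_extra.clover.clusterer import SOM

    X, _ = make_classification(n_samples=500, random_state=0)

    # Any schedule documented above can now be passed by name; per the new
    # `str | Callable` annotation, a custom callable is still accepted.
    som = SOM(n_rows=3, n_columns=3, decay_function='linear_decay_to_zero', random_state=0)
    som.fit(X)
    print(som.labels_[:10])  # SOM grid cluster assigned to each sample
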
@@ -114,7 +117,7 @@ def __init__( n_rows: int | None = None, sigma: float = 1.0, learning_rate: float = 0.5, - decay_function: Callable = asymptotic_decay, + decay_function: str | Callable = 'inverse_decay_to_zero', neighborhood_function: str = 'gaussian', topology: str = 'rectangular', activation_distance: str | Callable = 'euclidean', diff --git a/tests/clover/clusterer/test_som.py b/tests/clover/clusterer/test_som.py index e5055ed..3a181e9 100644 --- a/tests/clover/clusterer/test_som.py +++ b/tests/clover/clusterer/test_som.py @@ -16,7 +16,7 @@ def test_generate_labels_mapping(): assert generate_labels_mapping(grid_labels) == labels_mapping -def test_return_topological_neighbors_rectangular(): +def test_extract_topological_neighbors_rectangular(): """Test the topological neighbors of a neuron for rectangular grid type.""" som = SOM(random_state=RANDOM_STATE).fit(X) labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]}) @@ -28,11 +28,10 @@ def test_return_topological_neighbors_rectangular(): (0, 1), (2, 1), (1, 0), - (1, 2), ] -def test_return_topological_neighbors_hexagonal(): +def test_extract_topological_neighbors_hexagonal(): """Test the topological neighbors of a neuron for hexagonal grid type.""" som = SOM(random_state=RANDOM_STATE, topology='hexagonal').fit(X) labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]}) @@ -44,7 +43,6 @@ def test_return_topological_neighbors_hexagonal(): (0, 1), (2, 1), (1, 0), - (1, 2), (2, 2), (2, 0), ] @@ -67,7 +65,8 @@ def test_fit(): n_columns = 3 som = SOM(n_rows=n_rows, n_columns=n_columns, random_state=RANDOM_STATE) som.fit(X) - assert np.array_equal(np.unique(som.labels_), np.arange(0, n_rows * n_columns)) + assert all(np.unique(som.labels_) >= 0) + assert all(np.unique(som.labels_) < n_rows * n_columns) assert som.n_rows_ == n_rows assert som.n_columns_ == n_columns assert hasattr(som, 'neighbors_') From 9c2be6b3bd118555cab4b6c54e9021dde2619b4e Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 12:18:56 +0300 Subject: [PATCH 4/8] chore: Update copier template to v0.11.0 --- .copier-answers.yml | 2 +- .github/workflows/ci-docs.yml | 4 +-- noxfile.py | 2 +- pyproject.toml | 67 +++++++++++++++++------------------ 4 files changed, 37 insertions(+), 38 deletions(-) diff --git a/.copier-answers.yml b/.copier-answers.yml index c3bc6a8..bd702a0 100644 --- a/.copier-answers.yml +++ b/.copier-answers.yml @@ -1,4 +1,4 @@ -_commit: 0.8.2 +_commit: 0.11.0 _src_path: gh:georgedouzas/copier-pdm-nox author_email: gdouzas@icloud.com author_fullname: Georgios Douzas diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 19b1a49..4460edd 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -29,7 +29,7 @@ jobs: 3.10 3.11 3.12 - + - name: Set up PDM uses: pdm-project/setup-pdm@v3 with: @@ -69,7 +69,7 @@ jobs: 3.10 3.11 3.12 - + - name: Set up PDM uses: pdm-project/setup-pdm@v3 with: diff --git a/noxfile.py b/noxfile.py index 76c5972..7f4a9fe 100644 --- a/noxfile.py +++ b/noxfile.py @@ -65,7 +65,7 @@ def formatting(session: nox.Session, file: str) -> None: if session.posargs[0] in ['code', 'all']: session.run('black', file) if session.posargs[0] in ['docstrings', 'all']: - session.run('docformatter', '--in-place', '--recursive', '--close-quotes-on-newline', file) + session.run('docformatter', file) @nox.session(python=PYTHON_VERSIONS) diff --git a/pyproject.toml b/pyproject.toml index 73beb67..e8c788d 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -56,52 +56,50 @@ htmlcov pip-wheel-metadata site __pycache__ docs/generated .nox .ruff_cache pdm. && find . -name '.coverage*' -delete"""} docs = "nox --error-on-external-run -R -s docs -- {args:serve}" formatting = "nox --error-on-external-run -R -s formatting -- {args:all}" -checks = "nox --error-on-external-run -R -s checks -- {args:all}" -tests = "nox --error-on-external-run -R -s tests" +checks = "nox --default-venv-backend uv --error-on-external-run -R -s checks -- {args:all}" +tests = "nox --default-venv-backend uv --error-on-external-run -R -s tests" changelog = "nox --error-on-external-run -R -s changelog" release = "nox --error-on-external-run -R -s release" [tool.pdm.dev-dependencies] maintenance = [ - "nox>=2022.8.7", + "nox[uv]>=2024.4.15", ] docs = [ - "mkdocs>=1.4.2", - "mkdocs-coverage>=0.2.6", - "mkdocs-gen-files>=0.3", - "mkdocs-literate-nav>=0.4", - "mkdocs-material>=7.3", - "mkdocs-gallery>=0.7.6", - "mkdocs-section-index>=0.3", - "mkdocstrings[python]>=0.20", - "markdown-callouts>=0.2", - "markdown-exec>=0.5", - "pandas>=1.5.3", - "matplotlib>=3.7.1", + "mkdocs>=1.6.0", + "mkdocs-coverage>=1.1.0", + "mkdocs-gen-files>=0.5.0", + "mkdocs-literate-nav>=0.6.1", + "mkdocs-material>=9.5.27", + "mkdocs-gallery>=0.10.1", + "mkdocs-section-index>=0.3.9", + "mkdocstrings[python]>=0.25.1", + "markdown-callouts>=0.4.0", + "markdown-exec>=1.9.3", + 'pandas>=2.2.2', ] formatting = [ - "black>=21.10b0", - "docformatter>=1.5.1", + "black>=24.4.2", + "docformatter>=1.7.5", ] checks = [ - "numpy>=1.22", - "ruff>=0.0.237", - "safety>=2", - "mypy>=0.910", - "types-markdown>=3.3", - "types-toml>=0.10", + "ruff>=0.5.0", + "safety>=3.2.3", + "mypy>=1.10.1", + "types-markdown>=3.6.0.20240316", + "types-toml>=0.10.8.20240310", ] tests = [ - "pytest>=6.2", - "pytest-cov>=3.0", - "pytest-randomly>=3.10", - "pytest-xdist>=2.4", + "pytest>=8.2.2", + "pytest-cov>=5.0.0", + "pytest-randomly>=3.15.0", + "pytest-xdist>=3.6.1", ] changelog = [ - "git-changelog>=1.0.0", + "git-changelog>=2.5.2", ] release = [ - "twine>=4.0.0", + "twine>=5.1.1", ] [tool.black] @@ -109,18 +107,19 @@ line-length = 120 extend-exclude = "(tests/fixtures|docs/generated)" skip-string-normalization = true +[tool.docformatter] +in-place = true +recursive = true +close-quotes-on-newline = true +wrap-descriptions = 120 + [tool.ruff] -fix = true extend-exclude = ["docs/generated"] force-exclude = true line-length = 120 target-version = "py310" [tool.ruff.lint] -fixable = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", -"BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", -"INP", "PIE", "T20", "PT", "Q", "RET501", "RET502", "RET503", "SIM", -"PTH", "PD", "PGH", "PL", "TRY", "RUF", "PLE", "PLR", "PLW", "TRY", "RUF"] ignore = ["D202", "N806", "N803", "S101", "INP001", "Q000", "TRY002", "PLR0913", "EXE001", "EXE002", "E741"] select = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S", "BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G", From bf4223d56f406ea23d302c6e7bc2f1409f2c231c Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 13:03:59 +0300 Subject: [PATCH 5/8] style: Apply black modified configuration --- src/imblearn_extra/clover/distribution/_density.py | 6 ++---- src/imblearn_extra/clover/distribution/base.py | 5 ++--- src/imblearn_extra/clover/over_sampling/__init__.py | 3 +-- src/imblearn_extra/gsmote/__init__.py | 3 +-- tests/clover/distribution/test_density.py | 6 ++---- 5 files changed, 8 
insertions(+), 15 deletions(-)

diff --git a/src/imblearn_extra/clover/distribution/_density.py b/src/imblearn_extra/clover/distribution/_density.py
index 11dfdf2..afbeb32 100644
--- a/src/imblearn_extra/clover/distribution/_density.py
+++ b/src/imblearn_extra/clover/distribution/_density.py
@@ -269,8 +269,7 @@ def _intra_distribute(
     ) -> Self:
         """In the clusters distribution.
 
-        Distribute the generated samples in each cluster based on their
-        density.
+        Distribute the generated samples in each cluster based on their density.
         """
 
         # Calculate weights based on density
@@ -302,8 +301,7 @@ def _inter_distribute(
     ) -> Self:
         """Between the clusters distribution.
 
-        Distribute the generated samples between clusters based on their
-        density.
+        Distribute the generated samples between clusters based on their density.
         """
 
         # Identify filtered neighboring clusters
diff --git a/src/imblearn_extra/clover/distribution/base.py b/src/imblearn_extra/clover/distribution/base.py
index 8524648..3541476 100644
--- a/src/imblearn_extra/clover/distribution/base.py
+++ b/src/imblearn_extra/clover/distribution/base.py
@@ -19,9 +19,8 @@
 class BaseDistributor(BaseEstimator):
     """The base class for distributors.
 
-    A distributor sets the proportion of samples to be generated inside
-    each cluster and between clusters. Warning: This class should not be
-    used directly. Use the derive classes instead.
+    A distributor sets the proportion of samples to be generated inside each cluster and between clusters. Warning: This
+    class should not be used directly. Use the derived classes instead.
     """
 
     def _intra_distribute(
diff --git a/src/imblearn_extra/clover/over_sampling/__init__.py b/src/imblearn_extra/clover/over_sampling/__init__.py
index c68bffc..30640c6 100644
--- a/src/imblearn_extra/clover/over_sampling/__init__.py
+++ b/src/imblearn_extra/clover/over_sampling/__init__.py
@@ -1,7 +1,6 @@
 """This module includes classes for clustering-based oversampling.
 
-A general class for clustering-based oversampling as well as specific
-clustering-based oversamplers are provided.
+A general class for clustering-based oversampling as well as specific clustering-based oversamplers are provided.
 """
 
 from ._cluster import (
diff --git a/src/imblearn_extra/gsmote/__init__.py b/src/imblearn_extra/gsmote/__init__.py
index f7a6a6c..df4e810 100644
--- a/src/imblearn_extra/gsmote/__init__.py
+++ b/src/imblearn_extra/gsmote/__init__.py
@@ -1,7 +1,6 @@
 """Implementation of the Geometric SMOTE algorithm.
 
-A geometrically enhanced drop-in replacement for SMOTE. It is compatible
-with scikit-learn and imbalanced-learn.
+A geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn.
 """
 
 from __future__ import annotations
diff --git a/tests/clover/distribution/test_density.py b/tests/clover/distribution/test_density.py
index d2c4277..4cfab85 100644
--- a/tests/clover/distribution/test_density.py
+++ b/tests/clover/distribution/test_density.py
@@ -309,8 +309,7 @@ def test_fit_binary_intra_inter():
 def test_fit_multiclass_intra_inter():
     """Test fit method.
 
-    Multiclass, intra-cluster generation and inter-cluster generation
-    case.
+    Multiclass, intra-cluster generation and inter-cluster generation case.
     """
     distributor = (
         clone(DISTRIBUTOR)
@@ -334,8 +333,7 @@ def test_fit_multiclass_intra_inter_partial_tie():
     """Test fit method.
 
-    Multiclass, intra-cluster generation, inter-cluster generation case
-    and partial tie case.
+ Multiclass, intra-cluster generation, inter-cluster generation case and partial tie case. """ distributor = ( clone(DISTRIBUTOR) From 3e883b06e69d60ee956bf98177afb26117b5e899 Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 13:05:37 +0300 Subject: [PATCH 6/8] chore: Use new safety API --- .github/workflows/ci-docs.yml | 3 +++ .github/workflows/ci.yml | 3 +++ noxfile.py | 2 +- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 4460edd..31cbe06 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -46,6 +46,9 @@ jobs: run: pdm checks types - name: Check vulnerabilities in dependencies + uses: pyupio/safety-action@v1 + with: + api-key: ${{ secrets.SAFETY_API_KEY }} run: pdm checks dependencies tests: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b372b84..ddf046b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,6 +45,9 @@ jobs: run: pdm checks types - name: Check vulnerabilities in dependencies + uses: pyupio/safety-action@v1 + with: + api-key: ${{ secrets.SAFETY_API_KEY }} run: pdm checks dependencies tests: diff --git a/noxfile.py b/noxfile.py index 7f4a9fe..a08f250 100644 --- a/noxfile.py +++ b/noxfile.py @@ -99,7 +99,7 @@ def checks(session: nox.Session, file: str) -> None: requirements_path, ] session.run(*(args + dict(requirements_types)[file]), external=True) - session.run('safety', 'check', '-r', requirements_path) + session.run('safety', 'scan', '-r', requirements_path) @nox.session(python=PYTHON_VERSIONS) From 8aa7f97a6355f090837a9e2ceba8611df7112632 Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 13:12:16 +0300 Subject: [PATCH 7/8] chore: Modify GitHub actions workflow and add dependencies --- .github/workflows/ci-docs.yml | 8 ++------ .github/workflows/ci.yml | 5 ++--- noxfile.py | 6 +++++- pyproject.toml | 2 ++ 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml index 31cbe06..1707d65 100644 --- a/.github/workflows/ci-docs.yml +++ b/.github/workflows/ci-docs.yml @@ -42,13 +42,9 @@ jobs: - name: Check code quality run: pdm checks quality - - name: Check type annotations - run: pdm checks types - - name: Check vulnerabilities in dependencies - uses: pyupio/safety-action@v1 - with: - api-key: ${{ secrets.SAFETY_API_KEY }} + env: + SAFETY_API_KEY: ${{ secrets.SAFETY_API_KEY }} run: pdm checks dependencies tests: diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index ddf046b..e845b59 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -45,9 +45,8 @@ jobs: run: pdm checks types - name: Check vulnerabilities in dependencies - uses: pyupio/safety-action@v1 - with: - api-key: ${{ secrets.SAFETY_API_KEY }} + env: + SAFETY_API_KEY: ${{ secrets.SAFETY_API_KEY }} run: pdm checks dependencies tests: diff --git a/noxfile.py b/noxfile.py index a08f250..ba959aa 100644 --- a/noxfile.py +++ b/noxfile.py @@ -99,7 +99,11 @@ def checks(session: nox.Session, file: str) -> None: requirements_path, ] session.run(*(args + dict(requirements_types)[file]), external=True) - session.run('safety', 'scan', '-r', requirements_path) + if os.environ.get('CI') is not None: + api_key = os.environ.get('SAFETY_API_KEY') + session.run('safety', '--key', api_key, '--stage', 'cicd', 'scan', '-r', requirements_path) + else: + session.run('safety', 'scan', '-r', requirements_path) @nox.session(python=PYTHON_VERSIONS) diff 
--git a/pyproject.toml b/pyproject.toml index e8c788d..8f696f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ docs = [ "markdown-callouts>=0.4.0", "markdown-exec>=1.9.3", 'pandas>=2.2.2', + "matplotlib>=3.9.2", ] formatting = [ "black>=24.4.2", @@ -88,6 +89,7 @@ checks = [ "mypy>=1.10.1", "types-markdown>=3.6.0.20240316", "types-toml>=0.10.8.20240310", + "numpy>=2.1.2", ] tests = [ "pytest>=8.2.2", From 50593c9fd9349c2bcf59c6cc10c7827c6697e618 Mon Sep 17 00:00:00 2001 From: georgedouzas Date: Sat, 26 Oct 2024 19:57:02 +0300 Subject: [PATCH 8/8] chore: Release 0.2.4 --- CHANGELOG.md | 100 +++++++++++++++++---------------------------------- 1 file changed, 33 insertions(+), 67 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f2b229..a5ef5d8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,107 +6,73 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). -## [0.7.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.7.0) - 2024-06-28 +## [0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.4) - 2024-10-26 -[Compare with 0.6.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.6.0...0.7.0) +[Compare with 0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.3...0.2.4) -### Features +### Bug Fixes -- Merge `geometric-smote` and `cluster-over-sampling` projects ([37ea792](https://github.com/georgedouzas/imbalanced-learn-extra/commit/37ea792249d8bc33c50b662f97d41f3ba00711c7) by georgedouzas). +- Adjust SOM parameters to MiniSom latest version ([f4c9df4](https://github.com/georgedouzas/imbalanced-learn-extra/commit/f4c9df4caf959a9b0df2ed99dfdfa0526396150c) by georgedouzas). ### Docs -- Split long docstrings ([d54699c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/d54699cecd6a4c7c96b339a1ab23883dbb7e727f) by georgedouzas). -- Modify expression ([7237221](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7237221689d64a36526ea344e053c68adfff0dd0) by georgedouzas). +- Add information for inactive status ([1516061](https://github.com/georgedouzas/imbalanced-learn-extra/commit/151606150227f581cb0b92e124e6f5e823a09a1c) by georgedouzas). -### Chore +### Style -- Update copier template ([9b9ae8c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/9b9ae8cc8b824a84b6134c94cb1365dd532242d1) by georgedouzas). +- Apply black modified configuration ([bf4223d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/bf4223d56f406ea23d302c6e7bc2f1409f2c231c) by georgedouzas). -## [0.6.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.6.0) - 2023-10-02 +### Chore -[Compare with 0.5.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.5.1...0.6.0) +- Modify GitHub actions workflow and add dependencies ([8aa7f97](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8aa7f97a6355f090837a9e2ceba8611df7112632) by georgedouzas). +- Use new safety API ([3e883b0](https://github.com/georgedouzas/imbalanced-learn-extra/commit/3e883b06e69d60ee956bf98177afb26117b5e899) by georgedouzas). +- Update copier template to v0.11.0 ([9c2be6b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/9c2be6b3bd118555cab4b6c54e9021dde2619b4e) by georgedouzas). +- Sort imports ([8c25d2a](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8c25d2a623a7976dea86e6e07db5cc1f45a179e4) by georgedouzas). 
+- Merged with geometric-smote repo ([f7df642](https://github.com/georgedouzas/imbalanced-learn-extra/commit/f7df6427d0c69e20e3616773722263709c4061d9) by georgedouzas). -### Features +## [0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.3) - 2023-12-02 -- Include SOM in `clusterer` module ([4c6a8af](https://github.com/georgedouzas/imbalanced-learn-extra/commit/4c6a8af3fd89ee272487fdb807fb9347f0fcb55c) by georgedouzas). +[Compare with 0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.2...0.2.3) ### Docs -- Remove geometric-smote as optional dependency ([24968c2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/24968c272f11cec3ffe5551c028a99c6e0cd14bc) by georgedouzas). -- Silence warning for initialization ([ed4fed2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ed4fed22c5ff6b8c30e66a57806bb8468362380f) by georgedouzas). - -### Tests - -- Update tests and fix bugs ([e81c837](https://github.com/georgedouzas/imbalanced-learn-extra/commit/e81c837738f34bd326bf35de003e561cc3466146) by georgedouzas). +- Fix scikit-learn link ([3d74b1b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/3d74b1b3443045c7bfb58b2e9520427a2cfc78af) by georgedouzas). ### Chore -- Release 0.6.0 ([a5040dc](https://github.com/georgedouzas/imbalanced-learn-extra/commit/a5040dc5db70c2eabf939de435b4130330d0724e) by georgedouzas). -- Ignore PDM local config file ([ed0cca2](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ed0cca2d4f8adc50119bb205071443a67c52d75b) by georgedouzas). -- Install conda to tests jobs ([02fce3d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/02fce3ddba653ef0d8bfbc57233012cf95168de1) by georgedouzas). -- Update copier template ([5680a3e](https://github.com/georgedouzas/imbalanced-learn-extra/commit/5680a3e41a075a2101ec4570b8d2416a6709d332) by georgedouzas). -- Use conda backend ([1253427](https://github.com/georgedouzas/imbalanced-learn-extra/commit/12534274eb82abe9ade5f3a60a946114125c7489) by georgedouzas). - -## [0.5.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.5.1) - 2023-04-13 +- Release 0.2.3 ([7b96a07](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7b96a0712a14274c4f37dc9c44acd4ac57417b4a) by georgedouzas). -[Compare with 0.4.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.4.0...0.5.1) +## [0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.2) - 2023-12-02 -### Features - -- Restructure project using copier template ([57293ae](https://github.com/georgedouzas/imbalanced-learn-extra/commit/57293aee12332ae0ca7f8b8f862e8b143d11bed6) by georgedouzas). +[Compare with 0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.1...0.2.2) ### Docs -- Update changelog for version 0.5.0 ([375e79e](https://github.com/georgedouzas/imbalanced-learn-extra/commit/375e79e3bbc95d175f68d8a4ac1e79e628aaaeeb) by georgedouzas). -- Fix example ([438d6a5](https://github.com/georgedouzas/imbalanced-learn-extra/commit/438d6a579d8bc9d175e1fc5656fe38acc4868253) by georgedouzas). - -### Style - -- Add a blank line after docstring ([ce7524d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/ce7524d5db989c672c7e2994ce33b8a18847c20f) by georgedouzas). - -### Code Refactoring - -- Add stack level for warning ([c5925db](https://github.com/georgedouzas/imbalanced-learn-extra/commit/c5925db18cdff1a45852bd0851b2a567daa481a1) by georgedouzas). 
+- Fix typo ([84f4738](https://github.com/georgedouzas/imbalanced-learn-extra/commit/84f4738bcf3d28b342c7d7ddd07e0d32856d25e3) by georgedouzas). ### Chore -- Update copier template ([d20c7dc](https://github.com/georgedouzas/imbalanced-learn-extra/commit/d20c7dc9ec73127cd41ecda69505b5b8adb07f23) by georgedouzas). -- Release 0.5.0 ([080a2bb](https://github.com/georgedouzas/imbalanced-learn-extra/commit/080a2bb01f9315bf876f71463625ceee1e89db3d) by georgedouzas). +- Release 0.2.2 ([7aa8d9c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7aa8d9c94372b83195b79ba357a21d74ec1aa1a6) by georgedouzas). -## [0.4.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.4.0) - 2023-02-16 +## [0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.1) - 2023-12-02 -[Compare with 0.2.5](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.5...0.4.0) - -## [0.2.5](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.5) - 2020-07-24 - -[Compare with 0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.4...0.2.5) - -## [0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.4) - 2020-07-21 - -[Compare with 0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.3...0.2.4) - -## [0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.3) - 2020-07-21 - -[Compare with 0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.2...0.2.3) - -## [0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.2) - 2020-04-08 - -[Compare with 0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.1...0.2.2) +[Compare with 0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.0...0.2.1) -## [0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.1) - 2020-04-08 +### Chore -[Compare with 0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.0...0.2.1) +- Release 0.2.1 ([8734fb6](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8734fb609f65586c007f2f9d8cea8915d10625fe) by georgedouzas). +- Remove Python 3.9 from CI ([c1aeb9b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/c1aeb9be9e4004ced4c50627ca0284d4c71dc6f7) by georgedouzas). +- Restructure project with copier template ([73b3280](https://github.com/georgedouzas/imbalanced-learn-extra/commit/73b32804165ef6875382c85c42a5000ad27e53b4) by georgedouzas). 
-## [0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.0) - 2020-04-07 +## [0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.0) - 2022-03-12 -[Compare with 0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.2...0.2.0) +[Compare with 0.1.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.3...0.2.0) -## [0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.2) - 2020-03-30 +## [0.1.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.3) - 2019-12-13 -[Compare with 0.1.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.1...0.1.2) +[Compare with 0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.2...0.1.3) -## [0.1.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.1) - 2019-08-17 +## [0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.2) - 2019-07-09 -[Compare with first commit](https://github.com/georgedouzas/imbalanced-learn-extra/compare/e209568f6d0b02df1f1d06d5e79ba2300f2f4d23...0.1.1) +[Compare with first commit](https://github.com/georgedouzas/imbalanced-learn-extra/compare/801d7f49ebce70a48a7d9e30d5820765b5a1d511...0.1.2)