diff --git a/.copier-answers.yml b/.copier-answers.yml
index 43648d6..bd702a0 100644
--- a/.copier-answers.yml
+++ b/.copier-answers.yml
@@ -1,18 +1,16 @@
-_commit: 0.8.0
-_src_path: gh:georgedouzas/copier-pdm-nox.git
+_commit: 0.11.0
+_src_path: gh:georgedouzas/copier-pdm-nox
author_email: gdouzas@icloud.com
author_fullname: Georgios Douzas
author_username: georgedouzas
-copyright_date: '2019'
+copyright_date: '2021'
copyright_holder: Georgios Douzas
copyright_holder_email: gdouzas@icloud.com
copyright_license: MIT License
-project_description: Implementation of the Geometric SMOTE algorithm, a geometrically
- enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and
- imbalanced-learn.
-python_package_distribution_name: geometric-smote
-python_package_import_name: gsmote
-python_versions: '>=3.9, <3.12'
-repository_name: geometric-smote
+project_description: An implementation of novel oversampling algorithms.
+python_package_distribution_name: imbalanced-learn-extra
+python_package_import_name: imblearn_extra
+python_versions: '>=3.10, <3.13'
+repository_name: imbalanced-learn-extra
repository_namespace: georgedouzas
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml
index b47675e..17a3b64 100644
--- a/.github/ISSUE_TEMPLATE/bug_report.yaml
+++ b/.github/ISSUE_TEMPLATE/bug_report.yaml
@@ -7,7 +7,7 @@ body:
attributes:
value: >
**Before submitting a bug, please make sure the issue hasn't been already addressed by searching
- through [the past issues](https://github.com/georgedouzas/geometric-smote/issues).**
+ through [the past issues](https://github.com/georgedouzas/imbalanced-learn-extra/issues).**
- type: textarea
attributes:
label: Describe the bug
@@ -52,7 +52,7 @@ body:
description: |
Please provide the following information.
placeholder: >
- `geometric-smote` version
+ `imbalanced-learn-extra` version
Python version
OS
validations:
diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index b2a0071..5c247df 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,11 +1,11 @@
blank_issues_enabled: true
contact_links:
- name: Discussions
- url: https://github.com/georgedouzas/geometric-smote/discussions
+ url: https://github.com/georgedouzas/imbalanced-learn-extra/discussions
about: Ask questions and discuss with other community members
- name: Gitter
- url: https://gitter.im/geometric-smote/community
+ url: https://gitter.im/imbalanced-learn-extra/community
about: Users and developers can sometimes be found on the gitter channel
- name: Blank issue
- url: https://github.com/georgedouzas/geometric-smote/issues/new
+ url: https://github.com/georgedouzas/imbalanced-learn-extra/issues/new
about: Please note that Github Discussions should be used in most cases instead
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index f6c15e7..c316b9d 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -4,7 +4,7 @@
Please check the following:
-- [ ] You have checked the [Pull Request guidelines](https://github.com/georgedouzas/geometric-smote/blob/master/.github/CONTRIBUTING.md).
+- [ ] You have checked the [Pull Request guidelines](https://github.com/georgedouzas/imbalanced-learn-extra/blob/master/.github/CONTRIBUTING.md).
- [ ] Tests for bug fixes or new features have been added.
- [ ] Docs have been added or updated.
diff --git a/.github/workflows/ci-docs.yml b/.github/workflows/ci-docs.yml
index 4eee97e..1707d65 100644
--- a/.github/workflows/ci-docs.yml
+++ b/.github/workflows/ci-docs.yml
@@ -15,7 +15,6 @@ jobs:
os:
- ubuntu-latest
- macos-latest
- - windows-latest
runs-on: ${{ matrix.os }}
@@ -29,8 +28,8 @@ jobs:
python-version: |
3.10
3.11
+ 3.12
-
- name: Set up PDM
uses: pdm-project/setup-pdm@v3
with:
@@ -43,10 +42,9 @@ jobs:
- name: Check code quality
run: pdm checks quality
- - name: Check type annotations
- run: pdm checks types
-
- name: Check vulnerabilities in dependencies
+ env:
+ SAFETY_API_KEY: ${{ secrets.SAFETY_API_KEY }}
run: pdm checks dependencies
tests:
@@ -56,7 +54,6 @@ jobs:
os:
- ubuntu-latest
- macos-latest
- - windows-latest
runs-on: ${{ matrix.os }}
@@ -70,8 +67,8 @@ jobs:
python-version: |
3.10
3.11
+ 3.12
-
- name: Set up PDM
uses: pdm-project/setup-pdm@v3
with:
@@ -90,7 +87,7 @@ jobs:
strategy:
matrix:
- python-version: ['3.10', '3.11']
+ python-version: ['3.10', '3.11', '3.12']
steps:
- name: Checkout
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index d68cd38..e845b59 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -14,7 +14,6 @@ jobs:
os:
- ubuntu-latest
- macos-latest
- - windows-latest
runs-on: ${{ matrix.os }}
@@ -28,6 +27,7 @@ jobs:
python-version: |
3.10
3.11
+ 3.12
- name: Set up PDM
uses: pdm-project/setup-pdm@v3
@@ -45,6 +45,8 @@ jobs:
run: pdm checks types
- name: Check vulnerabilities in dependencies
+ env:
+ SAFETY_API_KEY: ${{ secrets.SAFETY_API_KEY }}
run: pdm checks dependencies
tests:
@@ -54,7 +56,6 @@ jobs:
os:
- ubuntu-latest
- macos-latest
- - windows-latest
runs-on: ${{ matrix.os }}
@@ -68,6 +69,7 @@ jobs:
python-version: |
3.10
3.11
+ 3.12
- name: Set up PDM
uses: pdm-project/setup-pdm@v3
diff --git a/.gitignore b/.gitignore
index 6285262..0bbd589 100644
--- a/.gitignore
+++ b/.gitignore
@@ -20,3 +20,4 @@ __pypackages__/
.ruff_cache
.vscode
.DS_Store
+.tool-versions
\ No newline at end of file
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 28c65de..a5ef5d8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,45 +6,73 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
-## [0.2.3](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.3) - 2023-12-02
+## [0.2.4](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.4) - 2024-10-26
-[Compare with 0.2.2](https://github.com/georgedouzas/geometric-smote/compare/0.2.2...0.2.3)
+[Compare with 0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.3...0.2.4)
+
+### Bug Fixes
+
+- Adjust SOM parameters to MiniSom latest version ([f4c9df4](https://github.com/georgedouzas/imbalanced-learn-extra/commit/f4c9df4caf959a9b0df2ed99dfdfa0526396150c) by georgedouzas).
### Docs
-- Fix scikit-learn link ([26d9c99](https://github.com/georgedouzas/geometric-smote/commit/26d9c993102677e55d134bb8eabce022188b283a) by georgedouzas).
+- Add information for inactive status ([1516061](https://github.com/georgedouzas/imbalanced-learn-extra/commit/151606150227f581cb0b92e124e6f5e823a09a1c) by georgedouzas).
+
+### Style
+
+- Apply black modified configuration ([bf4223d](https://github.com/georgedouzas/imbalanced-learn-extra/commit/bf4223d56f406ea23d302c6e7bc2f1409f2c231c) by georgedouzas).
+
+### Chore
+
+- Modify GitHub actions workflow and add dependencies ([8aa7f97](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8aa7f97a6355f090837a9e2ceba8611df7112632) by georgedouzas).
+- Use new safety API ([3e883b0](https://github.com/georgedouzas/imbalanced-learn-extra/commit/3e883b06e69d60ee956bf98177afb26117b5e899) by georgedouzas).
+- Update copier template to v0.11.0 ([9c2be6b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/9c2be6b3bd118555cab4b6c54e9021dde2619b4e) by georgedouzas).
+- Sort imports ([8c25d2a](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8c25d2a623a7976dea86e6e07db5cc1f45a179e4) by georgedouzas).
+- Merged with geometric-smote repo ([f7df642](https://github.com/georgedouzas/imbalanced-learn-extra/commit/f7df6427d0c69e20e3616773722263709c4061d9) by georgedouzas).
+
+## [0.2.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.3) - 2023-12-02
+
+[Compare with 0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.2...0.2.3)
+
+### Docs
+
+- Fix scikit-learn link ([3d74b1b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/3d74b1b3443045c7bfb58b2e9520427a2cfc78af) by georgedouzas).
+
+### Chore
+
+- Release 0.2.3 ([7b96a07](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7b96a0712a14274c4f37dc9c44acd4ac57417b4a) by georgedouzas).
-## [0.2.2](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.2) - 2023-12-02
+## [0.2.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.2) - 2023-12-02
-[Compare with 0.2.1](https://github.com/georgedouzas/geometric-smote/compare/0.2.1...0.2.2)
+[Compare with 0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.1...0.2.2)
### Docs
-- Fix typo ([84f4738](https://github.com/georgedouzas/geometric-smote/commit/84f4738bcf3d28b342c7d7ddd07e0d32856d25e3) by georgedouzas).
+- Fix typo ([84f4738](https://github.com/georgedouzas/imbalanced-learn-extra/commit/84f4738bcf3d28b342c7d7ddd07e0d32856d25e3) by georgedouzas).
### Chore
-- Release 0.2.2 ([7aa8d9c](https://github.com/georgedouzas/geometric-smote/commit/7aa8d9c94372b83195b79ba357a21d74ec1aa1a6) by georgedouzas).
+- Release 0.2.2 ([7aa8d9c](https://github.com/georgedouzas/imbalanced-learn-extra/commit/7aa8d9c94372b83195b79ba357a21d74ec1aa1a6) by georgedouzas).
-## [0.2.1](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.1) - 2023-12-02
+## [0.2.1](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.1) - 2023-12-02
-[Compare with 0.2.0](https://github.com/georgedouzas/geometric-smote/compare/0.2.0...0.2.1)
+[Compare with 0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.2.0...0.2.1)
### Chore
-- Release 0.2.1 ([8734fb6](https://github.com/georgedouzas/geometric-smote/commit/8734fb609f65586c007f2f9d8cea8915d10625fe) by georgedouzas).
-- Remove Python 3.9 from CI ([c1aeb9b](https://github.com/georgedouzas/geometric-smote/commit/c1aeb9be9e4004ced4c50627ca0284d4c71dc6f7) by georgedouzas).
-- Restructure project with copier template ([73b3280](https://github.com/georgedouzas/geometric-smote/commit/73b32804165ef6875382c85c42a5000ad27e53b4) by georgedouzas).
+- Release 0.2.1 ([8734fb6](https://github.com/georgedouzas/imbalanced-learn-extra/commit/8734fb609f65586c007f2f9d8cea8915d10625fe) by georgedouzas).
+- Remove Python 3.9 from CI ([c1aeb9b](https://github.com/georgedouzas/imbalanced-learn-extra/commit/c1aeb9be9e4004ced4c50627ca0284d4c71dc6f7) by georgedouzas).
+- Restructure project with copier template ([73b3280](https://github.com/georgedouzas/imbalanced-learn-extra/commit/73b32804165ef6875382c85c42a5000ad27e53b4) by georgedouzas).
-## [0.2.0](https://github.com/georgedouzas/geometric-smote/releases/tag/0.2.0) - 2022-03-12
+## [0.2.0](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.2.0) - 2022-03-12
-[Compare with 0.1.3](https://github.com/georgedouzas/geometric-smote/compare/0.1.3...0.2.0)
+[Compare with 0.1.3](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.3...0.2.0)
-## [0.1.3](https://github.com/georgedouzas/geometric-smote/releases/tag/0.1.3) - 2019-12-13
+## [0.1.3](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.3) - 2019-12-13
-[Compare with 0.1.2](https://github.com/georgedouzas/geometric-smote/compare/0.1.2...0.1.3)
+[Compare with 0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/compare/0.1.2...0.1.3)
-## [0.1.2](https://github.com/georgedouzas/geometric-smote/releases/tag/0.1.2) - 2019-07-09
+## [0.1.2](https://github.com/georgedouzas/imbalanced-learn-extra/releases/tag/0.1.2) - 2019-07-09
-[Compare with first commit](https://github.com/georgedouzas/geometric-smote/compare/801d7f49ebce70a48a7d9e30d5820765b5a1d511...0.1.2)
+[Compare with first commit](https://github.com/georgedouzas/imbalanced-learn-extra/compare/801d7f49ebce70a48a7d9e30d5820765b5a1d511...0.1.2)
diff --git a/LICENSE b/LICENSE
index b354685..7b84c16 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,6 +1,6 @@
MIT License
-Copyright (c) 2019 Georgios Douzas
+Copyright (c) 2021 Georgios Douzas
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
diff --git a/README.md b/README.md
index 37a1ddf..9cbbd10 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,8 @@
+[scikit-learn]: https://scikit-learn.org/stable/
+[imbalanced-learn]: https://imbalanced-learn.org/stable/
+[SOMO]: https://www.sciencedirect.com/science/article/abs/pii/S0957417417302324
+[KMeans-SMOTE]: https://www.sciencedirect.com/science/article/abs/pii/S0020025518304997
+[G-SOMO]: https://www.sciencedirect.com/science/article/abs/pii/S095741742100662X
[black badge]:
[black]:
[docformatter badge]:
@@ -8,25 +13,19 @@
[mypy]:
[mkdocs badge]:
[mkdocs]:
-[version badge]:
-[pythonversion badge]:
-[downloads badge]:
-[gitter]:
+[version badge]:
+[pythonversion badge]:
+[downloads badge]:
+[gitter]:
[gitter badge]:
-[discussions]:
-[discussions badge]:
-[ci]:
-[ci badge]:
-[doc]:
-[doc badge]:
+[discussions]:
+[discussions badge]:
+[ci]:
+[ci badge]:
+[doc]:
+[doc badge]:
-[](https://www.repostatus.org/#inactive)
-
-> **The project has been moved to [imbalanced-learn-extra](https://github.com/georgedouzas/imbalanced-learn-extra).**
-
-# geometric-smote
+# imbalanced-learn-extra
[![ci][ci badge]][ci] [![doc][doc badge]][doc]
@@ -39,59 +38,75 @@ allows.](https://www.repostatus.org/badges/latest/inactive.svg)](https://www.rep
## Introduction
-The package `geometric-smote` implements the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It
-is compatible with scikit-learn and imbalanced-learn. The Geometric SMOTE algorithm can handle numerical as well as categorical
-features.
+`imbalanced-learn-extra` is a Python package that extends [imbalanced-learn]. It implements algorithms that are not included in
+[imbalanced-learn] due to their novelty or lower citation count. The current version includes the following:
+
+- A general interface for clustering-based oversampling algorithms.
+
+- The Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE that handles numerical as well as
+categorical features.
## Installation
-For user installation, `geometric-smote` is currently available on the PyPi's repository, and you can
+For user installation, `imbalanced-learn-extra` is currently available on PyPI, and you can
install it via `pip`:
```bash
-pip install geometric-smote
+pip install imbalanced-learn-extra
```
Development installation requires cloning the repository and then using [PDM](https://github.com/pdm-project/pdm) to install the
project as well as the main and development dependencies:
```bash
-git clone https://github.com/georgedouzas/geometric-smote.git
-cd geometric-smote
+git clone https://github.com/georgedouzas/imbalanced-learn-extra.git
+cd imbalanced-learn-extra
pdm install
```
+The SOM clusterer requires optional dependencies:
+
+```bash
+pip install imbalanced-learn-extra[som]
+```
+
## Usage
-All the classes included in `geometric-smote` follow the [imbalanced-learn](https://imbalanced-learn.org/stable/) API using the
-functionality of the base oversampler. Using [scikit-learn](https://scikit-learn.org/stable/) convention, the data are represented
-as follows:
+All the classes included in `imbalanced-learn-extra` follow the [imbalanced-learn] API, using the functionality of the base
+oversampler. Following the [scikit-learn] convention, the data are represented as follows:
- Input data `X`: 2D array-like or sparse matrices.
- Targets `y`: 1D array-like.
-The clustering-based oversamplers implement a `fit` method to learn from `X` and `y`:
+The oversamplers implement a `fit` method to learn from `X` and `y`:
```python
-gsmote_oversampler.fit(X, y)
+oversampler.fit(X, y)
```
They also implement a `fit_resample` method to resample `X` and `y`:
```python
-X_resampled, y_resampled = gsmote.fit_resample(X, y)
+X_resampled, y_resampled = clustering_based_oversampler.fit_resample(X, y)
```
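+
+A minimal end-to-end sketch of the resampling workflow, using the `KMeansSMOTE` oversampler on a synthetic imbalanced dataset:
+
+```python
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.over_sampling import KMeansSMOTE
+
+X, y = make_classification(n_classes=3, weights=[0.10, 0.10, 0.80], random_state=0, n_informative=10)
+X_resampled, y_resampled = KMeansSMOTE(random_state=5).fit_resample(X, y)
+```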
-## Citing `geometric-smote`
+## Citing `imbalanced-learn-extra`
+
+Publications using clustering-based oversampling:
+
+- [G. Douzas, F. Bacao, "Self-Organizing Map Oversampling (SOMO) for imbalanced data set learning", Expert Systems with
+ Applications, vol. 82, pp. 40-52, 2017.][SOMO]
+- [G. Douzas, F. Bacao, F. Last, "Improving imbalanced learning through a heuristic oversampling method based on k-means and
+ SMOTE", Information Sciences, vol. 465, pp. 1-20, 2018.][KMeans-SMOTE]
+- [G. Douzas, F. Bacao, F. Last, "G-SOMO: An oversampling approach based on self-organized maps and geometric SMOTE", Expert
+  Systems with Applications, vol. 183, 115230, 2021.][G-SOMO]
-If you use `geometric-smote` in a scientific publication, we would appreciate citations to the following paper:
+Publications using Geometric-SMOTE:
- Douzas, G., Bacao, B. (2019). Geometric SMOTE: a geometrically enhanced
drop-in replacement for SMOTE. Information Sciences, 501, 118-135.
-Publications using Geometric-SMOTE:
-
- Fonseca, J., Douzas, G., Bacao, F. (2021). Increasing the Effectiveness of
Active Learning: Introducing Artificial Data Generation in Active Learning
for Land Use/Land Cover Classification. Remote Sensing, 13(13), 2619.
diff --git a/docs/examples/README.md b/docs/examples/README.md
index e116f8d..4c9b430 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -1,3 +1,3 @@
# General examples
-A collection of examples for `geometric-smote` package.
+A collection of examples for the `imblearn_extra` package.
diff --git a/docs/examples/applications/README.md b/docs/examples/applications/README.md
index 72b03d8..a2d3511 100644
--- a/docs/examples/applications/README.md
+++ b/docs/examples/applications/README.md
@@ -1,3 +1,3 @@
# Applications
-Examples of applications for the `geometric-smote` package.
\ No newline at end of file
+Examples of applications for the `imblearn_extra` package.
diff --git a/docs/examples/applications/plot_mnist_example.py b/docs/examples/applications/plot_mnist_example.py
index 4aca928..036b58a 100644
--- a/docs/examples/applications/plot_mnist_example.py
+++ b/docs/examples/applications/plot_mnist_example.py
@@ -14,7 +14,6 @@
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
-from gsmote import GeometricSMOTE
from imblearn.datasets import make_imbalance
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
@@ -22,6 +21,8 @@
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
+from imblearn_extra.gsmote import GeometricSMOTE
+
def plot_mnist_samples(X, y, title=None, n_subplots=None):
if n_subplots is None:
diff --git a/docs/examples/plot_cluster_oversamplers.py b/docs/examples/plot_cluster_oversamplers.py
new file mode 100644
index 0000000..fec8b28
--- /dev/null
+++ b/docs/examples/plot_cluster_oversamplers.py
@@ -0,0 +1,150 @@
+"""
+Clustering-based over-sampling
+==============================
+
+This example illustrates the data generation
+process and the performance of various
+over-samplers when clustering-based over-sampling
+is used.
+"""
+
+# Author: Georgios Douzas
+# Licence: MIT
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from imblearn.over_sampling import SMOTE, BorderlineSMOTE, RandomOverSampler
+from imblearn.pipeline import make_pipeline
+from sklearn.base import clone
+from sklearn.cluster import AgglomerativeClustering, KMeans
+from sklearn.datasets import make_classification
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+from imblearn_extra.clover.over_sampling import ClusterOverSampler
+
+RANDOM_STATE = 0
+OVERSAMPLERS = [
+ RandomOverSampler(random_state=RANDOM_STATE),
+ SMOTE(random_state=RANDOM_STATE + 1),
+ BorderlineSMOTE(random_state=RANDOM_STATE + 2),
+]
+KMEANS = KMeans(random_state=RANDOM_STATE, n_clusters=100, n_init='auto')
+AGGL = AgglomerativeClustering(n_clusters=100)
+
+
+def generate_imbalanced_data():
+ """Generate imbalanced data."""
+ X, y = make_classification(
+ n_classes=3,
+ class_sep=0.8,
+ weights=[0.01, 0.05, 0.94],
+ n_informative=2,
+ n_redundant=0,
+ n_repeated=0,
+ n_features=2,
+ n_clusters_per_class=1,
+ n_samples=2000,
+ random_state=RANDOM_STATE,
+ )
+ return X, y
+
+
+def plot_data(X, y, oversampler, ax):
+ """Plot original or resampled data."""
+ if oversampler is None:
+ X_res, y_res = X, y
+ title = 'Original data'
+ else:
+ oversampler = clone(oversampler)
+ X_res, y_res = oversampler.fit_resample(X, y)
+ if not isinstance(oversampler, ClusterOverSampler):
+ ovs_name = oversampler.__class__.__name__
+ title = f'Resampling using {ovs_name}'
+ else:
+ clusterer_name = oversampler.clusterer.__class__.__name__
+ ovs_name = oversampler.oversampler_.__class__.__name__
+ title = f'Resampling using {clusterer_name}-{ovs_name}'
+ ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.get_xaxis().tick_bottom()
+ ax.get_yaxis().tick_left()
+ ax.spines['left'].set_position(('outward', 10))
+ ax.spines['bottom'].set_position(('outward', 10))
+ ax.set_title(title)
+
+
+def compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, clusterer):
+ """Compare F1 scores of oversamplers with and without clustering."""
+ ovs_clf = make_pipeline(clone(oversampler), clf)
+ clr_ovs_clf = make_pipeline(ClusterOverSampler(clone(oversampler), clusterer), clf)
+ y_pred = ovs_clf.fit(X_train, y_train).predict(X_test)
+ y_pred_clr = clr_ovs_clf.fit(X_train, y_train).predict(X_test)
+ ovs_name = oversampler.__class__.__name__
+ ovs_score = f1_score(y_test, y_pred, average='macro')
+ clr_ovs_score = f1_score(y_test, y_pred_clr, average='macro')
+ return (ovs_name, ovs_score, clr_ovs_score)
+
+
+# %%
+# Generate imbalanced data
+# ------------------------
+#
+# We are generating a highly imbalanced multi-class data set, using
+# `make_classification` from scikit-learn.
+
+X, y = generate_imbalanced_data()
+_, ax = plt.subplots(1, 1, figsize=(15, 7))
+plot_data(X, y, None, ax)
+
+# %%
+# Effect of clustering on over-samplers
+# -------------------------------------
+#
+# Clustering-based over-sampling identifies areas of the input space which
+# are appropriate for generating artificial data. Therefore, the generation
+# of noisy samples is avoided and the within-class imbalance issue is also
+# addressed. The next plots show the resampled data when clustering is applied,
+# comparing them to the resampled data of the initial over-samplers.
+
+fig, axs = plt.subplots(3, 2, figsize=(15, 15))
+for (ax1, ax2), oversampler in zip(axs, OVERSAMPLERS, strict=True):
+ plot_data(X, y, clone(oversampler), ax1)
+ plot_data(X, y, ClusterOverSampler(oversampler, KMEANS), ax2)
+fig.tight_layout()
+
+# %%
+# Performance evaluation of clustering based over-sampling
+# --------------------------------------------------------
+#
+# We are evaluating various over-samplers using the F1-score as the evaluation
+# metric on a test set. The scores with and without clustering are compared.
+
+clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
+scores = []
+for oversampler in OVERSAMPLERS:
+ scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, KMEANS))
+scores = (
+ pd.DataFrame(scores)
+ .rename(columns={0: 'Oversamplers', 1: ('F-score', 'No clustering'), 2: ('F-score', 'Clustering')})
+ .set_index('Oversamplers')
+)
+scores.columns = pd.MultiIndex.from_tuples(scores.columns)
+scores
+
+# %%
+# We repeat the process for AgglomerativeClustering instead of KMeans.
+
+scores = []
+for oversampler in OVERSAMPLERS:
+ scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler, AGGL))
+scores = (
+ pd.DataFrame(scores)
+ .rename(columns={0: 'Oversamplers', 1: ('F-score', 'No clustering'), 2: ('F-score', 'Clustering')})
+ .set_index('Oversamplers')
+)
+scores.columns = pd.MultiIndex.from_tuples(scores.columns)
+scores
diff --git a/docs/examples/plot_data_generation_mechanism.py b/docs/examples/plot_gsmote_data_generation.py
similarity index 97%
rename from docs/examples/plot_data_generation_mechanism.py
rename to docs/examples/plot_gsmote_data_generation.py
index 06dbfda..2087a64 100644
--- a/docs/examples/plot_data_generation_mechanism.py
+++ b/docs/examples/plot_gsmote_data_generation.py
@@ -1,6 +1,6 @@
"""
-Data generation mechanism
-=========================
+G-SMOTE data generation
+=======================
This example illustrates the Geometric SMOTE data
generation mechanism and the usage of its
@@ -12,10 +12,11 @@
import matplotlib.pyplot as plt
import numpy as np
-from gsmote import GeometricSMOTE
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_blobs
+from imblearn_extra.gsmote import GeometricSMOTE
+
XLIM, YLIM = [-3.0, 3.0], [0.0, 4.0]
RANDOM_STATE = 5
@@ -56,7 +57,7 @@ def plot_hyperparameters(oversampler, X, y, param, vals, n_subplots):
_, ax_arr = plt.subplots(*n_subplots, figsize=(15, 7 if n_rows > 1 else 3.5))
if n_rows > 1:
ax_arr = [ax for axs in ax_arr for ax in axs]
- for ax, val in zip(ax_arr, vals):
+ for ax, val in zip(ax_arr, vals, strict=True):
oversampler.set_params(**{param: val})
X_res, y_res = oversampler.fit_resample(X, y)
ax.scatter(X_res[y_res == 1, 0], X_res[y_res == 1, 1], label='Positive Class')
@@ -71,7 +72,7 @@ def plot_comparison(oversamplers, X, y):
samples.
"""
_, ax_arr = plt.subplots(1, 2, figsize=(15, 5))
- for ax, (name, ovs) in zip(ax_arr, oversamplers):
+ for ax, (name, ovs) in zip(ax_arr, oversamplers, strict=True):
X_res, y_res = ovs.fit_resample(X, y)
ax.scatter(X_res[y_res == 1, 0], X_res[y_res == 1, 1], label='Positive Class')
ax.scatter(X_res[y_res == 0, 0], X_res[y_res == 0, 1], label='Negative Class')
diff --git a/docs/examples/plot_validation_curves.py b/docs/examples/plot_gsmote_validation_curves.py
similarity index 98%
rename from docs/examples/plot_validation_curves.py
rename to docs/examples/plot_gsmote_validation_curves.py
index 06564e6..2afd5ca 100644
--- a/docs/examples/plot_validation_curves.py
+++ b/docs/examples/plot_gsmote_validation_curves.py
@@ -1,6 +1,6 @@
"""
-Plotting validation curves
-==========================
+G-SMOTE validation curves
+=========================
In this example the impact of the Geometric SMOTE's hyperparameters is examined.
The validation scores of a Geometric SMOTE-GBC classifier is presented for
@@ -12,7 +12,6 @@
import matplotlib.pyplot as plt
import numpy as np
-from gsmote import GeometricSMOTE
from imblearn.metrics import geometric_mean_score
from imblearn.pipeline import make_pipeline
from sklearn.datasets import make_classification
@@ -21,6 +20,8 @@
from sklearn.model_selection import validation_curve
from sklearn.tree import DecisionTreeClassifier
+from imblearn_extra.gsmote import GeometricSMOTE
+
RANDOM_STATE = 10
SCORER = make_scorer(geometric_mean_score)
diff --git a/docs/examples/plot_kmeans_smote.py b/docs/examples/plot_kmeans_smote.py
new file mode 100644
index 0000000..f06edc9
--- /dev/null
+++ b/docs/examples/plot_kmeans_smote.py
@@ -0,0 +1,114 @@
+"""
+KMeans-SMOTE algorithm
+======================
+
+This example illustrates the data generation
+process and the performance of KMeans-SMOTE.
+"""
+
+# Author: Georgios Douzas
+# Licence: MIT
+
+import matplotlib.pyplot as plt
+import pandas as pd
+from imblearn.over_sampling import SMOTE
+from imblearn.pipeline import make_pipeline
+from sklearn.base import clone
+from sklearn.datasets import make_classification
+from sklearn.ensemble import GradientBoostingClassifier
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
+
+from imblearn_extra.clover.over_sampling import KMeansSMOTE
+
+RANDOM_STATE = 2
+OVERSAMPLERS = [
+ SMOTE(random_state=RANDOM_STATE),
+ KMeansSMOTE(random_state=RANDOM_STATE + 3),
+]
+
+
+def generate_imbalanced_data():
+ """Generate imbalanced data."""
+ X, y = make_classification(
+ n_classes=3,
+ flip_y=0.05,
+ weights=[0.15, 0.6, 0.25],
+ n_informative=2,
+ n_redundant=0,
+ n_repeated=0,
+ n_features=2,
+ n_clusters_per_class=1,
+ n_samples=1000,
+ random_state=RANDOM_STATE,
+ )
+ return X, y
+
+
+def plot_data(X, y, oversampler, ax):
+ """Plot original or resampled data."""
+ if oversampler is None:
+ X_res, y_res = X, y
+ title = 'Original data'
+ else:
+ oversampler = clone(oversampler)
+ X_res, y_res = oversampler.fit_resample(X, y)
+ ovs_name = oversampler.__class__.__name__
+ title = f'Resampling using {ovs_name}'
+ ax.scatter(X_res[:, 0], X_res[:, 1], c=y_res, alpha=0.8, edgecolor='k')
+ ax.spines['top'].set_visible(False)
+ ax.spines['right'].set_visible(False)
+ ax.get_xaxis().tick_bottom()
+ ax.get_yaxis().tick_left()
+ ax.spines['left'].set_position(('outward', 10))
+ ax.spines['bottom'].set_position(('outward', 10))
+ ax.set_title(title)
+
+
+def compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler):
+ """Compare F1 scores of oversamplers."""
+ ovs_clf = make_pipeline(clone(oversampler), clf)
+ y_pred = ovs_clf.fit(X_train, y_train).predict(X_test)
+ ovs_name = oversampler.__class__.__name__
+ ovs_score = f1_score(y_test, y_pred, average='macro')
+ return (ovs_name, ovs_score)
+
+
+# %%
+# Generate imbalanced data
+# ------------------------
+#
+# We are generating an imbalanced multi-class data set, using
+# ``make_classification`` from scikit-learn.
+
+X, y = generate_imbalanced_data()
+_, ax = plt.subplots(1, 1, figsize=(15, 7))
+plot_data(X, y, None, ax)
+
+# %%
+# Plot resampled data
+# -------------------
+#
+# KMeans-SMOTE identifies areas of the input space which are appropriate for generating
+# artificial data. Therefore, the generation of noisy samples is avoided and the within-class
+# imbalance issue is also addressed. The next plots show the resampled data of
+# KMeans-SMOTE vs SMOTE.
+
+fig, axs = plt.subplots(1, 2, figsize=(15, 5))
+for ax, oversampler in zip(axs, OVERSAMPLERS, strict=True):
+ plot_data(X, y, clone(oversampler), ax)
+fig.tight_layout()
+
+# %%
+# Performance evaluation
+# ----------------------
+#
+# We are evaluating the performance of KMeans-SMOTE using the F1-score as the evaluation metric on a
+# test set. SMOTE's performance is also included.
+
+clf = GradientBoostingClassifier(random_state=RANDOM_STATE)
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)
+scores = []
+for oversampler in OVERSAMPLERS:
+ scores.append(compare_f1_scores(X_train, X_test, y_train, y_test, clf, oversampler))
+pd.DataFrame(scores).rename(columns={0: 'Oversamplers', 1: 'F-score'}).set_index('Oversamplers')
diff --git a/docs/generate_api.py b/docs/generate_api.py
index ef36d94..64dae42 100644
--- a/docs/generate_api.py
+++ b/docs/generate_api.py
@@ -18,7 +18,7 @@
]
for path in paths:
module_path = path.relative_to('src').with_suffix('')
- doc_path = path.relative_to('src', 'gsmote').with_suffix('.md')
+ doc_path = path.relative_to('src', 'imblearn_extra').with_suffix('.md')
full_doc_path = Path('api', doc_path)
parts = tuple(module_path.parts)
diff --git a/docs/overview/user_guide.md b/docs/overview/user_guide.md
index 576eb4a..6acd150 100644
--- a/docs/overview/user_guide.md
+++ b/docs/overview/user_guide.md
@@ -7,12 +7,129 @@
# User guide
+`imbalanced-learn-extra` is a Python package that extends [imbalanced-learn]. It implements algorithms that are not included in
+[imbalanced-learn] due to their novelty or lower citation count. The current version includes the following:
+
+- A general interface for clustering-based oversampling algorithms that introduces the `ClusterOverSampler` class, while the
+  `KMeansSMOTE`, `SOMO` and `GeometricSOMO` classes are provided for convenience. The distribution of the generated samples to
+  the clusters is controlled by the `distributor` parameter, with `DensityDistributor` being an example of a distributor based
+  on the density of the clusters.
+
+- The Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE that handles numerical as well as
+categorical features.
+
+## Clustering-based oversampling
+
+Initially, we generate multi-class imbalanced data represented by the input data `X` and targets `y`:
+
+```python
+>>> from collections import Counter
+>>> from sklearn.datasets import make_classification
+>>> X, y = make_classification(n_classes=3, weights=[0.10, 0.10, 0.80], random_state=0, n_informative=10)
+>>> print(sorted(Counter(y).items()))
+[(np.int64(0), 10), (np.int64(1), 10), (np.int64(2), 80)]
+```
+
+Below we provide some examples of the `imblearn_extra.clover` functionality.
+
+### KMeans-SMOTE algorithm
+
+The KMeans-SMOTE[^2] algorithm is a combination of the [KMeans] clusterer and the [SMOTE] oversampler, and it is implemented by
+the `KMeansSMOTE` class. We initialize it with the default parameters and use it to resample the data:
+
+```python
+>>> from imblearn_extra.clover.over_sampling import KMeansSMOTE
+>>> kmeans_smote = KMeansSMOTE(random_state=5)
+>>> X_resampled, y_resampled = kmeans_smote.fit_resample(X, y)
+>>> print(sorted(Counter(y_resampled).items()))
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
+```
+
+The augmented data set can be used instead of the original data set to train a classifier:
+
+```python
+>>> from sklearn.tree import DecisionTreeClassifier
+>>> clf = DecisionTreeClassifier()
+>>> clf.fit(X_resampled, y_resampled)
+DecisionTreeClassifier()
+```
+
+### Combining clusterers and oversamplers
+
+The `ClusterOverSampler` class allows combining [imbalanced-learn] oversamplers with [scikit-learn] clusterers. This is achieved
+through the `oversampler` and `clusterer` parameters. For example, we can select [BorderlineSMOTE] as the oversampler
+and [DBSCAN] as the clustering algorithm:
+
+```python
+>>> from sklearn.cluster import DBSCAN
+>>> from imblearn.over_sampling import BorderlineSMOTE
+>>> from imblearn_extra.clover.over_sampling import ClusterOverSampler
+>>> dbscan_bsmote = ClusterOverSampler(oversampler=BorderlineSMOTE(random_state=5), clusterer=DBSCAN())
+>>> X_resampled, y_resampled = dbscan_bsmote.fit_resample(X, y)
+>>> print(sorted(Counter(y_resampled).items()))
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
+```
+
+Additionally, if the clusterer provides a neighboring structure for the clusters through a `neighbors_` attribute, then it can
+be used to generate inter-cluster artificial data, similarly to the SOMO[^1] and G-SOMO[^3] algorithms.
+
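+For instance, the `SOM` clusterer provided in `imblearn_extra.clover.clusterer` exposes such a `neighbors_` attribute. A minimal
+sketch, assuming the optional SOM dependencies are installed:
+
+```python
+>>> from imblearn.over_sampling import SMOTE
+>>> from imblearn_extra.clover.clusterer import SOM
+>>> som_smote = ClusterOverSampler(oversampler=SMOTE(random_state=5), clusterer=SOM(random_state=5))
+>>> X_resampled, y_resampled = som_smote.fit_resample(X, y)
+```
+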
+### Adjusting the distribution of generated samples
+
+The parameter `distributor` of the `ClusterOverSampler` is used to define the distribution of the generated samples to the
+clusters. The `DensityDistributor` class implements a density-based distribution and it is the default `distributor` for all
+objects of the `ClusterOverSampler` class:
+
+```python
+>>> from sklearn.cluster import AgglomerativeClustering
+>>> from imblearn.over_sampling import SMOTE
+>>> agg_smote = ClusterOverSampler(oversampler=SMOTE(random_state=5), clusterer=AgglomerativeClustering())
+>>> agg_smote.fit(X, y)
+>>> agg_smote.distributor_
+DensityDistributor()
+```
+
+The `DensityDistributor` objects can be parametrized:
+
+```python
+>>> from imblearn_extra.clover.distribution import DensityDistributor
+>>> distributor = DensityDistributor(distances_exponent=0)
+```
+
+In order to distribute the samples, a `labels` parameter is required, while `neighbors` is optional:
+
+```python
+>>> from sklearn.cluster import KMeans
+>>> clusterer = KMeans(n_clusters=4, random_state=1).fit(X, y)
+>>> labels = clusterer.labels_
+```
+
+The distribution of the samples is provided by the `fit_distribute` method and it is described by the `intra_distribution`
+and `inter_distribution` dictionaries:
+
+```python
+>>> intra_distribution, inter_distribution = distributor.fit_distribute(X, y, labels, neighbors=None)
+>>> print(distributor.filtered_clusters_)
+[(np.int32(3), np.int64(1)), (np.int32(1), np.int64(0)), (np.int32(1), np.int64(1))]
+>>> print(distributor.clusters_density_)
+{(np.int32(3), np.int64(1)): np.float64(3.0), (np.int32(1), np.int64(0)): np.float64(7.0), (np.int32(1), np.int64(1)): np.float64(7.0)}
+>>> print(intra_distribution)
+{(np.int32(3), np.int64(1)): np.float64(0.7), (np.int32(1), np.int64(0)): np.float64(1.0), (np.int32(1), np.int64(1)): np.float64(0.3)}
+>>> print(inter_distribution)
+{}
+```
+
+The keys of the above dictionaries are tuples of the form `(cluster_label, class_label)`, while their values are proportions of
+the total generated samples for the particular class. For example, `(0, 1): 0.7` means that 70% of the samples of class `1` will
+be generated in the cluster `0`. Any other distributor can be defined by extending the `BaseDistributor` class.
+
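+A parametrized distributor can also be passed directly to a `ClusterOverSampler` through its `distributor` parameter. A minimal
+sketch, reusing the objects defined above:
+
+```python
+>>> clr_smote = ClusterOverSampler(
+...     oversampler=SMOTE(random_state=5),
+...     clusterer=KMeans(n_clusters=4, random_state=1),
+...     distributor=DensityDistributor(distances_exponent=0),
+... )
+>>> X_resampled, y_resampled = clr_smote.fit_resample(X, y)
+```
+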
+## Geometric SMOTE algorithm
+
SMOTE algorithm, as well as any other over-sampling method based on the SMOTE mechanism, generates synthetic samples along line
-segments that join minority class instances. Geometric SMOTE (G-SMOTE) is an enhancement of the SMOTE data generation mechanism.
-G-SMOTE generates synthetic samples in a geometric region of the input space, around each selected minority instance. The
-`GeometricSMOTE` class can be used with multiple classes as well as binary classes classification. It uses a one-vs-rest approach
-by selecting each targeted class and computing the necessary statistics against the rest of the data set which are grouped in a
-single class.
+segments that join minority class instances. Geometric SMOTE[^4] (G-SMOTE) is an enhancement of the SMOTE data generation
+mechanism. G-SMOTE generates synthetic samples in a geometric region of the input space, around each selected minority instance.
+The `GeometricSMOTE` class can be used for binary as well as multi-class classification. It uses a one-vs-rest approach by
+selecting each targeted class and computing the necessary statistics against the rest of the data set, which is grouped into a
+single class.
Initially, we generate multi-class imbalanced data represented by the input data `X` and targets `y`:
@@ -27,12 +144,12 @@ Initially, we generate multi-class imbalanced data represented by the input data
We can use `GeometricSMOTE` to resample the data:
```python
->>> from gsmote import GeometricSMOTE
+>>> from imblearn_extra.gsmote import GeometricSMOTE
>>> geometric_smote = GeometricSMOTE()
>>> X_resampled, y_resampled = geometric_smote.fit_resample(X, y)
>>> from collections import Counter
>>> print(sorted(Counter(y_resampled).items()))
-[(0, 80), (1, 80), (2, 80)]
+[(np.int64(0), 80), (np.int64(1), 80), (np.int64(2), 80)]
```
The augmented data set can be used instead of the original data set to train a classifier:
@@ -41,6 +158,7 @@ The augmented data set can be used instead of the original data set to train a c
>>> from sklearn.tree import DecisionTreeClassifier
>>> clf = DecisionTreeClassifier()
>>> clf.fit(X_resampled, y_resampled)
+DecisionTreeClassifier()
```
`GeometricSMOTE` can be used also in a machine learning pipeline:
@@ -49,4 +167,31 @@ The augmented data set can be used instead of the original data set to train a c
from imblearn.pipeline import make_pipeline
pipeline = make_pipeline(GeometricSMOTE(), DecisionTreeClassifier())
pipeline.fit(X, y)
+Pipeline(steps=[('geometricsmote', GeometricSMOTE()),
+ ('decisiontreeclassifier', DecisionTreeClassifier())])
```
+
+### Compatibility
+
+The API of `imblearn_extra` is fully compatible with [imbalanced-learn]. Particularly for clustering-based oversampling, any
+oversampler from `imblearn_extra.clover` that does not use clustering, i.e. when `clusterer=None`, is equivalent to the
+corresponding [imbalanced-learn] oversampler:
+
+```python
+>>> import numpy as np
+>>> X_res_im, y_res_im = SMOTE(random_state=5).fit_resample(X, y)
+>>> X_res_cl, y_res_cl = ClusterOverSampler(SMOTE(random_state=5), clusterer=None).fit_resample(X, y)
+>>> np.testing.assert_equal(X_res_im, X_res_cl)
+>>> np.testing.assert_equal(y_res_im, y_res_cl)
+```
+
+## References
+
+[^1]: [G. Douzas, F. Bacao, "Self-Organizing Map Oversampling (SOMO) for imbalanced data set learning", Expert Systems with
+ Applications, vol. 82, pp. 40-52, 2017.](https://www.sciencedirect.com/science/article/abs/pii/S0957417417302324)
+[^2]: [G. Douzas, F. Bacao, F. Last, "Improving imbalanced learning through a heuristic oversampling method based on k-means and SMOTE", Information Sciences, vol. 465, pp. 1-20,
+ 2018.](https://www.sciencedirect.com/science/article/abs/pii/S0020025518304997)
+[^3]: [G. Douzas, F. Bacao, F. Last, "G-SOMO: An oversampling approach based on self-organized maps and geometric SMOTE", Expert
+  Systems with Applications, vol. 183, 115230, 2021.](https://www.sciencedirect.com/science/article/abs/pii/S095741742100662X)
+[^4]: [G. Douzas, F. Bacao, F. Last, "Geometric SMOTE a geometrically enhanced drop-in replacement for SMOTE", Information
+ Sciences, Volume 501, 2019.](https://www.sciencedirect.com/science/article/abs/pii/S0020025519305353?via%3Dihub)
diff --git a/mkdocs.yml b/mkdocs.yml
index 528ca3f..0483ea7 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,10 +1,10 @@
-site_name: "geometric-smote"
-site_description: "Implementation of the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn."
-site_url: "https://georgedouzas.github.io/geometric-smote"
-repo_url: "https://github.com/georgedouzas/geometric-smote"
-repo_name: "georgedouzas/geometric-smote"
+site_name: "imbalanced-learn-extra"
+site_description: "An implementation of novel oversampling algorithms."
+site_url: "https://georgedouzas.github.io/imbalanced-learn-extra"
+repo_url: "https://github.com/georgedouzas/imbalanced-learn-extra"
+repo_name: "georgedouzas/imbalanced-learn-extra"
site_dir: "site"
-watch: [README.md, CONTRIBUTING.md, CHANGELOG.md, src/gsmote]
+watch: [README.md, CONTRIBUTING.md, CHANGELOG.md, src/imblearn_extra]
theme:
name: material
diff --git a/noxfile.py b/noxfile.py
index 48aaf68..ba959aa 100644
--- a/noxfile.py
+++ b/noxfile.py
@@ -10,7 +10,7 @@
os.environ.update({'PDM_IGNORE_SAVED_PYTHON': '1'})
-PYTHON_VERSIONS: list[str] = ['3.10', '3.11']
+PYTHON_VERSIONS: list[str] = ['3.10', '3.11', '3.12']
FILES: list[str] = ['src', 'tests', 'docs', 'noxfile.py']
CHANGELOG_ARGS: dict[str, Any] = {
'repository': '.',
@@ -65,7 +65,7 @@ def formatting(session: nox.Session, file: str) -> None:
if session.posargs[0] in ['code', 'all']:
session.run('black', file)
if session.posargs[0] in ['docstrings', 'all']:
- session.run('docformatter', '--in-place', '--recursive', '--close-quotes-on-newline', file)
+ session.run('docformatter', file)
@nox.session(python=PYTHON_VERSIONS)
@@ -80,13 +80,13 @@ def checks(session: nox.Session, file: str) -> None:
check_cli(session, ['all', 'quality', 'dependencies', 'types'])
session.run('pdm', 'install', '-dG', 'checks', '--no-default', external=True)
if session.posargs[0] in ['quality', 'all']:
- session.run('ruff', file)
+ session.run('ruff', 'check', file)
if session.posargs[0] in ['types', 'all']:
session.run('mypy', file)
if session.posargs[0] in ['dependencies', 'all']:
requirements_path = (Path(session.create_tmp()) / 'requirements.txt').as_posix()
args_groups = [['--prod']] + [['-dG', group] for group in ['tests', 'docs', 'maintenance']]
- requirements_types = zip(FILES, args_groups)
+ requirements_types = zip(FILES, args_groups, strict=True)
args = [
'pdm',
'export',
@@ -99,7 +99,11 @@ def checks(session: nox.Session, file: str) -> None:
requirements_path,
]
session.run(*(args + dict(requirements_types)[file]), external=True)
- session.run('safety', 'check', '-r', requirements_path)
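+        # In CI, authenticate the safety scan with the SAFETY_API_KEY secret; locally, run an unauthenticated scan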
+ if os.environ.get('CI') is not None:
+ api_key = os.environ.get('SAFETY_API_KEY')
+ session.run('safety', '--key', api_key, '--stage', 'cicd', 'scan', '-r', requirements_path)
+ else:
+ session.run('safety', 'scan', '-r', requirements_path)
@nox.session(python=PYTHON_VERSIONS)
diff --git a/pyproject.toml b/pyproject.toml
index 883df29..8f696f7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,13 +3,13 @@ requires = ["pdm-pep517"]
build-backend = "pdm.pep517.api"
[project]
-name = "geometric-smote"
-description = "Implementation of the Geometric SMOTE algorithm, a geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn."
+name = "imbalanced-learn-extra"
+description = "An implementation of novel oversampling algorithms."
authors = [{name = "Georgios Douzas", email = "gdouzas@icloud.com"}]
license = "MIT"
readme = "README.md"
-requires-python = ">=3.10, <3.12"
-keywords = []
+requires-python = ">=3.10, <3.13"
+keywords = ["machine learning", "imbalanced learning", "oversampling"]
dynamic = ["version"]
classifiers = [
"Development Status :: 4 - Beta",
@@ -19,25 +19,29 @@ classifiers = [
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
+ "Programming Language :: Python :: 3.12",
"Topic :: Documentation",
"Topic :: Software Development",
"Topic :: Utilities",
"Typing :: Typed",
]
dependencies = [
+ "scipy>=1.7.2",
+ "numpy>=1.22",
"scikit-learn>=1.3.2",
"imbalanced-learn>=0.11.0",
+ "MiniSom>=2.3.2",
"typing-extensions>=4.8.0",
]
[project.urls]
-Homepage = "https://georgedouzas.github.io/geometric-smote"
-Documentation = "https://georgedouzas.github.io/geometric-smote"
-Changelog = "https://georgedouzas.github.io/geometric-smote/changelog"
-Repository = "https://github.com/georgedouzas/geometric-smote"
-Issues = "https://github.com/georgedouzas/geometric-smote/issues"
-Discussions = "https://github.com/georgedouzas/geometric-smote/discussions"
-Gitter = "https://gitter.im/geometric-smote/community"
+Homepage = "https://georgedouzas.github.io/imbalanced-learn-extra"
+Documentation = "https://georgedouzas.github.io/imbalanced-learn-extra"
+Changelog = "https://georgedouzas.github.io/imbalanced-learn-extra/changelog"
+Repository = "https://github.com/georgedouzas/imbalanced-learn-extra"
+Issues = "https://github.com/georgedouzas/imbalanced-learn-extra/issues"
+Discussions = "https://github.com/georgedouzas/imbalanced-learn-extra/discussions"
+Gitter = "https://gitter.im/imbalanced-learn-extra/community"
Funding = "https://github.com/sponsors/georgedouzas"
[tool.pdm]
@@ -52,51 +56,52 @@ htmlcov pip-wheel-metadata site __pycache__ docs/generated .nox .ruff_cache pdm.
&& find . -name '.coverage*' -delete"""}
docs = "nox --error-on-external-run -R -s docs -- {args:serve}"
formatting = "nox --error-on-external-run -R -s formatting -- {args:all}"
-checks = "nox --error-on-external-run -R -s checks -- {args:all}"
-tests = "nox --error-on-external-run -R -s tests"
+checks = "nox --default-venv-backend uv --error-on-external-run -R -s checks -- {args:all}"
+tests = "nox --default-venv-backend uv --error-on-external-run -R -s tests"
changelog = "nox --error-on-external-run -R -s changelog"
release = "nox --error-on-external-run -R -s release"
[tool.pdm.dev-dependencies]
maintenance = [
- "nox>=2022.8.7",
+ "nox[uv]>=2024.4.15",
]
docs = [
- "mkdocs>=1.3",
- "mkdocs-coverage>=0.2",
- "mkdocs-gen-files>=0.3",
- "mkdocs-literate-nav>=0.4",
- "mkdocs-material>=7.3",
- "mkdocs-gallery>=0.7.6",
- "mkdocs-section-index>=0.3",
- "mkdocstrings[python]>=0.20",
- "markdown-callouts>=0.2",
- "markdown-exec>=0.5",
- 'pandas>=1.5.0',
- "matplotlib>=3.8.2",
+ "mkdocs>=1.6.0",
+ "mkdocs-coverage>=1.1.0",
+ "mkdocs-gen-files>=0.5.0",
+ "mkdocs-literate-nav>=0.6.1",
+ "mkdocs-material>=9.5.27",
+ "mkdocs-gallery>=0.10.1",
+ "mkdocs-section-index>=0.3.9",
+ "mkdocstrings[python]>=0.25.1",
+ "markdown-callouts>=0.4.0",
+ "markdown-exec>=1.9.3",
+ 'pandas>=2.2.2',
+ "matplotlib>=3.9.2",
]
formatting = [
- "black>=21.10b0",
- "docformatter>=1.5.1",
+ "black>=24.4.2",
+ "docformatter>=1.7.5",
]
checks = [
- "ruff>=0.0.237",
- "safety>=2",
- "mypy>=0.910",
- "types-markdown>=3.3",
- "types-toml>=0.10",
+ "ruff>=0.5.0",
+ "safety>=3.2.3",
+ "mypy>=1.10.1",
+ "types-markdown>=3.6.0.20240316",
+ "types-toml>=0.10.8.20240310",
+ "numpy>=2.1.2",
]
tests = [
- "pytest>=6.2",
- "pytest-cov>=3.0",
- "pytest-randomly>=3.10",
- "pytest-xdist>=2.4",
+ "pytest>=8.2.2",
+ "pytest-cov>=5.0.0",
+ "pytest-randomly>=3.15.0",
+ "pytest-xdist>=3.6.1",
]
changelog = [
- "git-changelog>=1.0.0",
+ "git-changelog>=2.5.2",
]
release = [
- "twine>=4.0.0",
+ "twine>=5.1.1",
]
[tool.black]
@@ -104,31 +109,35 @@ line-length = 120
extend-exclude = "(tests/fixtures|docs/generated)"
skip-string-normalization = true
+[tool.docformatter]
+in-place = true
+recursive = true
+close-quotes-on-newline = true
+wrap-descriptions = 120
+
[tool.ruff]
+extend-exclude = ["docs/generated"]
+force-exclude = true
+line-length = 120
+target-version = "py310"
+
+[tool.ruff.lint]
+ignore = ["D202", "N806", "N803", "S101", "INP001", "Q000", "TRY002", "PLR0913", "EXE001", "EXE002", "E741"]
select = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S",
"BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G",
"INP", "PIE", "T20", "PT", "Q", "RET501", "RET502", "RET503", "SIM",
"PTH", "PD", "PGH", "PL", "TRY", "RUF", "PLE", "PLR", "PLW", "TRY", "RUF"]
-ignore = ["D202", "N806", "N803", "S101", "INP001", "Q000", "TRY002", "PLR0913", "EXE001", "EXE002"]
-fix = true
-fixable = ["C", "E", "F", "W", "B", "I", "D", "N", "UP", "YTT", "ANN", "S",
-"BLE", "A", "COM", "C4", "DTZ", "T10", "EM", "EXE", "ISC", "ICN", "G",
-"INP", "PIE", "T20", "PT", "Q", "RET501", "RET502", "RET503", "SIM",
-"PTH", "PD", "PGH", "PL", "TRY", "RUF", "PLE", "PLR", "PLW", "TRY", "RUF"]
-extend-exclude = ["docs/generated"]
-force-exclude = true
-line-length = 120
-target-version = "py39"
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
"docs/examples/*" = ["ANN", "D", "B018"]
"docs/generated/*" = ["ANN", "D"]
"test_*" = ["ANN"]
-[tool.ruff.pydocstyle]
+[tool.ruff.lint.pydocstyle]
convention = "google"
[tool.mypy]
+plugins = "numpy.typing.mypy_plugin"
ignore_missing_imports = true
exclude = ["tests/fixtures/", "docs/examples/", "docs/generated/"]
warn_unused_ignores = true
diff --git a/src/imblearn_extra/__init__.py b/src/imblearn_extra/__init__.py
new file mode 100644
index 0000000..81d5b80
--- /dev/null
+++ b/src/imblearn_extra/__init__.py
@@ -0,0 +1,5 @@
+"""Novel oversampling algorithms implementations."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/src/imblearn_extra/clover/__init__.py b/src/imblearn_extra/clover/__init__.py
new file mode 100644
index 0000000..c268e8e
--- /dev/null
+++ b/src/imblearn_extra/clover/__init__.py
@@ -0,0 +1,39 @@
+"""A general interface for clustering based over-sampling algorithms.
+
+[SOMO oversampling algorithm]: https://www.sciencedirect.com/science/article/abs/pii/S0957417417302324
+[KMeans-SMOTE oversampling algorithm]: https://www.sciencedirect.com/science/article/abs/pii/S0020025518304997
+[G-SOMO oversampling algorithm]: https://www.sciencedirect.com/science/article/abs/pii/S095741742100662X
+
+The module provides the implementation of an interface for clustering-based over-sampling. It
+has two submodules:
+
+- [`distribution`][imblearn_extra.clover.distribution]: Provides the classes to distribute the generated samples into
+clusters.
+
+ - [`DensityDistributor`][imblearn_extra.clover.distribution.DensityDistributor]: Density based distributor.
+
+- [`over_sampling`][imblearn_extra.clover.over_sampling]: Provides the clustering-based oversampling algorithms.
+
+ - [`ClusterOverSampler`][imblearn_extra.clover.over_sampling.ClusterOverSampler]: Combinations of oversampler and
+ clusterer.
+    - [`KMeansSMOTE`][imblearn_extra.clover.over_sampling.KMeansSMOTE]: [KMeans-SMOTE oversampling algorithm].
+ - [`SOMO`][imblearn_extra.clover.over_sampling.SOMO]: [SOMO oversampling algorithm].
+ - [`GeometricSOMO`][imblearn_extra.clover.over_sampling.GeometricSOMO]: [G-SOMO oversampling algorithm].
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import numpy.typing as npt
+
+__all__: list[str] = []
+
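+# Type aliases for the input data, targets, cluster labels, neighbors and sample distributions used across the subpackage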
+InputData = npt.NDArray[np.float64]
+Targets = npt.NDArray[np.float64]
+Labels = npt.NDArray[np.int16]
+Neighbors = npt.NDArray[np.int16]
+MultiLabel = tuple[int, int]
+IntraDistribution = dict[MultiLabel, float]
+InterDistribution = dict[tuple[MultiLabel, MultiLabel], float]
+Density = dict[MultiLabel, float]
diff --git a/src/imblearn_extra/clover/clusterer/__init__.py b/src/imblearn_extra/clover/clusterer/__init__.py
new file mode 100644
index 0000000..14a63e6
--- /dev/null
+++ b/src/imblearn_extra/clover/clusterer/__init__.py
@@ -0,0 +1,5 @@
+"""Implementation of Self-Organizing Map."""
+
+from ._som import SOM, extract_topological_neighbors, generate_labels_mapping
+
+__all__: list[str] = ['SOM', 'extract_topological_neighbors', 'generate_labels_mapping']
diff --git a/src/imblearn_extra/clover/clusterer/_som.py b/src/imblearn_extra/clover/clusterer/_som.py
new file mode 100644
index 0000000..b4a9c71
--- /dev/null
+++ b/src/imblearn_extra/clover/clusterer/_som.py
@@ -0,0 +1,311 @@
+"""Implementation of the Self-Organizing Map (SOM) clusterer."""
+
+# Author: Georgios Douzas
+# License: BSD 3 clause
+
+from collections.abc import Callable
+from itertools import product
+from typing import Any, cast
+
+import numpy as np
+import numpy.typing as npt
+from minisom import MiniSom
+from sklearn.base import BaseEstimator, ClusterMixin
+from sklearn.preprocessing import minmax_scale
+from sklearn.utils import check_array, check_random_state
+from typing_extensions import Self
+
+
+def generate_labels_mapping(labels_coords: list[tuple[int, int]]) -> dict[tuple[int, int], int]:
+ """Generate a mapping between grid labels and cluster labels."""
+
+ # Identify unique grid labels
+ unique_labels = sorted(set(labels_coords))
+
+ # Generate mapping
+ labels_mapping = dict(zip(unique_labels, range(len(unique_labels)), strict=True))
+
+ return labels_mapping
+
+
+def extract_topological_neighbors(
+ col: int,
+ row: int,
+ topology: str,
+ n_rows: int,
+ n_columns: int,
+ labels_coords_unique: list[tuple[int, int]],
+) -> list[tuple[int, int]]:
+ """Return the topological neighbors of a neuron."""
+
+ # Return common topological neighbors for the two grid types
+ topological_neighbors = [
+ (col - 1, row),
+ (col + 1, row),
+ (col, row - 1),
+ (col, row + 1),
+ ]
+
+ # Append extra topological neighbors for hexagonal grid type
+ if topology == 'hexagonal':
+        offset = (-1) ** row  # The two extra diagonal neighbors alternate direction with the row parity
+ topological_neighbors += [
+ (col - offset, row - offset),
+ (col - offset, row + offset),
+ ]
+
+ # Apply constraints
+ topological_neighbors = [
+ (col, row)
+ for col, row in topological_neighbors
+ if 0 <= col < n_columns and 0 <= row < n_rows and (col, row) in labels_coords_unique
+ ]
+
+ return topological_neighbors
+
+
+class SOM(BaseEstimator, ClusterMixin):
+ """Class to fit and visualize a Self-Organizing Map (SOM).
+
+ The implementation uses MiniSom from minisom. Read more in the
+ [user_guide].
+
+ Args:
+ n_columns:
+ The number of columns in the map.
+
+ n_rows:
+ The number of rows in the map.
+
+ sigma:
+ Spread of the neighborhood function.
+
+ learning_rate:
+ Initial learning rate.
+
+ decay_function:
+ Function that reduces learning_rate and sigma at each iteration.
+ Possible values: 'inverse_decay_to_zero', 'linear_decay_to_zero',
+ 'asymptotic_decay' or callable.
+
+ neighborhood_function:
+ Function that weights the neighborhood of a position in the map.
+ Possible values: 'gaussian', 'mexican_hat', 'bubble', 'triangle'.
+
+ topology:
+ Topology of the map. Possible values: 'rectangular', 'hexagonal'.
+
+ activation_distance:
+ Distance used to activate the map.
+ Possible values: 'euclidean', 'cosine', 'manhattan', 'chebyshev'
+ or callable.
+
+ random_state:
+ Control the randomization of the algorithm.
+
+ - If int, `random_state` is the seed used by the random number
+ generator.
+ - If `RandomState` instance, random_state is the random number
+ generator.
+ - If `None`, the random number generator is the `RandomState`
+ instance used by `np.random`.
+ """
+
+ def __init__(
+ self: Self,
+ n_columns: int | None = None,
+ n_rows: int | None = None,
+ sigma: float = 1.0,
+ learning_rate: float = 0.5,
+ decay_function: str | Callable = 'inverse_decay_to_zero',
+ neighborhood_function: str = 'gaussian',
+ topology: str = 'rectangular',
+ activation_distance: str | Callable = 'euclidean',
+ random_state: np.random.RandomState | int | None = None,
+ ) -> None:
+ self.n_columns = n_columns
+ self.n_rows = n_rows
+ self.sigma = sigma
+ self.learning_rate = learning_rate
+ self.decay_function = decay_function
+ self.neighborhood_function = neighborhood_function
+ self.topology = topology
+ self.activation_distance = activation_distance
+ self.random_state = random_state
+
+ def _generate_neighbors(
+ self: Self,
+ labels_coords_unique: list[tuple[int, int]],
+ labels_mapping: dict[tuple[int, int], int],
+ ) -> npt.NDArray:
+ """Generate pairs of neighboring labels."""
+
+ # Generate grid topological neighbors
+ topological_neighbors = [
+ product(
+ [label_coords],
+ extract_topological_neighbors(
+ *label_coords,
+ self.topology,
+ self.n_rows_,
+ self.n_columns_,
+ labels_coords_unique,
+ ),
+ )
+ for label_coords in labels_coords_unique
+ ]
+
+ # Flatten grid topological neighbors
+ topological_neighbors_flat = cast(
+ list[tuple[tuple[int, int], tuple[int, int]]],
+ [pair for pairs in topological_neighbors for pair in pairs],
+ )
+
+ # Generate cluster neighbors
+ all_neighbors = sorted(
+ {(labels_mapping[pair[0]], labels_mapping[pair[1]]) for pair in topological_neighbors_flat},
+ )
+
+ # Keep unique unordered pairs
+ neighbors = []
+ for pair in all_neighbors:
+ if pair not in neighbors and pair[::-1] not in neighbors:
+ neighbors.append(pair)
+
+ return np.array(neighbors)
+
+ def fit(self: Self, X: npt.ArrayLike, y: npt.ArrayLike | None = None, **fit_params: dict[str, Any]) -> Self:
+ """Train the self-organizing map.
+
+ Args:
+ X:
+ Training instances to cluster.
+
+ y:
+ Ignored.
+
+ fit_params:
+                Parameters to pass to the `train` method of the `MiniSom` object.
+
+ The following parameters can be used:
+
+                    num_iteration:
+                        If `use_epochs` is `False`, the weights will be
+                        updated `num_iteration` times. Otherwise they will
+                        be updated `len(X) * num_iteration` times.
+
+ random_order:
+ If `True`, samples are picked in random order.
+ Otherwise the samples are picked sequentially.
+
+ verbose:
+ If `True` the status of the training will be
+ printed each time the weights are updated.
+
+ use_epochs:
+ If `True` the SOM will be trained for num_iteration epochs.
+ In one epoch the weights are updated `len(data)` times and
+                        the learning rate is constant throughout a single epoch.
+
+ Returns:
+ The object itself.
+ """
+ # Check random state
+ self.random_state_ = check_random_state(self.random_state).randint(low=np.iinfo(np.int32).max)
+
+ # Check and normalize input data
+ X_scaled = minmax_scale(check_array(X, dtype=np.float32))
+
+        # Initialize grid size using the heuristic of about 5 * sqrt(n_samples) neurons
+ n_neurons = 5 * np.sqrt(X_scaled.shape[0])
+ if self.n_rows is None and self.n_columns is None:
+ self.n_rows_ = self.n_columns_ = int(np.ceil(np.sqrt(n_neurons)))
+ elif self.n_rows is None and self.n_columns is not None:
+ self.n_columns_ = self.n_columns
+ self.n_rows_ = int(np.ceil(n_neurons / self.n_columns_))
+ elif self.n_columns is None and self.n_rows is not None:
+ self.n_rows_ = self.n_rows
+ self.n_columns_ = int(np.ceil(n_neurons / self.n_rows_))
+ elif self.n_columns is not None and self.n_rows is not None:
+ self.n_rows_ = self.n_rows
+ self.n_columns_ = self.n_columns
+
+ # Create MiniSom object
+ self.algorithm_ = MiniSom(
+ x=self.n_rows_,
+ y=self.n_columns_,
+ input_len=X_scaled.shape[1],
+ sigma=self.sigma,
+ learning_rate=self.learning_rate,
+ decay_function=self.decay_function,
+ neighborhood_function=self.neighborhood_function,
+ topology=self.topology,
+ activation_distance=self.activation_distance,
+ random_seed=self.random_state_,
+ )
+
+ # Fit MiniSom
+ if 'num_iteration' not in fit_params:
+ fit_params = {**fit_params, 'num_iteration': cast(Any, 1000)}
+ self.algorithm_.train(data=X_scaled, **fit_params)
+
+ # Grid labels
+ labels_coords = [(int(i), int(j)) for i, j in [self.algorithm_.winner(x_scaled) for x_scaled in X_scaled]]
+
+ # Generate labels mapping
+ self.labels_mapping_ = generate_labels_mapping(labels_coords)
+
+ # Generate cluster labels
+ self.labels_ = np.array(
+ [self.labels_mapping_[grid_label] for grid_label in labels_coords],
+ )
+
+ # Generate labels neighbors
+ self.neighbors_ = self._generate_neighbors(
+ sorted(set(labels_coords)),
+ self.labels_mapping_,
+ )
+
+ return self
+
+ def fit_predict(
+ self: Self,
+ X: npt.ArrayLike,
+ y: npt.ArrayLike | None = None,
+ **fit_params: dict[str, Any],
+ ) -> npt.NDArray:
+ """Train the self-organizing map and assign cluster labels to samples.
+
+ Args:
+ X:
+                Training instances to cluster.
+
+ y:
+ Ignored.
+
+ fit_params:
+                Parameters to pass to the `train` method of the `MiniSom` object.
+
+ The following parameters can be used:
+
+                    num_iteration:
+                        If `use_epochs` is `False`, the weights will be
+                        updated `num_iteration` times. Otherwise they will
+                        be updated `len(X) * num_iteration` times.
+
+ random_order:
+ If `True`, samples are picked in random order.
+ Otherwise the samples are picked sequentially.
+
+ verbose:
+ If `True` the status of the training will be
+ printed each time the weights are updated.
+
+ use_epochs:
+ If `True` the SOM will be trained for num_iteration epochs.
+ In one epoch the weights are updated `len(data)` times and
+                        the learning rate is constant throughout a single epoch.
+
+ Returns:
+ labels:
+ Index of the cluster each sample belongs to.
+ """
+ return self.fit(X=X, y=None, **fit_params).labels_
diff --git a/src/imblearn_extra/clover/distribution/__init__.py b/src/imblearn_extra/clover/distribution/__init__.py
new file mode 100644
index 0000000..86fd023
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/__init__.py
@@ -0,0 +1,5 @@
+"""Distributor classes for clustering-based oversampling."""
+
+from ._density import DensityDistributor
+
+__all__ = ['DensityDistributor']
diff --git a/src/imblearn_extra/clover/distribution/_density.py b/src/imblearn_extra/clover/distribution/_density.py
new file mode 100644
index 0000000..afbeb32
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/_density.py
@@ -0,0 +1,359 @@
+"""Implementation of the DensityDistributor class."""
+
+# Author: Georgios Douzas
+# Joao Fonseca
+# License: MIT
+
+from __future__ import annotations
+
+from collections import Counter
+from itertools import product
+from warnings import catch_warnings, filterwarnings
+
+import numpy as np
+from sklearn.metrics.pairwise import euclidean_distances
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Labels, Neighbors, Targets
+from .base import BaseDistributor
+
+
+class DensityDistributor(BaseDistributor):
+ """Class to perform density based distribution.
+
+ Samples are distributed based on the density of clusters.
+
+ Read more in the [user_guide].
+
+ Args:
+ filtering_threshold:
+ The threshold of a filtered cluster. It can be any non-negative number or
+ `'auto'` to be calculated automatically.
+
+ - If `'auto'`, the filtering threshold is calculated from the imbalance
+ ratio of the target for the binary case or the maximum of the target's
+ imbalance ratios for the multiclass case.
+
+ - If `float` then it is manually set to this number. Any cluster that has an
+ imbalance ratio smaller than the filtering threshold is identified as a filtered
+ cluster and can be potentially used to generate minority class instances. Higher
+ values increase the number of filtered clusters.
+
+ distances_exponent:
+ The exponent of the mean distance in the density calculation. It can be
+ any non-negative number or `'auto'` to be calculated automatically.
+
+ - If `'auto'` then it is set equal to the number of
+ features. Higher values make the calculation of density more sensitive
+ to the cluster's size i.e. clusters with large mean euclidean distance
+ between samples are penalized.
+
+ - If `float` then it is manually set to this number.
+
+ sparsity_based:
+ Whether sparse clusters receive more generated samples.
+
+ - When `True` clusters receive generated samples that are inversely
+ proportional to their density.
+
+ - When `False` clusters receive generated samples that are proportional to their density.
+
+ distribution_ratio:
+ The ratio of intra-cluster to inter-cluster generated samples. It is a
+ number in the `[0.0, 1.0]` range. The default value is `1.0`, a
+ case corresponding to only intra-cluster generation. As the number
+ decreases, less intra-cluster samples are generated. Inter-cluster
+ generation, i.e. when `distribution_ratio` is less than `1.0`,
+ requires a neighborhood structure for the clusters, i.e. a
+ `neighbors_` attribute should be created after fitting and it will
+ raise an error when it is not found.
+
+ Attributes:
+ clusters_density_ (Density):
+ Each dict key is a multi-label tuple of shape `(cluster_label,
+ class_label)`, while the values correspond to the density.
+
+ distances_exponent_ (float):
+ Actual exponent of the mean distance used in the calculations.
+
+ distribution_ratio_ (float):
+ A copy of the parameter in the constructor.
+
+ filtered_clusters_ (List[MultiLabel]):
+ Each element is a tuple of `(cluster_label, class_label)` pairs.
+
+ filtering_threshold_ (float):
+ Actual filtering threshold used in the calculations.
+
+ inter_distribution_ (InterDistribution):
+ Each dict key is a multi-label tuple of
+ shape `((cluster_label1, cluster_label2), class_label)` while the
+ values are the proportion of samples per class.
+
+ intra_distribution_ (IntraDistribution):
+ Each dict key is a multi-label tuple of shape `(cluster_label,
+ class_label)` while the values are the proportion of samples per class.
+
+ labels_ (Labels):
+ Labels of each sample.
+
+ neighbors_ (Neighbors):
+ An array that contains all neighboring pairs. Each row is
+ a unique neighboring pair.
+
+ majority_class_label_ (int):
+ The majority class label.
+
+ n_samples_ (int):
+ The number of samples.
+
+ sparsity_based_ (bool):
+ A copy of the parameter in the constructor.
+
+ unique_class_labels_ (Labels):
+ An array of unique class labels.
+
+ unique_cluster_labels_ (Labels):
+ An array of unique cluster labels.
+
+ Examples:
+        >>> from imblearn_extra.clover.distribution import DensityDistributor
+ >>> from sklearn.datasets import load_iris
+ >>> from sklearn.cluster import KMeans
+ >>> from imblearn.datasets import make_imbalance
+ >>> X, y = make_imbalance(
+ ... *load_iris(return_X_y=True),
+ ... sampling_strategy={0:50, 1:40, 2:30},
+ ... random_state=0
+ ... )
+ >>> labels = KMeans(random_state=0, n_init='auto').fit_predict(X, y)
+ >>> density_distributor = DensityDistributor().fit(X, y, labels)
+ >>> density_distributor.filtered_clusters_
+ [(6, 1), (0, 1), (3, 1), (7, 1), (5, 2), (2, 2), (3, 2), (6, 2), (0, 2)]
+ >>> density_distributor.intra_distribution_
+ {(6, 1): 0.50604609281056... (0, 1): 0.143311766542165...}
+ >>> density_distributor.inter_distribution_
+ {}
+ """
+
+ def __init__(
+ self: Self,
+ filtering_threshold: float | str = 'auto',
+ distances_exponent: float | str = 'auto',
+ sparsity_based: bool = True,
+ distribution_ratio: float = 1.0,
+ ) -> None:
+ self.filtering_threshold = filtering_threshold
+ self.distances_exponent = distances_exponent
+ self.sparsity_based = sparsity_based
+ self.distribution_ratio = distribution_ratio
+
+ def _check_parameters(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ """Check distributor parameters."""
+
+ # Filtering threshold
+ if self.filtering_threshold == 'auto':
+ counts_vals = Counter(y).values()
+ self.filtering_threshold_ = max(counts_vals) / min(counts_vals)
+ else:
+ self.filtering_threshold_ = check_scalar(
+ self.filtering_threshold,
+ 'filtering_threshold',
+ (int, float),
+ min_val=0.0,
+ )
+
+ # Distances exponent
+ if self.distances_exponent == 'auto':
+ self.distances_exponent_ = X.shape[1]
+ else:
+ self.distances_exponent_ = check_scalar(
+ self.distances_exponent,
+ 'distances_exponent',
+ (int, float),
+ min_val=0.0,
+ )
+
+ # Sparsity based
+ check_scalar(self.sparsity_based, 'sparsity_based', bool)
+ self.sparsity_based_ = self.sparsity_based
+
+ # distribution ratio
+ check_scalar(
+ self.distribution_ratio,
+ 'distribution_ratio',
+ float,
+ min_val=0.0,
+ max_val=1.0,
+ )
+ max_distribution_ratio = 1.0
+ if self.distribution_ratio < max_distribution_ratio and neighbors is None:
+            msg = 'Parameter `distribution_ratio` should be equal to 1.0 when the `neighbors` parameter is `None`.'
+ raise ValueError(msg)
+ self.distribution_ratio_ = self.distribution_ratio
+ return self
+
+ def _identify_filtered_clusters(self: Self, y: Targets) -> Self:
+ """Identify the filtered clusters."""
+ # Generate multi-label
+ multi_labels = list(zip(self.labels_, y, strict=True))
+
+ # Count multi-label
+ multi_labels_counts = Counter(multi_labels)
+
+ # Extract unique cluster and class labels
+ unique_multi_labels = [
+ multi_label for multi_label in multi_labels_counts if multi_label[1] not in self.majority_class_labels_
+ ]
+
+ # Identify filtered clusters
+ self.filtered_clusters_ = []
+ for multi_label in unique_multi_labels:
+ n_minority_samples = multi_labels_counts[multi_label]
+ n_majority_samples = multi_labels_counts[(multi_label[0], self.majority_class_labels_[0])]
+ if n_majority_samples <= n_minority_samples * self.filtering_threshold_:
+ self.filtered_clusters_.append(multi_label)
+
+ return self
+
+ def _calculate_clusters_density(self: Self, X: InputData, y: Targets) -> Self:
+ """Calculate the density of the filtered clusters."""
+ self.clusters_density_ = {}
+
+ # Calculate density
+ finite_densities = []
+ for cluster_label, class_label in self.filtered_clusters_:
+ # Calculate number of majority and minority samples in each cluster
+ mask = (self.labels_ == cluster_label) & (y == class_label)
+ n_minority_samples = mask.sum()
+
+ # Calculate density
+ n_minority_pairs = (n_minority_samples - 1) * n_minority_samples if n_minority_samples > 1 else 1
+ mean_distances = euclidean_distances(X[mask]).sum() / n_minority_pairs
+ with catch_warnings():
+ filterwarnings('ignore')
+ density = n_minority_samples / (mean_distances**self.distances_exponent_)
+ if np.isfinite(density):
+ finite_densities.append(density)
+ self.clusters_density_[(cluster_label, class_label)] = density
+
+ # Convert zero and infinite densities
+ min_density = 0.0
+ if min_density in self.clusters_density_.values():
+ self.clusters_density_ = {
+ multi_label: 1.0 for multi_label, density in self.clusters_density_.items() if density == min_density
+ }
+ self.filtered_clusters_ = [
+ multi_label for multi_label in self.filtered_clusters_ if multi_label in self.clusters_density_
+ ]
+ else:
+ max_density = max(finite_densities) if finite_densities else 1.0
+ self.clusters_density_ = {
+ multi_label: (max_density if np.isinf(density) else density)
+ for multi_label, density in self.clusters_density_.items()
+ }
+ return self
+
+ def _intra_distribute(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ """In the clusters distribution.
+
+ Distribute the generated samples in each cluster based on their density.
+ """
+
+ # Calculate weights based on density
+ weights = {
+ multi_label: (1 / density if self.sparsity_based_ else density)
+ for multi_label, density in self.clusters_density_.items()
+ }
+
+ # Calculate normalization factors
+ class_labels = {class_label for _, class_label in self.filtered_clusters_}
+ normalization_factors = {class_label: 0.0 for class_label in class_labels}
+ for (_, class_label), weight in weights.items():
+ normalization_factors[class_label] += weight
+
+ # Intra distribution
+ self.intra_distribution_ = {
+ multi_label: (self.distribution_ratio_ * weight / normalization_factors[multi_label[1]])
+ for multi_label, weight in weights.items()
+ }
+
+ return self
+
+ def _inter_distribute(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ """Between the clusters distribution.
+
+ Distribute the generated samples between clusters based on their density.
+ """
+
+ # Identify filtered neighboring clusters
+ filtered_neighbors = []
+ class_labels = {class_label for _, class_label in self.filtered_clusters_}
+ for pair, class_label in product(self.neighbors_, class_labels):
+ multi_label0 = (pair[0], class_label)
+ multi_label1 = (pair[1], class_label)
+ if multi_label0 in self.filtered_clusters_ and multi_label1 in self.filtered_clusters_:
+ filtered_neighbors.append((multi_label0, multi_label1))
+
+ # Calculate inter-cluster density
+ inter_clusters_density = {
+ multi_labels: (self.clusters_density_[multi_labels[0]] + self.clusters_density_[multi_labels[1]])
+ for multi_labels in filtered_neighbors
+ }
+
+ # Calculate weights based on density
+ weights = {
+ multi_labels: (1 / density if self.sparsity_based_ else density)
+ for multi_labels, density in inter_clusters_density.items()
+ }
+
+ # Calculate normalization factors
+ normalization_factors = {class_label: 0.0 for class_label in class_labels}
+ for multi_labels, weight in weights.items():
+ normalization_factors[multi_labels[0][1]] += weight
+
+        # Inter distribution
+ self.inter_distribution_ = {
+ multi_labels: ((1 - self.distribution_ratio_) * weight / normalization_factors[multi_labels[0][1]])
+ for multi_labels, weight in weights.items()
+ }
+
+ return self
+
+ def _fit(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ # Check distributor parameters
+ self._check_parameters(X, y, neighbors)
+
+ # Identify filtered clusters
+ self._identify_filtered_clusters(y)
+
+ # Calculate density of filtered clusters
+ self._calculate_clusters_density(X, y)
+
+ super()._fit(X, y, labels, neighbors)
+
+ return self
diff --git a/src/imblearn_extra/clover/distribution/base.py b/src/imblearn_extra/clover/distribution/base.py
new file mode 100644
index 0000000..3541476
--- /dev/null
+++ b/src/imblearn_extra/clover/distribution/base.py
@@ -0,0 +1,186 @@
+"""Base class for distributors."""
+
+# Author: Georgios Douzas
+# Joao Fonseca
+# License: MIT
+
+from __future__ import annotations
+
+from collections import Counter
+
+import numpy as np
+from sklearn.base import BaseEstimator
+from sklearn.utils import check_array, check_X_y
+from typing_extensions import Self
+
+from .. import InputData, InterDistribution, IntraDistribution, Labels, Neighbors, Targets
+
+
+class BaseDistributor(BaseEstimator):
+ """The base class for distributors.
+
+    A distributor sets the proportion of samples to be generated inside each
+    cluster and between clusters.
+
+    Warning: This class should not be used directly. Use the derived classes
+    instead.
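+
+    Examples:
+        A sketch of the default behavior, which assigns all samples to a
+        single cluster and generates no neighboring pairs (the exact key
+        types depend on the target dtype, so the doctest is skipped):
+
+        >>> import numpy as np
+        >>> X, y = np.zeros((4, 2)), np.array([0, 0, 0, 1])
+        >>> BaseDistributor().fit_distribute(X, y, None, None)  # doctest: +SKIP
+        ({(0, 1): 1.0}, {})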
+ """
+
+ def _intra_distribute(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ return self
+
+ def _inter_distribute(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ return self
+
+ def _validate_fitting(self: Self) -> Self:
+ # Check labels
+ if len(self.labels_) != self.n_samples_:
+ msg = (
+ f'Number of labels should be equal to the number of samples. '
+ f'Got {len(self.labels_)} and {self.n_samples_} instead.'
+ )
+ raise ValueError(msg)
+
+ # Check neighbors
+ if not set(self.labels_).issuperset(self.neighbors_.flatten()):
+            msg = 'Attribute `neighbors_` contains unknown labels.'
+            raise ValueError(msg)
+ unique_neighbors = {tuple(set(pair)) for pair in self.neighbors_}
+ if len(unique_neighbors) < len(self.neighbors_):
+ msg = 'Elements of `neighbors_` attribute are not unique.'
+ raise ValueError(msg)
+
+ # Check distribution
+ proportions = {
+ class_label: 0.0
+ for class_label in self.unique_class_labels_
+ if class_label not in self.majority_class_labels_
+ }
+ for (_, class_label), proportion in self.intra_distribution_.items():
+ proportions[class_label] += proportion
+ for (
+ ((cluster_label1, class_label1), (cluster_label2, class_label2)),
+ proportion,
+ ) in self.inter_distribution_.items():
+ if class_label1 != class_label2:
+ multi_label = (
+ (cluster_label1, class_label1),
+ (cluster_label2, class_label2),
+ )
+ msg = (
+ 'Multi-labels for neighboring cluster pairs should '
+ f'have a common class label. Got {multi_label} instead.'
+ )
+ raise ValueError(msg)
+ proportions[class_label1] += proportion
+ if not all(np.isclose(val, 0) or np.isclose(val, 1) for val in proportions.values()):
+ msg = (
+ 'Intra-distribution and inter-distribution sum of proportions for each '
+ f'class label should be either equal to 0 or 1. Got {proportions} instead.'
+ )
+ raise ValueError(msg)
+
+ return self
+
+ def _fit(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> Self:
+ if labels is not None:
+ self._intra_distribute(X, y, labels, neighbors)
+ if neighbors is not None:
+ self._inter_distribute(X, y, labels, neighbors)
+ return self
+
+ def fit(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None = None,
+ neighbors: Neighbors | None = None,
+ ) -> Self:
+ """Generate the intra-label and inter-label distribution.
+
+ Args:
+ X:
+ Matrix containing the data which have to be sampled.
+
+ y:
+                Corresponding label for each sample in X.
+
+            labels:
+                Labels of each sample.
+
+ neighbors:
+ An array that contains all neighboring pairs. Each row is
+ a unique neighboring pair.
+
+ Returns:
+ The object itself.
+ """
+ # Check data
+ X, y = check_X_y(X, y, dtype=None)
+
+ # Set statistics
+ counts = Counter(y)
+ self.majority_class_labels_ = [
+ class_label
+ for class_label, class_label_count in counts.items()
+ if class_label_count == max(counts.values())
+ ]
+        self.unique_cluster_labels_ = np.unique(labels) if labels is not None else np.array([0], dtype=int)
+ self.unique_class_labels_ = np.unique(y)
+ self.n_samples_ = len(X)
+
+ # Set default attributes
+ self.labels_ = np.repeat(0, len(X)) if labels is None else check_array(labels, ensure_2d=False)
+ self.neighbors_ = np.empty((0, 2), dtype=int) if neighbors is None else check_array(neighbors, ensure_2d=False)
+ self.intra_distribution_: IntraDistribution = {
+ (0, class_label): 1.0 for class_label in np.unique(y) if class_label not in self.majority_class_labels_
+ }
+ self.inter_distribution_: InterDistribution = {}
+
+ # Fit distributor
+ self._fit(X, y, labels, neighbors)
+
+ # Validate fitting procedure
+ self._validate_fitting()
+
+ return self
+
+ def fit_distribute(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ labels: Labels | None,
+ neighbors: Neighbors | None,
+ ) -> tuple[IntraDistribution, InterDistribution]:
+ """Return the intra-label and inter-label distribution.
+
+ Args:
+ X:
+ Matrix containing the data which have to be sampled.
+
+            y:
+                Corresponding label for each sample in X.
+
+            labels:
+                Labels of each sample.
+
+ neighbors:
+ An array that contains all neighboring pairs. Each row is
+ a unique neighboring pair.
+
+ Returns:
+ distributions:
+ A tuple with the two distributions.
+ """
+ self.fit(X, y, labels, neighbors)
+ return self.intra_distribution_, self.inter_distribution_
diff --git a/src/imblearn_extra/clover/over_sampling/__init__.py b/src/imblearn_extra/clover/over_sampling/__init__.py
new file mode 100644
index 0000000..30640c6
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/__init__.py
@@ -0,0 +1,28 @@
+"""This module includes classes for clustering-based oversampling.
+
+A general class for clustering-based oversampling as well as specific clustering-based oversamplers are provided.
+"""
+
+from ._cluster import (
+ ClusterOverSampler,
+ clone_modify,
+ extract_inter_data,
+ extract_intra_data,
+ generate_in_cluster,
+ modify_nn,
+)
+from ._gsomo import GeometricSOMO
+from ._kmeans_smote import KMeansSMOTE
+from ._somo import SOMO
+
+__all__: list[str] = [
+ 'ClusterOverSampler',
+ 'KMeansSMOTE',
+ 'SOMO',
+ 'GeometricSOMO',
+ 'modify_nn',
+ 'clone_modify',
+ 'extract_inter_data',
+ 'extract_intra_data',
+ 'generate_in_cluster',
+]
diff --git a/src/imblearn_extra/clover/over_sampling/_cluster.py b/src/imblearn_extra/clover/over_sampling/_cluster.py
new file mode 100644
index 0000000..eab21b5
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_cluster.py
@@ -0,0 +1,571 @@
+"""Implementation of the main class for clustering-based oversampling."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+import warnings
+from collections import Counter, OrderedDict
+
+import numpy as np
+from imblearn.over_sampling import RandomOverSampler
+from imblearn.over_sampling.base import BaseOverSampler
+from imblearn.pipeline import Pipeline
+from imblearn.utils import check_sampling_strategy
+from imblearn.utils._validation import ArraysTransformer
+from joblib import Parallel, delayed
+from sklearn.base import ClusterMixin, TransformerMixin, clone
+from sklearn.exceptions import FitFailedWarning
+from sklearn.neighbors import NearestNeighbors
+from sklearn.preprocessing import label_binarize
+from sklearn.utils import check_random_state
+from sklearn.utils.multiclass import check_classification_targets
+from typing_extensions import Self
+
+from .. import InputData, InterDistribution, IntraDistribution, Labels, Targets
+from ..distribution import DensityDistributor
+from ..distribution.base import BaseDistributor
+
+
+def modify_nn(n_neighbors: NearestNeighbors | int, n_samples: int) -> NearestNeighbors | int:
+ """Modify the nearest neighbors object.
+
+ Args:
+ n_neighbors:
+ The `NearestNeighbors` object or number.
+ n_samples:
+ The number of samples.
+
+ Returns:
+ The modified `NearestNeighbors` object or number.
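+
+    Examples:
+        A minimal sketch with integer values:
+
+        >>> modify_nn(5, 3)
+        2
+        >>> modify_nn(5, 10)
+        5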
+ """
+ if isinstance(n_neighbors, NearestNeighbors):
+ n_neighbors = (
+ clone(n_neighbors).set_params(n_neighbors=n_samples - 1)
+ if n_neighbors.n_neighbors >= n_samples
+ else clone(n_neighbors)
+ )
+ elif isinstance(n_neighbors, int) and n_neighbors >= n_samples:
+ n_neighbors = n_samples - 1
+ return n_neighbors
+
+
+def clone_modify(oversampler: BaseOverSampler, class_label: int, y_in_cluster: Targets) -> BaseOverSampler:
+ """Clone and modify attributes of oversampler for corner cases.
+
+ Args:
+ oversampler:
+ The oversampler to modify its attributes.
+ class_label:
+ The class label.
+ y_in_cluster:
+ The data of the target in the cluster.
+
+ Returns:
+ A cloned oversampler with modified number of nearest neighbors.
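+
+    Examples:
+        A minimal sketch; with two minority samples, the cloned SMOTE gets
+        `k_neighbors` reduced below the minority class size:
+
+        >>> import numpy as np
+        >>> from imblearn.over_sampling import SMOTE
+        >>> clone_modify(SMOTE(), 1, np.array([1, 1, 0, 0, 0])).k_neighbors
+        1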
+ """
+ # Clone oversampler
+ oversampler = clone(oversampler)
+
+ # Not modify attributes case
+ if isinstance(oversampler, RandomOverSampler):
+ return oversampler
+
+ # Select and modify oversampler
+ n_minority_samples = Counter(y_in_cluster)[class_label]
+ if n_minority_samples == 1:
+ oversampler = RandomOverSampler()
+ else:
+ if hasattr(oversampler, 'k_neighbors'):
+ oversampler.k_neighbors = modify_nn(oversampler.k_neighbors, n_minority_samples)
+ if hasattr(oversampler, 'm_neighbors'):
+ oversampler.m_neighbors = modify_nn(oversampler.m_neighbors, y_in_cluster.size)
+ if hasattr(oversampler, 'n_neighbors'):
+ oversampler.n_neighbors = modify_nn(oversampler.n_neighbors, n_minority_samples)
+ return oversampler
+
+
+def extract_intra_data(
+ X: InputData,
+ y: Targets,
+ cluster_labels: Labels,
+ intra_distribution: IntraDistribution,
+ sampling_strategy: OrderedDict[int, int],
+) -> list[tuple[dict[int, int], InputData, Targets]]:
+ """Extract data for each filtered cluster.
+
+ Args:
+ X:
+ The input data.
+ y:
+ The targets.
+ cluster_labels:
+ The cluster labels.
+ intra_distribution:
+ The intra-clusters distributions.
+ sampling_strategy:
+ The sampling strategy to follow.
+
+ Returns:
+ The intra-clusters data.
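+
+    Examples:
+        A minimal sketch with a single filtered cluster that receives two
+        generated samples of class `1`:
+
+        >>> import numpy as np
+        >>> from collections import OrderedDict
+        >>> X = np.arange(8.0).reshape(4, 2)
+        >>> y = np.array([0, 0, 0, 1])
+        >>> clusters_data = extract_intra_data(
+        ...     X, y, np.array([0, 0, 1, 1]), {(1, 1): 1.0}, OrderedDict({1: 2}),
+        ... )
+        >>> clusters_data[0][0]
+        {1: 2}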
+ """
+ majority_class_label = Counter(y).most_common()[0][0]
+
+ # Get offsets
+ selected_multi_labels = []
+ classes_labels = {class_label for _, class_label in intra_distribution}
+ distribution_value_tie = 0.5
+ for selected_class_label in classes_labels:
+ intra_distribution_class_label = {
+ (cluster_label, class_label): proportion
+ for (cluster_label, class_label), proportion in intra_distribution.items()
+ if class_label == selected_class_label
+ }
+ selected_multi_label = max(
+ intra_distribution_class_label,
+ key=lambda multi_label: intra_distribution_class_label[multi_label],
+ )
+ if intra_distribution_class_label[selected_multi_label] <= distribution_value_tie:
+ selected_multi_labels.append(selected_multi_label)
+
+ # Get clusters data
+ clusters_data = []
+ for (cluster_label, class_label), proportion in intra_distribution.items():
+ mask = (cluster_labels == cluster_label) & (np.isin(y, [majority_class_label, class_label]))
+ offset = int((cluster_label, class_label) in selected_multi_labels)
+ n_minority_samples = int(round(sampling_strategy[class_label] * proportion)) + offset
+ X_in_cluster, y_in_cluster = X[mask], y[mask]
+ cluster_sampling_strategy = {class_label: n_minority_samples}
+ if n_minority_samples > 0:
+ clusters_data.append((cluster_sampling_strategy, X_in_cluster, y_in_cluster))
+ return clusters_data
+
+
+def extract_inter_data(
+ X: InputData,
+ y: Targets,
+ cluster_labels: Labels,
+ inter_distribution: InterDistribution,
+ sampling_strategy: OrderedDict[int, int],
+ random_state: np.random.RandomState,
+) -> list[tuple[dict[int, int], InputData, Targets]]:
+ """Extract data between filtered clusters.
+
+ Args:
+ X:
+ The input data.
+ y:
+ The targets.
+ cluster_labels:
+ The cluster labels.
+ inter_distribution:
+ The inter-clusters distributions.
+ sampling_strategy:
+ The sampling strategy to follow.
+ random_state:
+ Control the randomization of the algorithm.
+
+ Returns:
+ The inter-clusters data.
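+
+    Examples:
+        A minimal sketch with a single pair of neighboring filtered clusters;
+        each generated sample combines one minority sample from each cluster
+        with the majority samples of both:
+
+        >>> import numpy as np
+        >>> from collections import OrderedDict
+        >>> X = np.arange(12.0).reshape(6, 2)
+        >>> y = np.array([0, 0, 0, 0, 1, 1])
+        >>> clusters_data = extract_inter_data(
+        ...     X, y, np.array([0, 0, 0, 1, 0, 1]),
+        ...     {((0, 1), (1, 1)): 1.0}, OrderedDict({1: 2}),
+        ...     np.random.RandomState(0),
+        ... )
+        >>> [strategy for strategy, _, _ in clusters_data]
+        [{1: 1}, {1: 1}]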
+ """
+ majority_class_label = Counter(y).most_common()[0][0]
+ clusters_data = []
+ for (
+ ((cluster_label1, class_label1), (cluster_label2, class_label2)),
+ proportion,
+ ) in inter_distribution.items():
+ mask1 = (cluster_labels == cluster_label1) & (np.isin(y, [majority_class_label, class_label1]))
+ mask2 = (cluster_labels == cluster_label2) & (np.isin(y, [majority_class_label, class_label2]))
+ X1, X2, y1, y2 = X[mask1], X[mask2], y[mask1], y[mask2]
+ majority_mask1, majority_mask2 = (
+ (y1 == majority_class_label),
+ (y2 == majority_class_label),
+ )
+ n_minority_samples = int(round(sampling_strategy[class_label1] * proportion))
+ for _ in range(n_minority_samples):
+ ind1, ind2 = (
+ random_state.randint(0, (~majority_mask1).sum()),
+ random_state.randint(0, (~majority_mask2).sum()),
+ )
+ X_in_clusters = np.vstack(
+ (
+ X1[~majority_mask1][ind1].reshape(1, -1),
+ X2[~majority_mask2][ind2].reshape(1, -1),
+ X1[majority_mask1],
+ X2[majority_mask2],
+ ),
+ )
+ y_in_clusters = np.hstack(
+ (
+ y1[~majority_mask1][ind1],
+ y2[~majority_mask2][ind2],
+ y1[majority_mask1],
+ y2[majority_mask2],
+ ),
+ )
+ clusters_sampling_strategy = {class_label1: 1}
+ clusters_data.append((clusters_sampling_strategy, X_in_clusters, y_in_clusters))
+ return clusters_data
+
+
+def generate_in_cluster(
+ oversampler: BaseOverSampler,
+ transformer: TransformerMixin,
+ cluster_sampling_strategy: dict[int, int],
+ X_in_cluster: InputData,
+ y_in_cluster: Targets,
+) -> tuple[InputData, Targets]:
+ """Generate intra-cluster or inter-cluster new samples.
+
+ Args:
+ oversampler:
+ Oversampler to apply to each selected cluster.
+ transformer:
+ Transformer to apply before oversampling.
+ cluster_sampling_strategy:
+ The sampling strategy in the cluster.
+ X_in_cluster:
+ The input data in the cluster.
+ y_in_cluster:
+ The targets in the cluster.
+
+ Returns:
+ X_new:
+            The generated samples.
+        y_new:
+            The corresponding labels of the generated samples.
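+
+    Examples:
+        A minimal sketch; it relies on the private `_fit_resample` method of
+        the oversampler, so the doctest is skipped:
+
+        >>> import numpy as np
+        >>> from imblearn.over_sampling import SMOTE
+        >>> X_in = np.arange(6.0).reshape(6, 1)
+        >>> y_in = np.array([0, 0, 0, 0, 1, 1])
+        >>> X_new, y_new = generate_in_cluster(
+        ...     SMOTE(random_state=0), None, {1: 2}, X_in, y_in,
+        ... )  # doctest: +SKIP
+        >>> y_new  # doctest: +SKIP
+        array([1, 1])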
+ """
+
+ # Create oversampler for specific cluster and class
+ class_label = next(iter(cluster_sampling_strategy.keys()))
+ oversampler = clone_modify(oversampler, class_label, y_in_cluster)
+ oversampler.sampling_strategy_ = cluster_sampling_strategy
+ oversampler.n_features_in_ = X_in_cluster.shape[1]
+
+ # Resample cluster and class data
+ X_res, y_res = oversampler._fit_resample(
+ transformer.transform(X_in_cluster) if transformer is not None else X_in_cluster,
+ y_in_cluster,
+ )
+
+ # Filter only new data
+ X_new, y_new = X_res[len(X_in_cluster) :], y_res[len(y_in_cluster) :]
+
+ return X_new, y_new
+
+
+class ClusterOverSampler(BaseOverSampler):
+ """A class that handles clustering-based oversampling.
+
+ Any combination of oversampler, clusterer and distributor can
+ be used.
+
+ Read more in the [user_guide].
+
+ Args:
+ oversampler:
+ Oversampler to apply to each selected cluster.
+
+ clusterer:
+ Clusterer to apply to input space before oversampling.
+
+ - When `None`, it corresponds to a clusterer that assigns
+ a single cluster to all the samples equivalent to no clustering.
+
+ - When clusterer is given, it applies clustering to the input space. Then
+ oversampling is applied inside each cluster and between clusters.
+
+ distributor:
+ Distributor to distribute the generated samples per cluster label.
+
+ - When `None` and a clusterer is provided then it corresponds to the
+              density distributor. If clusterer is also `None` then the distributor
+ does not affect the over-sampling procedure.
+
+ - When distributor object is provided, it is used to distribute the
+ generated samples to the clusters.
+
+ raise_error:
+ Raise an error when no samples are generated.
+
+ - If `True`, it raises an error when no filtered clusters are
+ identified and therefore no samples are generated.
+
+ - If `False`, it displays a warning.
+
+ random_state:
+ Control the randomization of the algorithm.
+
+ - If `int`, it is the seed used by the random number
+ generator.
+ - If `np.random.RandomState` instance, it is the random number
+ generator.
+ - If `None`, the random number generator is the `RandomState`
+ instance used by `np.random`.
+
+ n_jobs:
+ Number of CPU cores used.
+
+ - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+ Attributes:
+ oversampler_ (imblearn.over_sampling.base.BaseOverSampler):
+ A fitted clone of the `oversampler` parameter.
+
+ clusterer_ (sklearn.base.ClusterMixin):
+ A fitted clone of the `clusterer` parameter or `None` when a
+ clusterer is not given.
+
+ distributor_ (clover.distribution.base.BaseDistributor):
+ A fitted clone of the `distributor` parameter or a fitted instance of
+ the `DensityDistributor` when a distributor is not given.
+
+ labels_ (Labels):
+ Cluster labels of each sample.
+
+ neighbors_ (Neighbors):
+ An array that contains all neighboring pairs with each row being
+ a unique neighboring pair. It is `None` when the clusterer does not
+ support this attribute.
+
+ random_state_ (np.random.RandomState):
+ An instance of `np.random.RandomState` class.
+
+ sampling_strategy_ (dict[int, int]):
+ Actual sampling strategy.
+
+ Examples:
+ >>> from collections import Counter
+        >>> from imblearn_extra.clover.over_sampling import ClusterOverSampler
+ >>> from sklearn.datasets import make_classification
+ >>> from sklearn.cluster import KMeans
+ >>> from imblearn.over_sampling import SMOTE
+ >>> X, y = make_classification(random_state=0, n_classes=2, weights=[0.9, 0.1])
+ >>> print('Original dataset shape %s' % Counter(y))
+        Original dataset shape Counter({0: 90, 1: 10})
+ >>> cluster_oversampler = ClusterOverSampler(
+ ... oversampler=SMOTE(random_state=5),
+ ... clusterer=KMeans(random_state=10, n_init='auto'))
+ >>> X_res, y_res = cluster_oversampler.fit_resample(X, y)
+ >>> print('Resampled dataset shape %s' % Counter(y_res))
+        Resampled dataset shape Counter({0: 90, 1: 90})
+ """
+
+ def __init__(
+ self: Self,
+ oversampler: BaseOverSampler,
+ clusterer: ClusterMixin | None = None,
+ distributor: BaseDistributor | None = None,
+ raise_error: bool = True,
+ random_state: np.random.RandomState | int | None = None,
+ n_jobs: int | None = None,
+ ) -> None:
+ self.oversampler = oversampler
+ self.clusterer = clusterer
+ self.distributor = distributor
+ self.raise_error = raise_error
+ self.random_state = random_state
+ self.n_jobs = n_jobs
+
+ def fit(self: Self, X: InputData, y: Targets) -> Self:
+ """Check inputs and statistics of the sampler.
+
+ You should use `fit_resample` to generate the synthetic data.
+
+ Args:
+ X:
+ Data array.
+ y:
+ Target array.
+
+ Returns:
+ self:
+ Return the instance itself.
+ """
+ X, y, _ = self._check_X_y(X, y)
+ self._check(X, y)
+ return self
+
+ def fit_resample(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ **fit_params: dict[str, str],
+ ) -> tuple[InputData, Targets]:
+ """Resample the dataset.
+
+ Args:
+ X:
+ Matrix containing the data which have to be sampled.
+ y:
+ Corresponding label for each sample in X.
+ fit_params:
+ Parameters passed to the fit method of the clusterer.
+
+ Returns:
+ X_resampled:
+ The array containing the resampled data.
+ y_resampled:
+ The corresponding label of resampled data.
+ """
+ check_classification_targets(y)
+ arrays_transformer = ArraysTransformer(X, y)
+ X, y, binarize_y = self._check_X_y(X, y)
+
+ self._check(X, y)._fit(X, y, **fit_params)
+
+ output = self._fit_resample(X, y)
+
+ y_ = label_binarize(y=output[1], classes=np.unique(y)) if binarize_y else output[1]
+
+ X_, y_ = arrays_transformer.transform(output[0], y_)
+ return (X_, y_)
+
+ def _cluster_sample(
+ self: Self,
+ clusters_data: list[tuple[dict[int, int], InputData, Targets]],
+ X: InputData,
+ y: Targets,
+ ) -> tuple[InputData, Targets] | None:
+ generated_data = Parallel(n_jobs=self.n_jobs)(
+ delayed(generate_in_cluster)(self.oversampler_, self.transformer_, *data) for data in clusters_data
+ )
+ if generated_data:
+ X, y = (np.concatenate(data) for data in zip(*generated_data, strict=True))
+ return X, y
+ return None
+
+ def _intra_sample(self: Self, X: InputData, y: Targets) -> tuple[InputData, Targets] | None:
+ clusters_data = extract_intra_data(
+ X,
+ y,
+ self.labels_,
+ self.distributor_.intra_distribution_,
+ self.sampling_strategy_,
+ )
+ return self._cluster_sample(clusters_data, X, y)
+
+ def _inter_sample(self: Self, X: InputData, y: Targets) -> tuple[InputData, Targets] | None:
+ clusters_data = extract_inter_data(
+ X,
+ y,
+ self.labels_,
+ self.distributor_.inter_distribution_,
+ self.sampling_strategy_,
+ self.random_state_,
+ )
+ return self._cluster_sample(clusters_data, X, y)
+
+ def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+ # Check transformer and oversampler
+ if isinstance(self.oversampler, Pipeline):
+ if self.oversampler.steps[:-1]:
+ self.transformer_ = Pipeline(self.oversampler.steps[:-1]).fit(X)
+ self.oversampler_ = clone(self.oversampler.steps[-1][-1])
+ else:
+ self.oversampler_ = clone(self.oversampler)
+
+ # Check clusterer and distributor
+ if self.clusterer is None and self.distributor is not None:
+ msg = (
+ 'Distributor was found but clusterer is set to `None`. '
+ 'Either set parameter `distributor` to `None` or use a clusterer.'
+ )
+ raise ValueError(msg)
+ elif self.clusterer is None and self.distributor is None:
+ self.clusterer_ = None
+ self.distributor_ = BaseDistributor()
+ else:
+ self.clusterer_ = clone(self.clusterer)
+ self.distributor_ = DensityDistributor() if self.distributor is None else clone(self.distributor)
+ return self
+
+ def _check_sampling_strategy(self: Self, y: Targets) -> Self:
+ self.sampling_strategy_ = check_sampling_strategy(
+ self.oversampler_.sampling_strategy,
+ y,
+ self._sampling_type,
+ )
+ return self
+
+ def _check(self: Self, X: InputData, y: Targets) -> Self:
+ # Check random state
+ self.random_state_ = check_random_state(self.random_state)
+
+ # Check transformer
+ self.transformer_ = None
+
+ # Check estimators and sampling strategy
+ self._check_estimators(X, y)._check_sampling_strategy(y)
+
+ return self
+
+ def _fit(self: Self, X: InputData, y: Targets, **fit_params: dict[str, str]) -> Self:
+ # Fit clusterer
+ if self.clusterer_ is not None:
+ self.clusterer_.fit(X, y, **fit_params)
+
+ # Extract labels and neighbors
+ self.labels_ = getattr(self.clusterer_, 'labels_', np.zeros(len(X), dtype=int))
+ self.neighbors_ = getattr(self.clusterer_, 'neighbors_', None)
+
+ # fit distributor
+ self.distributor_.fit(X, y, labels=self.labels_, neighbors=self.neighbors_)
+
+ # Case when no samples are generated
+ if not self.distributor_.intra_distribution_ and not self.distributor_.inter_distribution_:
+ msg = 'No samples were generated. Try to modify the parameters of the clusterer or distributor.'
+
+ # Raise error
+ if self.raise_error:
+ raise ValueError(msg)
+
+ # Display warning
+ else:
+ warnings.warn(msg, FitFailedWarning, stacklevel=1)
+
+ return self
+
+ def _fit_resample(
+ self: Self,
+ X: InputData,
+ y: Targets,
+ **fit_params: dict[str, str],
+ ) -> tuple[InputData, Targets]:
+ # Intracluster oversampling
+ data_intra = self._intra_sample(X, y)
+ if data_intra is not None:
+ X_intra_new, y_intra_new = data_intra
+ else:
+ X_intra_new, y_intra_new = None, None
+ intra_count: Counter = Counter(y_intra_new)
+
+ # Intercluster oversampling
+ data_inter = self._inter_sample(X, y)
+ if data_inter is not None:
+ X_inter_new, y_inter_new = data_inter
+ else:
+ X_inter_new, y_inter_new = None, None
+ inter_count: Counter = Counter(y_inter_new)
+
+ # Set sampling strategy
+ self.sampling_strategy_ = OrderedDict({})
+ for class_label in set(intra_count.keys()).union(inter_count.keys()):
+ self.sampling_strategy_[class_label] = intra_count.get(class_label, 0) + inter_count.get(class_label, 0)
+
+ # Stack resampled data
+ X_resampled_unstacked = [
+ self.transformer_.transform(X) if self.transformer_ is not None else X,
+ X_intra_new,
+ X_inter_new,
+ ]
+ y_resampled_unstacked = [y, y_intra_new, y_inter_new]
+ X_resampled, y_resampled = (
+ np.vstack([X for X in X_resampled_unstacked if X is not None]),
+ np.hstack([y for y in y_resampled_unstacked if y is not None]),
+ )
+
+ return X_resampled, y_resampled
diff --git a/src/imblearn_extra/clover/over_sampling/_gsomo.py b/src/imblearn_extra/clover/over_sampling/_gsomo.py
new file mode 100644
index 0000000..b0553bc
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_gsomo.py
@@ -0,0 +1,263 @@
+"""Includes the implementation of SOMO."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from math import sqrt
+
+import numpy as np
+from sklearn.base import clone
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from ...gsmote import GeometricSMOTE
+from .. import InputData, Targets
+from ..clusterer import SOM
+from ..distribution import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class GeometricSOMO(ClusterOverSampler):
+ """Geometric SOMO algorithm.
+
+ Applies the SOM algorithm to the input space before applying Geometric
+ SMOTE. Read more in the [user_guide].
+
+ Args:
+ sampling_strategy:
+ Sampling information to resample the data set.
+
+ - When `float`, it corresponds to the desired ratio of the number of
+ samples in the minority class over the number of samples in the
+ majority class after resampling. It is only available for binary
+ classification.
+
+ - When `str`, specify the class targeted by the resampling. The
+ number of samples in the different classes will be equalized.
+ Possible choices are:
+ - `'minority'`: resample only the minority class.
+ - `'not minority'`: resample all classes but the minority class.
+ - `'not majority'`: resample all classes but the majority class.
+ - `'all'`: resample all classes.
+ - `'auto'`: equivalent to `'not majority'`.
+
+ - When `dict`, the keys correspond to the targeted classes. The
+ values correspond to the desired number of samples for each targeted
+ class.
+
+            - When callable, a function taking `y` and returning a `dict`. The keys
+ correspond to the targeted classes. The values correspond to the
+ desired number of samples for each class.
+
+ random_state:
+ Control the randomization of the algorithm.
+
+ - If `int`, it is the seed used by the random number
+ generator.
+ - If `np.random.RandomState` instance, it is the random number
+ generator.
+ - If `None`, the random number generator is the `RandomState`
+ instance used by `np.random`.
+
+ k_neighbors:
+ Defines the number of nearest neighbors to be used by SMOTE.
+
+ - If `int`, this number is used to construct synthetic
+ samples.
+
+ - If `object`, an estimator that inherits from
+ `sklearn.neighbors.base.KNeighborsMixin` that will be
+ used to find the number of nearest neighbors.
+
+ truncation_factor:
+ The type of truncation. The values should be in the `[-1.0, 1.0]` range.
+
+ deformation_factor:
+ The type of geometry. The values should be in the `[0.0, 1.0]` range.
+
+ selection_strategy:
+ The type of Geometric SMOTE algorithm with the following options:
+ `'combined'`, `'majority'`, `'minority'`.
+
+ som_estimator:
+ Defines the SOM clusterer applied to the input space.
+
+ - If `None`, a `clover.clusterer.SOM` instance with default parameters is used.
+
+            - If a `clover.clusterer.SOM` object, then it is used with the given parameters.
+
+ - If `int`, the number of clusters to be used.
+
+ - If `float`, the proportion of the number of clusters over the number
+ of samples to be used.
+
+ imbalance_ratio_threshold:
+ The threshold of a filtered cluster. It can be any non-negative number or
+ `'auto'` to be calculated automatically.
+
+ - If `'auto'`, the filtering threshold is calculated from the imbalance
+ ratio of the target for the binary case or the maximum of the target's
+ imbalance ratios for the multiclass case.
+
+ - If `float` then it is manually set to this number.
+
+ Any cluster that has an imbalance ratio smaller than the filtering threshold is
+ identified as a filtered cluster and can be potentially used to generate
+ minority class instances. Higher values increase the number of filtered
+ clusters.
+
+ distances_exponent:
+ The exponent of the mean distance in the density calculation. It can be
+ any non-negative number or `'auto'` to be calculated automatically.
+
+ - If `'auto'` then it is set equal to the number of
+ features. Higher values make the calculation of density more sensitive
+ to the cluster's size i.e. clusters with large mean euclidean distance
+ between samples are penalized.
+
+ - If `float` then it is manually set to this number.
+
+ distribution_ratio:
+ The ratio of intra-cluster to inter-cluster generated samples. It is a
+ number in the `[0.0, 1.0]` range. The default value is `0.8`, a
+ number equal to the proportion of intra-cluster generated samples over
+ the total number of generated samples. As the number decreases, less
+ intra-cluster and more inter-cluster samples are generated.
+
+ raise_error:
+ Raise an error when no samples are generated.
+
+ - If `True`, it raises an error when no filtered clusters are
+ identified and therefore no samples are generated.
+
+ - If `False`, it displays a warning.
+
+ n_jobs:
+ Number of CPU cores used.
+
+ - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+ Attributes:
+ oversampler_ (gsmote.GeometricSMOTE):
+ A fitted `gsmote.GeometricSMOTE` instance.
+
+ clusterer_ (SOM):
+            A fitted `clover.clusterer.SOM` instance.
+
+ distributor_ (clover.distribution.DensityDistributor):
+ A fitted `clover.distribution.DensityDistributor` instance.
+
+ labels_ (Labels):
+ Labels of each sample.
+
+ neighbors_ (Neighbors):
+ An array that contains all neighboring pairs with each row being
+ a unique neighboring pair.
+
+ random_state_ (numpy.random.RandomState):
+ An instance of `numpy.random.RandomState` class.
+
+ sampling_strategy_ (dict[int, int]):
+ Actual sampling strategy.
+
+ Examples:
+ >>> import numpy as np
+        >>> from imblearn_extra.clover.over_sampling import GeometricSOMO  # doctest: +SKIP
+ >>> from sklearn.datasets import make_blobs
+ >>> blobs = [100, 800, 100]
+ >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+ >>> # Add a single 0 sample in the middle blob
+ >>> X = np.concatenate([X, [[0, 0]]])
+ >>> y = np.append(y, 0)
+ >>> # Make this a binary classification problem
+ >>> y = y == 1
+ >>> gsomo = GeometricSOMO(random_state=42) # doctest: +SKIP
+ >>> X_res, y_res = gsomo.fit_resample(X, y) # doctest: +SKIP
+ >>> # Find the number of new samples in the middle blob
+ >>> right, left = X_res[:, 0] > -5, X_res[:, 0] < 5 # doctest: +SKIP
+ >>> n_res_in_middle = (right & left).sum() # doctest: +SKIP
+ >>> print("Samples in the middle blob: %s" % n_res_in_middle) # doctest: +SKIP
+ Samples in the middle blob: 801
+ >>> unchanged = n_res_in_middle == blobs[1] + 1 # doctest: +SKIP
+ >>> print("Middle blob unchanged: %s" % unchanged) # doctest: +SKIP
+ Middle blob unchanged: True
+ >>> more_zero_samples = (y_res == 0).sum() > (y == 0).sum() # doctest: +SKIP
+ >>> print("More 0 samples: %s" % more_zero_samples) # doctest: +SKIP
+ More 0 samples: True
+ """
+
+ def __init__(
+ self: Self,
+ sampling_strategy: dict[int, int] | str | float | Callable[[Targets], dict[int, int]] = 'auto',
+ random_state: np.random.RandomState | int | None = None,
+ k_neighbors: NearestNeighbors | int = 5,
+ truncation_factor: float = 1.0,
+ deformation_factor: float = 0.0,
+ selection_strategy: str = 'combined',
+ som_estimator: SOM | None = None,
+ imbalance_ratio_threshold: float | str = 'auto',
+ distances_exponent: float | str = 'auto',
+ distribution_ratio: float = 0.8,
+ raise_error: bool = True,
+ n_jobs: int | None = None,
+ ) -> None:
+ self.sampling_strategy = sampling_strategy
+ self.random_state = random_state
+ self.k_neighbors = k_neighbors
+ self.truncation_factor = truncation_factor
+ self.deformation_factor = deformation_factor
+ self.selection_strategy = selection_strategy
+ self.som_estimator = som_estimator
+ self.distribution_ratio = distribution_ratio
+ self.imbalance_ratio_threshold = imbalance_ratio_threshold
+ self.distances_exponent = distances_exponent
+ self.raise_error = raise_error
+ self.n_jobs = n_jobs
+
+ def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+ """Check various estimators."""
+ # Check oversampler
+ self.oversampler_ = GeometricSMOTE(
+ sampling_strategy=self.sampling_strategy,
+ k_neighbors=self.k_neighbors,
+ truncation_factor=self.truncation_factor,
+ deformation_factor=self.deformation_factor,
+ selection_strategy=self.selection_strategy,
+ random_state=self.random_state_,
+ n_jobs=self.n_jobs,
+ )
+
+ if self.som_estimator is None:
+ self.clusterer_ = SOM(random_state=self.random_state_)
+ elif isinstance(self.som_estimator, int):
+ check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
+ n = round(sqrt(self.som_estimator))
+ self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+ elif isinstance(self.som_estimator, float):
+ check_scalar(self.som_estimator, 'som_estimator', float, min_val=0.0, max_val=1.0)
+ n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1))
+ self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+ elif isinstance(self.som_estimator, SOM):
+ self.clusterer_ = clone(self.som_estimator)
+ else:
+ msg = (
+ 'Parameter `som_estimator` should be either `None` or the number of clusters or a float '
+ 'in the [0.0, 1.0] range equal to the number of clusters over the number of '
+ 'samples or an instance of the `SOM` class.'
+ )
+ raise TypeError(msg)
+
+ # Check distributor
+ self.distributor_ = DensityDistributor(
+ filtering_threshold=self.imbalance_ratio_threshold,
+ distances_exponent=self.distances_exponent,
+ distribution_ratio=self.distribution_ratio,
+ )
+
+ return self
diff --git a/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py b/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py
new file mode 100644
index 0000000..f717912
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_kmeans_smote.py
@@ -0,0 +1,239 @@
+"""Includes the implementation of KMeans-SMOTE."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+import numpy as np
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.cluster import KMeans, MiniBatchKMeans
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Targets
+from ..distribution._density import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class KMeansSMOTE(ClusterOverSampler):
+ """KMeans-SMOTE algorithm.
+
+ Applies KMeans clustering to the input space before applying SMOTE. Read
+ more in the [user_guide].
+
+ Args:
+ sampling_strategy:
+ Sampling information to resample the data set.
+
+ - When `float`, it corresponds to the desired ratio of the number of
+ samples in the minority class over the number of samples in the
+ majority class after resampling. It is only available for binary
+ classification.
+
+ - When `str`, specify the class targeted by the resampling. The
+ number of samples in the different classes will be equalized.
+ Possible choices are:
+ - `'minority'`: resample only the minority class.
+ - `'not minority'`: resample all classes but the minority class.
+ - `'not majority'`: resample all classes but the majority class.
+ - `'all'`: resample all classes.
+ - `'auto'`: equivalent to `'not majority'`.
+
+ - When `dict`, the keys correspond to the targeted classes. The
+ values correspond to the desired number of samples for each targeted
+ class.
+
+            - When callable, a function taking `y` and returning a `dict`. The keys
+ correspond to the targeted classes. The values correspond to the
+ desired number of samples for each class.
+
+ random_state:
+ Control the randomization of the algorithm.
+
+ - If `int`, it is the seed used by the random number
+ generator.
+ - If `np.random.RandomState` instance, it is the random number
+ generator.
+ - If `None`, the random number generator is the `RandomState`
+ instance used by `np.random`.
+
+ k_neighbors:
+ Defines the number of nearest neighbors to be used by SMOTE.
+
+ - If `int`, this number is used to construct synthetic
+ samples.
+
+ - If `object`, an estimator that inherits from
+ `sklearn.neighbors.base.KNeighborsMixin` that will be
+ used to find the number of nearest neighbors.
+
+ kmeans_estimator:
+ Defines the KMeans clusterer applied to the input space.
+
+ - If `None`, `sklearn.cluster.MiniBatchKMeans` is used which
+              tends to perform better with a large number of samples.
+
+            - If a KMeans object, then an instance of either
+              `sklearn.cluster.KMeans` or `sklearn.cluster.MiniBatchKMeans` is used.
+
+ - If `int`, the number of clusters to be used.
+
+ - If `float`, the proportion of the number of clusters over the number
+ of samples to be used.
+
+ imbalance_ratio_threshold:
+ The threshold of a filtered cluster. It can be any non-negative number or
+ `'auto'` to be calculated automatically.
+
+ - If `'auto'`, the filtering threshold is calculated from the imbalance
+ ratio of the target for the binary case or the maximum of the target's
+ imbalance ratios for the multiclass case.
+
+ - If `float` then it is manually set to this number.
+
+ Any cluster that has an imbalance ratio smaller than the filtering threshold is
+ identified as a filtered cluster and can be potentially used to generate
+ minority class instances. Higher values increase the number of filtered
+ clusters.
+
+ distances_exponent:
+ The exponent of the mean distance in the density calculation. It can be
+ any non-negative number or `'auto'` to be calculated automatically.
+
+ - If `'auto'` then it is set equal to the number of
+ features. Higher values make the calculation of density more sensitive
+ to the cluster's size i.e. clusters with large mean euclidean distance
+ between samples are penalized.
+
+ - If `float` then it is manually set to this number.
+
+ raise_error:
+ Raise an error when no samples are generated.
+
+ - If `True`, it raises an error when no filtered clusters are
+ identified and therefore no samples are generated.
+
+ - If `False`, it displays a warning.
+
+ n_jobs:
+ Number of CPU cores used.
+
+ - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+            - If `-1`, it means using all processors.
+
+ Attributes:
+ oversampler_ (imblearn.over_sampling.SMOTE):
+ A fitted `imblearn.over_sampling.SMOTE` instance.
+
+ clusterer_ (sklearn.cluster.KMeans | sklearn.cluster.MiniBatchKMeans):
+ A fitted `sklearn.cluster.KMeans` or `sklearn.cluster.MiniBatchKMeans` instance.
+
+ distributor_ (clover.distribution.DensityDistributor):
+ A fitted `clover.distribution.DensityDistributor` instance.
+
+ labels_ (Labels):
+ Cluster labels of each sample.
+
+ neighbors_ (None):
+ It is `None` since KMeans does not support this attribute.
+
+ random_state_ (np.random.RandomState):
+ An instance of `np.random.RandomState` class.
+
+ sampling_strategy_ (dict[int, int]):
+ Actual sampling strategy.
+
+ Examples:
+ >>> import numpy as np
+        >>> from imblearn_extra.clover.over_sampling import KMeansSMOTE
+ >>> from sklearn.datasets import make_blobs
+ >>> blobs = [100, 800, 100]
+ >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+ >>> # Add a single 0 sample in the middle blob
+ >>> X = np.concatenate([X, [[0, 0]]])
+ >>> y = np.append(y, 0)
+ >>> # Make this a binary classification problem
+ >>> y = y == 1
+ >>> kmeans_smote = KMeansSMOTE(random_state=42)
+ >>> X_res, y_res = kmeans_smote.fit_resample(X, y)
+ >>> # Find the number of new samples in the middle blob
+ >>> n_res_in_middle = ((X_res[:, 0] > -5) & (X_res[:, 0] < 5)).sum()
+ >>> print("Samples in the middle blob: %s" % n_res_in_middle)
+ Samples in the middle blob: 801
+ >>> print("Middle blob unchanged: %s" % (n_res_in_middle == blobs[1] + 1))
+ Middle blob unchanged: True
+ >>> print("More 0 samples: %s" % ((y_res == 0).sum() > (y == 0).sum()))
+ More 0 samples: True
+ """
+
+ def __init__(
+ self: Self,
+ sampling_strategy: dict[int, int] | str = 'auto',
+ random_state: np.random.RandomState | int | None = None,
+ k_neighbors: NearestNeighbors | int = 5,
+ kmeans_estimator: KMeans | None = None,
+ imbalance_ratio_threshold: float | str = 'auto',
+ distances_exponent: float | str = 'auto',
+ raise_error: bool = True,
+ n_jobs: int | None = None,
+ ) -> None:
+ self.sampling_strategy = sampling_strategy
+ self.random_state = random_state
+ self.k_neighbors = k_neighbors
+ self.kmeans_estimator = kmeans_estimator
+ self.imbalance_ratio_threshold = imbalance_ratio_threshold
+ self.distances_exponent = distances_exponent
+ self.raise_error = raise_error
+ self.n_jobs = n_jobs
+
+ def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+ """Check various estimators."""
+ # Check oversampler
+ self.oversampler_ = SMOTE(
+ sampling_strategy=self.sampling_strategy,
+ k_neighbors=self.k_neighbors,
+ random_state=self.random_state_,
+ n_jobs=self.n_jobs,
+ )
+
+ # Check clusterer
+ if self.kmeans_estimator is None:
+ self.clusterer_ = MiniBatchKMeans(random_state=self.random_state_, n_init='auto')
+ elif isinstance(self.kmeans_estimator, int):
+ check_scalar(self.kmeans_estimator, 'kmeans_estimator', int, min_val=1)
+ self.clusterer_ = MiniBatchKMeans(
+ n_clusters=self.kmeans_estimator,
+ random_state=self.random_state_,
+ n_init='auto',
+ )
+ elif isinstance(self.kmeans_estimator, float):
+ check_scalar(
+ self.kmeans_estimator,
+ 'kmeans_estimator',
+ float,
+ min_val=0.0,
+ max_val=1.0,
+ )
+ n_clusters = round((X.shape[0] - 1) * self.kmeans_estimator + 1)
+ self.clusterer_ = MiniBatchKMeans(n_clusters=n_clusters, random_state=self.random_state_, n_init='auto')
+ elif isinstance(self.kmeans_estimator, KMeans | MiniBatchKMeans):
+ self.clusterer_ = clone(self.kmeans_estimator)
+ else:
+ msg = (
+ 'Parameter `kmeans_estimator` should be either `None` or the number of clusters '
+ 'or a float in the [0.0, 1.0] range equal to the number of clusters over the number '
+ 'of samples or an instance of either `KMeans` or `MiniBatchKMeans` class.'
+ )
+ raise TypeError(msg)
+
+ # Check distributor
+ self.distributor_ = DensityDistributor(
+ filtering_threshold=self.imbalance_ratio_threshold,
+ distances_exponent=self.distances_exponent,
+ )
+
+ return self
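
As a quick illustrative sketch (not part of the diff), the `kmeans_estimator` branches of `_check_estimators` above resolve to a concrete clusterer as follows; the helper name `resolve_kmeans` and the sample sizes are hypothetical.

```python
# Sketch only: mirrors the int/float branches of `_check_estimators` above.
from sklearn.cluster import MiniBatchKMeans


def resolve_kmeans(kmeans_estimator, n_samples):
    """Hypothetical helper illustrating the parameter semantics."""
    if kmeans_estimator is None:
        return MiniBatchKMeans(n_init='auto')  # default number of clusters
    if isinstance(kmeans_estimator, int):
        return MiniBatchKMeans(n_clusters=kmeans_estimator, n_init='auto')
    # float in [0.0, 1.0]: interpolates between 1 cluster (0.0)
    # and one cluster per sample (1.0)
    n_clusters = round((n_samples - 1) * kmeans_estimator + 1)
    return MiniBatchKMeans(n_clusters=n_clusters, n_init='auto')


print(resolve_kmeans(0.5, n_samples=101).n_clusters)  # round(100 * 0.5 + 1) == 51
```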
diff --git a/src/imblearn_extra/clover/over_sampling/_somo.py b/src/imblearn_extra/clover/over_sampling/_somo.py
new file mode 100644
index 0000000..81057ca
--- /dev/null
+++ b/src/imblearn_extra/clover/over_sampling/_somo.py
@@ -0,0 +1,215 @@
+"""Includes the implementation of SOMO."""
+
+# Author: Georgios Douzas
+# License: MIT
+
+from __future__ import annotations
+
+from math import sqrt
+
+import numpy as np
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_scalar
+from typing_extensions import Self
+
+from .. import InputData, Targets
+from ..clusterer import SOM
+from ..distribution._density import DensityDistributor
+from ._cluster import ClusterOverSampler
+
+
+class SOMO(ClusterOverSampler):
+ """SOMO algorithm.
+
+ Applies the SOM algorithm to the input space before applying SMOTE. Read
+ more in the [user_guide].
+
+ Args:
+ sampling_strategy:
+ Sampling information to resample the data set.
+
+ - When `float`, it corresponds to the desired ratio of the number of
+ samples in the minority class over the number of samples in the
+ majority class after resampling. It is only available for binary
+ classification.
+
+ - When `str`, specify the class targeted by the resampling. The
+ number of samples in the different classes will be equalized.
+ Possible choices are:
+ - `'minority'`: resample only the minority class.
+ - `'not minority'`: resample all classes but the minority class.
+ - `'not majority'`: resample all classes but the majority class.
+ - `'all'`: resample all classes.
+ - `'auto'`: equivalent to `'not majority'`.
+
+ - When `dict`, the keys correspond to the targeted classes. The
+ values correspond to the desired number of samples for each targeted
+ class.
+
+ - When callable, a function taking `y` and returning a `dict`. The keys
+ correspond to the targeted classes. The values correspond to the
+ desired number of samples for each class.
+
+ random_state:
+ Control the randomization of the algorithm.
+
+ - If `int`, it is the seed used by the random number
+ generator.
+ - If `np.random.RandomState` instance, it is the random number
+ generator.
+ - If `None`, the random number generator is the `RandomState`
+ instance used by `np.random`.
+
+ k_neighbors:
+ Defines the number of nearest neighbors to be used by SMOTE.
+
+ - If `int`, this number is used to construct synthetic
+ samples.
+
+ - If `object`, an estimator that inherits from
+ `sklearn.neighbors.base.KNeighborsMixin` and is used
+ to find the nearest neighbors.
+
+ som_estimator:
+ Defines the SOM clusterer applied to the input space.
+
+ - If `None`, a `clover.clusterer.SOM` instance with the
+ default parameters is used.
+
+ - If SOM object, an instance of `clover.clusterer.SOM`.
+
+ - If `int`, the number of clusters to be used.
+
+ - If `float`, the proportion of the number of clusters over the number
+ of samples to be used.
+
+ distribution_ratio:
+ The ratio of intra-cluster to inter-cluster generated samples. It is a
+ number in the `[0.0, 1.0]` range equal to the proportion of
+ intra-cluster generated samples over the total number of generated
+ samples. As the ratio decreases, fewer intra-cluster and more
+ inter-cluster samples are generated. The default value is `0.8`.
+
+ raise_error:
+ Raise an error when no samples are generated.
+
+ - If `True`, it raises an error when no filtered clusters are
+ identified and therefore no samples are generated.
+
+ - If `False`, it displays a warning.
+
+ n_jobs:
+ Number of CPU cores used.
+
+ - If `None`, it means `1` unless in a `joblib.parallel_backend` context.
+
+ - If `-1`, all processors are used.
+
+ Attributes:
+ oversampler_ (imblearn.over_sampling.SMOTE):
+ A fitted `imblearn.over_sampling.SMOTE` instance.
+
+ clusterer_ (clover.clusterer.SOM):
+ A fitted `clover.clusterer.SOM` instance.
+
+ distributor_ (clover.distribution.DensityDistributor):
+ A fitted `clover.distribution.DensityDistributor` instance.
+
+ labels_ (Labels):
+ Cluster labels of each sample.
+
+ neighbors_ (Neighbors):
+ An array that contains all neighboring pairs with each row being
+ a unique neighboring pair.
+
+ random_state_ (np.random.RandomState):
+ An instance of `np.random.RandomState` class.
+
+ sampling_strategy_ (dict[int, int]):
+ Actual sampling strategy.
+
+ Examples:
+ >>> import numpy as np
+ >>> from clover.over_sampling import SOMO # doctest: +SKIP
+ >>> from sklearn.datasets import make_blobs
+ >>> blobs = [100, 800, 100]
+ >>> X, y = make_blobs(blobs, centers=[(-10, 0), (0,0), (10, 0)])
+ >>> # Add a single 0 sample in the middle blob
+ >>> X = np.concatenate([X, [[0, 0]]])
+ >>> y = np.append(y, 0)
+ >>> # Make this a binary classification problem
+ >>> y = y == 1
+ >>> somo = SOMO(random_state=42) # doctest: +SKIP
+ >>> X_res, y_res = somo.fit_resample(X, y) # doctest: +SKIP
+ >>> # Find the number of new samples in the middle blob
+ >>> right, left = X_res[:, 0] > -5, X_res[:, 0] < 5 # doctest: +SKIP
+ >>> n_res_in_middle = (right & left).sum() # doctest: +SKIP
+ >>> print("Samples in the middle blob: %s" % n_res_in_middle) # doctest: +SKIP
+ Samples in the middle blob: 801
+ >>> unchanged = n_res_in_middle == blobs[1] + 1 # doctest: +SKIP
+ >>> print("Middle blob unchanged: %s" % unchanged) # doctest: +SKIP
+ Middle blob unchanged: True
+ >>> more_zero_samples = (y_res == 0).sum() > (y == 0).sum() # doctest: +SKIP
+ >>> print("More 0 samples: %s" % more_zero_samples) # doctest: +SKIP
+ More 0 samples: True
+ """
+
+ def __init__(
+ self: Self,
+ sampling_strategy: dict[int, int] | str = 'auto',
+ random_state: np.random.RandomState | int | None = None,
+ k_neighbors: NearestNeighbors | int = 5,
+ som_estimator: SOM | None = None,
+ distribution_ratio: float = 0.8,
+ raise_error: bool = True,
+ n_jobs: int | None = None,
+ ) -> None:
+ self.sampling_strategy = sampling_strategy
+ self.random_state = random_state
+ self.k_neighbors = k_neighbors
+ self.som_estimator = som_estimator
+ self.distribution_ratio = distribution_ratio
+ self.raise_error = raise_error
+ self.n_jobs = n_jobs
+
+ def _check_estimators(self: Self, X: InputData, y: Targets) -> Self:
+ """Check various estimators."""
+ # Check oversampler
+ self.oversampler_ = SMOTE(
+ sampling_strategy=self.sampling_strategy,
+ k_neighbors=self.k_neighbors,
+ random_state=self.random_state_,
+ n_jobs=self.n_jobs,
+ )
+
+ # Check clusterer and number of clusters
+ if self.som_estimator is None:
+ self.clusterer_ = SOM(random_state=self.random_state_)
+ elif isinstance(self.som_estimator, int):
+ check_scalar(self.som_estimator, 'som_estimator', int, min_val=1)
+ n = round(sqrt(self.som_estimator))
+ self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+ elif isinstance(self.som_estimator, float):
+ check_scalar(self.som_estimator, 'som_estimator', float, min_val=0, max_val=1)
+ n = round(sqrt((X.shape[0] - 1) * self.som_estimator + 1))
+ self.clusterer_ = SOM(n_columns=n, n_rows=n, random_state=self.random_state_)
+ elif isinstance(self.som_estimator, SOM):
+ self.clusterer_ = clone(self.som_estimator)
+ else:
+ msg = (
+ 'Parameter `som_estimator` should be either `None` or the number of '
+ 'clusters or a float in the [0.0, 1.0] range equal to the number of '
+ 'clusters over the number of samples or an instance of the `SOM` class.'
+ )
+ raise TypeError(msg)
+
+ # Check distributor
+ self.distributor_ = DensityDistributor(
+ distribution_ratio=self.distribution_ratio,
+ filtering_threshold=1,
+ distances_exponent=2,
+ )
+
+ return self
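
As a hedged sketch (names hypothetical, not part of the diff), the `som_estimator` branches above size a square SOM grid whose neuron count approximates the requested number of clusters:

```python
from math import sqrt


def som_grid_size(som_estimator, n_samples):
    """Hypothetical helper mirroring the int/float branches above."""
    if isinstance(som_estimator, int):  # requested number of clusters
        return round(sqrt(som_estimator))
    # float in [0.0, 1.0]: proportion of clusters over samples
    return round(sqrt((n_samples - 1) * som_estimator + 1))


print(som_grid_size(12, n_samples=5000))  # 3, i.e. a 3x3 grid of 9 neurons
print(som_grid_size(0.5, n_samples=101))  # round(sqrt(51)) == 7
```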
diff --git a/src/gsmote/__init__.py b/src/imblearn_extra/gsmote/__init__.py
similarity index 87%
rename from src/gsmote/__init__.py
rename to src/imblearn_extra/gsmote/__init__.py
index f7a6a6c..df4e810 100644
--- a/src/gsmote/__init__.py
+++ b/src/imblearn_extra/gsmote/__init__.py
@@ -1,7 +1,6 @@
"""Implementation of the Geometric SMOTE algorithm.
-A geometrically enhanced drop-in replacement for SMOTE. It is compatible
-with scikit-learn and imbalanced-learn.
+A geometrically enhanced drop-in replacement for SMOTE. It is compatible with scikit-learn and imbalanced-learn.
"""
from __future__ import annotations
diff --git a/src/gsmote/geometric_smote.py b/src/imblearn_extra/gsmote/geometric_smote.py
similarity index 98%
rename from src/gsmote/geometric_smote.py
rename to src/imblearn_extra/gsmote/geometric_smote.py
index ad570ef..e729c56 100644
--- a/src/gsmote/geometric_smote.py
+++ b/src/imblearn_extra/gsmote/geometric_smote.py
@@ -6,6 +6,7 @@
import math
from collections import Counter
+from collections.abc import Callable
import numpy as np
from imblearn.over_sampling.base import BaseOverSampler
@@ -93,6 +94,7 @@ def populate_categorical_features(
for start_idx, end_idx in zip(
np.cumsum(categories_size)[:-1],
np.cumsum(categories_size)[1:],
+ strict=False,
):
col_maxs = neighbors[:, start_idx:end_idx].sum(axis=0)
is_max = np.isclose(col_maxs, col_maxs.max(axis=0))
@@ -214,7 +216,7 @@ class GeometricSMOTE(BaseOverSampler):
def __init__(
self: Self,
- sampling_strategy: dict[int, int] | str = 'auto',
+ sampling_strategy: dict[int, int] | str | float | Callable = 'auto',
k_neighbors: NearestNeighbors | int = 5,
truncation_factor: float = 1.0,
deformation_factor: float = 0.0,
@@ -386,7 +388,7 @@ def _make_geometric_samples( # noqa: C901
# Generate new samples
X_new = np.zeros((n_samples, X.shape[1]))
all_neighbors = []
- for ind, (row, col) in enumerate(zip(rows, cols)):
+ for ind, (row, col) in enumerate(zip(rows, cols, strict=False)):
# Define center point
center = X_pos[row]
diff --git a/src/gsmote/py.typed b/src/imblearn_extra/py.typed
similarity index 100%
rename from src/gsmote/py.typed
rename to src/imblearn_extra/py.typed
diff --git a/tests/__init__.py b/tests/__init__.py
index 15928cd..6084b50 100644
--- a/tests/__init__.py
+++ b/tests/__init__.py
@@ -1,4 +1,4 @@
-"""Tests suite for `gsmote`."""
+"""Tests suite for `imblearn_extra`."""
from pathlib import Path
diff --git a/tests/clover/__init__.py b/tests/clover/__init__.py
new file mode 100644
index 0000000..a0384c7
--- /dev/null
+++ b/tests/clover/__init__.py
@@ -0,0 +1 @@
+"""Tests for clustering-based oversampling."""
diff --git a/tests/clover/clusterer/__init__.py b/tests/clover/clusterer/__init__.py
new file mode 100644
index 0000000..3acbc95
--- /dev/null
+++ b/tests/clover/clusterer/__init__.py
@@ -0,0 +1 @@
+"""Tests for clusterer classes."""
diff --git a/tests/clover/clusterer/test_som.py b/tests/clover/clusterer/test_som.py
new file mode 100644
index 0000000..3a181e9
--- /dev/null
+++ b/tests/clover/clusterer/test_som.py
@@ -0,0 +1,74 @@
+"""Test the _som module."""
+
+import numpy as np
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.clusterer import SOM, extract_topological_neighbors, generate_labels_mapping
+
+RANDOM_STATE = 5
+X, _ = make_classification(random_state=RANDOM_STATE, n_samples=1000)
+
+
+def test_generate_labels_mapping():
+ """Test the generation of the labels mapping."""
+ grid_labels = [(1, 1), (0, 0), (0, 1), (1, 0), (1, 1), (1, 0), (0, 1)]
+ labels_mapping = {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}
+ assert generate_labels_mapping(grid_labels) == labels_mapping
+
+
+def test_extract_topological_neighbors_rectangular():
+ """Test the topological neighbors of a neuron for rectangular grid type."""
+ som = SOM(random_state=RANDOM_STATE).fit(X)
+ labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]})
+ assert extract_topological_neighbors(0, 0, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [
+ (1, 0),
+ (0, 1),
+ ]
+ assert extract_topological_neighbors(1, 1, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [
+ (0, 1),
+ (2, 1),
+ (1, 0),
+ ]
+
+
+def test_extract_topological_neighbors_hexagonal():
+ """Test the topological neighbors of a neuron for hexagonal grid type."""
+ som = SOM(random_state=RANDOM_STATE, topology='hexagonal').fit(X)
+ labels_coords_unique = list({(int(i), int(j)) for i, j in [som.algorithm_.winner(x) for x in X]})
+ assert extract_topological_neighbors(0, 0, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [
+ (1, 0),
+ (0, 1),
+ ]
+ assert extract_topological_neighbors(1, 1, som.topology, som.n_rows_, som.n_columns_, labels_coords_unique) == [
+ (0, 1),
+ (2, 1),
+ (1, 0),
+ (2, 2),
+ (2, 0),
+ ]
+
+
+def test_no_fit():
+ """Test the SOM initialization."""
+ som = SOM(random_state=RANDOM_STATE)
+ assert not hasattr(som, 'labels_')
+ assert not hasattr(som, 'neighbors_')
+ assert not hasattr(som, 'algorithm_')
+ assert not hasattr(som, 'n_columns_')
+ assert not hasattr(som, 'n_rows_')
+ assert not hasattr(som, 'labels_mapping_')
+
+
+def test_fit():
+ """Test the SOM fitting process."""
+ n_rows = 5
+ n_columns = 3
+ som = SOM(n_rows=n_rows, n_columns=n_columns, random_state=RANDOM_STATE)
+ som.fit(X)
+ assert all(np.unique(som.labels_) >= 0)
+ assert all(np.unique(som.labels_) < n_rows * n_columns)
+ assert som.n_rows_ == n_rows
+ assert som.n_columns_ == n_columns
+ assert hasattr(som, 'neighbors_')
+ assert hasattr(som, 'algorithm_')
+ assert hasattr(som, 'labels_mapping_')
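
The expected dictionary in `test_generate_labels_mapping` above implies that grid coordinates are assigned flat labels in sorted order. A minimal equivalent computation — an assumption inferred from the test expectations, not from the implementation:

```python
grid_labels = [(1, 1), (0, 0), (0, 1), (1, 0), (1, 1), (1, 0), (0, 1)]
# Sort the unique grid coordinates and enumerate them to obtain flat labels.
mapping = {coords: label for label, coords in enumerate(sorted(set(grid_labels)))}
print(mapping)  # {(0, 0): 0, (0, 1): 1, (1, 0): 2, (1, 1): 3}
```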
diff --git a/tests/clover/distribution/__init__.py b/tests/clover/distribution/__init__.py
new file mode 100644
index 0000000..96ac577
--- /dev/null
+++ b/tests/clover/distribution/__init__.py
@@ -0,0 +1 @@
+"""Tests for distribution classes."""
diff --git a/tests/clover/distribution/test_base.py b/tests/clover/distribution/test_base.py
new file mode 100644
index 0000000..c748860
--- /dev/null
+++ b/tests/clover/distribution/test_base.py
@@ -0,0 +1,26 @@
+"""Test the _base module."""
+
+import numpy as np
+import pytest
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.distribution.base import BaseDistributor
+
+
+@pytest.mark.parametrize(("n_samples", "n_classes", "weights"), [(20, 2, [0.8, 0.2]), (10, 3, [0.6, 0.2, 0.2])])
+def test_fit(n_samples, n_classes, weights):
+ """Test fit method."""
+ X, y = make_classification(
+ random_state=0,
+ n_samples=n_samples,
+ n_classes=n_classes,
+ weights=weights,
+ n_informative=5,
+ )
+ distributor = BaseDistributor().fit(X, y)
+ assert len(distributor.majority_class_labels_) == 1
+ assert distributor.majority_class_labels_[0] == 0
+ np.testing.assert_array_equal(distributor.labels_, np.repeat(0, n_samples))
+ np.testing.assert_array_equal(distributor.neighbors_, np.empty((0, 2)))
+ assert distributor.intra_distribution_ == {(0, class_label): 1.0 for class_label in range(1, n_classes)}
+ assert distributor.inter_distribution_ == {}
diff --git a/tests/clover/distribution/test_density.py b/tests/clover/distribution/test_density.py
new file mode 100644
index 0000000..4cfab85
--- /dev/null
+++ b/tests/clover/distribution/test_density.py
@@ -0,0 +1,347 @@
+"""Test the _density module."""
+
+import numpy as np
+import pytest
+from sklearn.base import clone
+
+from imblearn_extra.clover.distribution._density import DensityDistributor
+
+X = np.array(
+ [
+ [1.0, 1.0],
+ [1.0, 2.0],
+ [1.5, 1.5],
+ [-1.0, 1.0],
+ [-1.0, 1.5],
+ [-1.0, -1.0],
+ [2.0, -1.0],
+ [2.5, -1.0],
+ [2.5, -1.5],
+ [2.0, -1.5],
+ [2.0, -2.0],
+ [2.0, -2.5],
+ [3.0, -1.0],
+ [2.0, -1.0],
+ [4.0, -1.0],
+ ],
+)
+y_bin = np.array([1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1])
+y_multi = np.array([0, 1, 2, 1, 2, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2])
+y_partial_tie = np.array([0, 1, 1, 2, 2, 1, 1, 0, 0, 0, 0, 0, 1, 1, 2])
+y_full_tie = np.array([0, 1, 2, 1, 2, 1, 2, 2, 0, 0, 0, 0, 1, 1, 2])
+LABELS = np.array([0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 4, 4, 4])
+NEIGHBORS_BIN = np.array([(0, 1), (0, 2), (0, 3), (4, 2), (2, 3)])
+NEIGHBORS_MULTI = np.array([(0, 1), (1, 4), (2, 3)])
+DISTRIBUTOR = DensityDistributor(filtering_threshold=0.6, distances_exponent=1)
+
+
+def test_filtered_clusters_binary():
+ """Test the identification of filtered clusters.
+
+ Binary case.
+ """
+ distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS)
+ assert distributor.filtered_clusters_ == [(0, 1), (2, 1), (4, 1)]
+
+
+def test_filtered_clusters_multiclass():
+ """Test the identification of filtered clusters.
+
+ Multiclass case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS)
+ assert distributor.filtered_clusters_ == [
+ (0, 1),
+ (0, 2),
+ (1, 1),
+ (1, 2),
+ (4, 1),
+ (4, 2),
+ ]
+
+
+def test_filtered_clusters_multiclass_partial_tie():
+ """Test the identification of filtered clusters.
+
+ Multiclass case with partial tie.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS)
+ assert distributor.filtered_clusters_ == [(1, 2), (4, 2)]
+
+
+def test_filtered_clusters_multiclass_full_tie():
+ """Test the identification of filtered clusters.
+
+ Multiclass case with full tie.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_full_tie, LABELS)
+ assert distributor.filtered_clusters_ == []
+
+
+def test_clusters_density_binary():
+ """Test the filtered clusters density.
+
+ Binary case.
+ """
+ distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS)
+ assert distributor.clusters_density_ == {(0, 1): 2.0, (2, 1): 2.25, (4, 1): 2.25}
+
+
+def test_clusters_density_multiclass():
+ """Test the filtered clusters density.
+
+ Multiclass case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS)
+ assert distributor.clusters_density_ == {
+ (0, 1): 2.0,
+ (0, 2): 2.0,
+ (1, 1): 2.0,
+ (1, 2): 2.0,
+ (4, 1): 2.0,
+ (4, 2): 2.0,
+ }
+
+
+def test_clusters_density_multiclass_partial_tie():
+ """Test filtered clusters density.
+
+ Multiclass case with partial tie.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS)
+ assert distributor.clusters_density_ == {
+ (1, 2): 4.0,
+ (4, 2): 4.0,
+ }
+
+
+def test_clusters_density_multiclass_full_tie():
+ """Test filtered clusters density.
+
+ Multiclass case with full tie.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_full_tie, LABELS)
+ assert distributor.clusters_density_ == {}
+
+
+def test_clusters_density_no_filtered():
+ """Test filter clusters density.
+
+ No filtered clusters case.
+ """
+ X = np.arange(0.0, 5.0).reshape(-1, 1)
+ y = np.array([0, 0, 0, 1, 1])
+ labels = np.array([-1, -1, -1, -1, -1])
+ distributor = clone(DISTRIBUTOR).set_params().fit(X, y, labels)
+ assert distributor.clusters_density_ == {}
+
+
+def test_raise_error_filtering_threshold():
+ """Test raise error for filtering threshold.
+
+ Value and type error cases.
+ """
+ with pytest.raises(ValueError, match='filtering_threshold == -1.0, must be >= 0.0'):
+ clone(DISTRIBUTOR).set_params(filtering_threshold=-1.0).fit(X, y_bin, LABELS)
+ with pytest.raises(TypeError, match='filtering_threshold must be an instance of {int, float}, not NoneType'):
+ clone(DISTRIBUTOR).set_params(filtering_threshold=None).fit(X, y_bin, LABELS)
+ with pytest.raises(TypeError, match='filtering_threshold must be an instance of {int, float}, not str'):
+ clone(DISTRIBUTOR).set_params(filtering_threshold='value').fit(X, y_bin, LABELS)
+
+
+def test_raise_error_distances_exponent():
+ """Test raise error for distances exponent.
+
+ Value and type error cases.
+ """
+ with pytest.raises(ValueError, match='distances_exponent == -1.0, must be >= 0.0'):
+ clone(DISTRIBUTOR).set_params(distances_exponent=-1.0).fit(X, y_bin, LABELS)
+ with pytest.raises(TypeError, match='distances_exponent must be an instance of {int, float}, not None'):
+ clone(DISTRIBUTOR).set_params(distances_exponent=None).fit(X, y_bin, LABELS)
+ with pytest.raises(TypeError, match='distances_exponent must be an instance of {int, float}, not str'):
+ clone(DISTRIBUTOR).set_params(distances_exponent='value').fit(X, y_bin, LABELS)
+
+
+def test_raise_error_sparsity_based():
+ """Test raise error for sparsity based.
+
+ Type error case.
+ """
+ with pytest.raises(TypeError, match='sparsity_based must be an instance of bool, not NoneType'):
+ clone(DISTRIBUTOR).set_params(sparsity_based=None).fit(X, y_bin, LABELS)
+
+
+def test_raise_error_distribution_ratio():
+ """Test raise error for distribution ratio.
+
+ Value and type error cases.
+ """
+ with pytest.raises(ValueError, match='distribution_ratio == -1.0, must be >= 0.0'):
+ clone(DISTRIBUTOR).set_params(distribution_ratio=-1.0).fit(X, y_bin, LABELS)
+ with pytest.raises(ValueError, match='distribution_ratio == 2.0, must be <= 1.0'):
+ clone(DISTRIBUTOR).set_params(distribution_ratio=2.0).fit(X, y_bin, LABELS)
+ with pytest.raises(TypeError, match='distribution_ratio must be an instance of float, not str'):
+ clone(DISTRIBUTOR).set_params(distribution_ratio='value').fit(X, y_bin, LABELS)
+
+
+def test_raise_error_no_neighbors_distribution_ratio():
+ """Test distribution ratio.
+
+ No neighbors value error case.
+ """
+ with pytest.raises(
+ ValueError,
+ match=('Parameter `distribution_ratio` should be equal to 1.0, when `neighbors` parameter is `None`.'),
+ ):
+ clone(DISTRIBUTOR).set_params(distribution_ratio=0.5).fit(X, y_bin, LABELS, neighbors=None)
+
+
+def test_fit_default():
+ """Test fit method.
+
+ Default initialization case.
+ """
+ distributor = clone(DISTRIBUTOR).fit(X, y_bin, None, None)
+ assert distributor.majority_class_labels_ == [0]
+ assert hasattr(distributor, 'filtered_clusters_')
+ assert hasattr(distributor, 'clusters_density_')
+ np.testing.assert_array_equal(distributor.labels_, np.repeat(0, len(X)))
+ np.testing.assert_array_equal(distributor.neighbors_, np.empty((0, 2)))
+ assert distributor.intra_distribution_ == {(0, 1): 1.0}
+ assert distributor.inter_distribution_ == {}
+
+
+def test_fit_binary_intra():
+ """Test fit method.
+
+ Binary and intra-cluster generation case.
+ """
+ distributor = clone(DISTRIBUTOR).fit(X, y_bin, LABELS)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 9.0 / 25.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(2, 1)], 8.0 / 25.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 8.0 / 25.0)
+
+
+def test_fit_multiclass_intra():
+ """Test fit method.
+
+ Multiclass and intra-cluster generation case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_multi, LABELS)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 1.0 / 3.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 1)], 1.0 / 3.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 1.0 / 3.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 2)], 1.0 / 3.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 3.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 3.0)
+
+
+def test_fit_multiclass_intra_partial_tie():
+ """Test fit method.
+
+ Multiclass intra-cluster generation and partial tie case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(filtering_threshold=1.0).fit(X, y_partial_tie, LABELS)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 0.5)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 0.5)
+
+
+def test_fit_binary_inter():
+ """Test fit method.
+
+ Binary and inter-cluster generation case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(distribution_ratio=0.0).fit(X, y_bin, LABELS, NEIGHBORS_BIN)
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_BIN)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (2, 1))], 18.0 / 35.0)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((4, 1), (2, 1))], 17.0 / 35.0)
+
+
+def test_fit_multiclass_inter():
+ """Test fit method.
+
+ Multiclass and inter-cluster generation case.
+ """
+ distributor = (
+ clone(DISTRIBUTOR)
+ .set_params(distribution_ratio=0.0, filtering_threshold=1.0)
+ .fit(X, y_multi, LABELS, NEIGHBORS_MULTI)
+ )
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (1, 1))], 0.5)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 1), (4, 1))], 0.5)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 2), (1, 2))], 0.5)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.5)
+
+
+def test_fit_multiclass_inter_partial_tie():
+ """Test fit method.
+
+ Multiclass, inter-cluster generation and partial tie case.
+ """
+ distributor = (
+ clone(DISTRIBUTOR)
+ .set_params(distribution_ratio=0.0, filtering_threshold=1.0)
+ .fit(X, y_partial_tie, LABELS, NEIGHBORS_MULTI)
+ )
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 1)
+
+
+def test_fit_binary_intra_inter():
+ """Test fit method.
+
+ Binary, intra-cluster generation and inter-cluster generation case.
+ """
+ distributor = clone(DISTRIBUTOR).set_params(distribution_ratio=0.5).fit(X, y_bin, LABELS, NEIGHBORS_BIN)
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_BIN)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 9.0 / 50.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(2, 1)], 8.0 / 50.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 8.0 / 50.0)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (2, 1))], 18.0 / 70.0)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((4, 1), (2, 1))], 17.0 / 70.0)
+
+
+def test_fit_multiclass_intra_inter():
+ """Test fit method.
+
+ Multiclass, intra-cluster generation and inter-cluster generation case.
+ """
+ distributor = (
+ clone(DISTRIBUTOR)
+ .set_params(distribution_ratio=0.5, filtering_threshold=1.0)
+ .fit(X, y_multi, LABELS, NEIGHBORS_MULTI)
+ )
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 1)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 1)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 1)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(0, 2)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 6.0)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 1), (1, 1))], 0.25)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 1), (4, 1))], 0.25)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((0, 2), (1, 2))], 0.25)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.25)
+
+
+def test_fit_multiclass_intra_inter_partial_tie():
+ """Test fit method.
+
+ Multiclass, intra-cluster generation, inter-cluster generation, and partial tie case.
+ """
+ distributor = (
+ clone(DISTRIBUTOR)
+ .set_params(distribution_ratio=0.5, filtering_threshold=1.0)
+ .fit(X, y_partial_tie, LABELS, NEIGHBORS_MULTI)
+ )
+ np.testing.assert_equal(distributor.labels_, LABELS)
+ np.testing.assert_equal(distributor.neighbors_, NEIGHBORS_MULTI)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(1, 2)], 1.0 / 4.0)
+ np.testing.assert_almost_equal(distributor.intra_distribution_[(4, 2)], 1.0 / 4.0)
+ np.testing.assert_almost_equal(distributor.inter_distribution_[((1, 2), (4, 2))], 0.5)
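
The expected values in these tests are consistent with a simple reading — an inference from the asserted numbers, not a statement of the library's internals: a filtered cluster's density is its minority sample count divided by the mean pairwise distance raised to `distances_exponent`, intra-cluster weights are inverse densities normalized to sum to one, and `distribution_ratio` `d` scales the intra block by `d` and the inter block by `1 - d`. A back-of-envelope check against the binary case:

```python
from itertools import combinations

import numpy as np

# Minority (class 1) samples of cluster 4 from the test data above.
cluster4 = np.array([[3.0, -1.0], [2.0, -1.0], [4.0, -1.0]])
dists = [np.linalg.norm(a - b) for a, b in combinations(cluster4, 2)]
exponent = 1  # distances_exponent used by DISTRIBUTOR above
density = len(cluster4) / np.mean(dists) ** exponent
print(density)  # 2.25, matching clusters_density_[(4, 1)]

# Sparsity-based intra weights: inverse densities, normalized to sum to one.
densities = {(0, 1): 2.0, (2, 1): 2.25, (4, 1): 2.25}
inv = {k: 1 / v for k, v in densities.items()}
total = sum(inv.values())
print({k: w / total for k, w in inv.items()})
# {(0, 1): 0.36, (2, 1): 0.32, (4, 1): 0.32}, i.e. 9/25, 8/25, 8/25.
# With distribution_ratio=0.5 these halve to 9/50, 8/50, 8/50, matching
# the assertions in test_fit_binary_intra_inter.
```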
diff --git a/tests/clover/over_sampling/__init__.py b/tests/clover/over_sampling/__init__.py
new file mode 100644
index 0000000..238038c
--- /dev/null
+++ b/tests/clover/over_sampling/__init__.py
@@ -0,0 +1 @@
+"""Tests for clustering-based over-samplers."""
diff --git a/tests/clover/over_sampling/test_cluster.py b/tests/clover/over_sampling/test_cluster.py
new file mode 100644
index 0000000..d5d3477
--- /dev/null
+++ b/tests/clover/over_sampling/test_cluster.py
@@ -0,0 +1,327 @@
+"""Test the _cluster module."""
+
+from collections import Counter, OrderedDict
+
+import numpy as np
+import pytest
+from imblearn.over_sampling import SMOTE, SVMSMOTE, BorderlineSMOTE, RandomOverSampler
+from sklearn.base import clone
+from sklearn.cluster import KMeans
+from sklearn.datasets import make_classification
+from sklearn.exceptions import FitFailedWarning
+from sklearn.neighbors import NearestNeighbors
+from sklearn.utils import check_random_state
+
+from imblearn_extra.clover.distribution import DensityDistributor
+from imblearn_extra.clover.over_sampling import (
+ ClusterOverSampler,
+ clone_modify,
+ extract_inter_data,
+ extract_intra_data,
+ generate_in_cluster,
+ modify_nn,
+)
+
+RANDOM_STATE = 1
+X, y = make_classification(
+ random_state=RANDOM_STATE,
+ n_classes=3,
+ n_samples=5000,
+ n_features=10,
+ n_clusters_per_class=2,
+ weights=[0.2, 0.5, 0.3],
+ n_informative=5,
+)
+CLUSTERER = KMeans(n_clusters=5, n_init=1, random_state=RANDOM_STATE)
+OVERSAMPLERS = [
+ RandomOverSampler(random_state=RANDOM_STATE),
+ SMOTE(random_state=RANDOM_STATE),
+ BorderlineSMOTE(random_state=RANDOM_STATE),
+ SVMSMOTE(random_state=RANDOM_STATE),
+]
+CLUSTER_OVERSAMPLERS = [
+ ClusterOverSampler(RandomOverSampler(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE),
+ ClusterOverSampler(SMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE),
+ ClusterOverSampler(BorderlineSMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE),
+ ClusterOverSampler(SVMSMOTE(random_state=RANDOM_STATE), clusterer=CLUSTERER, random_state=RANDOM_STATE),
+]
+
+
+def test_modify_nn_object():
+ """Test modification of nearest neighbors.
+
+ Object case.
+ """
+ n_neighbors = 2
+ assert modify_nn(NearestNeighbors(n_neighbors=5), 3).n_neighbors == n_neighbors
+ assert modify_nn(NearestNeighbors(n_neighbors=3), 3).n_neighbors == n_neighbors
+ assert modify_nn(NearestNeighbors(n_neighbors=2), 5).n_neighbors == n_neighbors
+
+
+def test_modify_nn_int():
+ """Test modification of nearest neighbors.
+
+ Integer case.
+ """
+ n_neighbors = 2
+ assert modify_nn(5, 3) == n_neighbors
+ assert modify_nn(3, 3) == n_neighbors
+ assert modify_nn(2, 5) == n_neighbors
+
+
+def test_clone_modify_ros():
+ """Test cloning and modification of oversampler.
+
+ Random oversampler case.
+ """
+ cloned_oversampler = clone_modify(OVERSAMPLERS[0], None, None)
+ assert isinstance(cloned_oversampler, RandomOverSampler)
+
+
+@pytest.mark.parametrize(
+ 'oversampler',
+ [ovs for ovs in OVERSAMPLERS if not isinstance(ovs, RandomOverSampler)],
+)
+def test_clone_modify_single_min_sample(oversampler):
+ """Test cloning and modification of oversampler.
+
+ One minority class sample case.
+ """
+ class_label = 1
+ y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2])
+ cloned_oversampler = clone_modify(oversampler, class_label, y_in_cluster)
+ assert isinstance(cloned_oversampler, RandomOverSampler)
+
+
+@pytest.mark.parametrize(
+ 'oversampler',
+ [ovs for ovs in OVERSAMPLERS if not isinstance(ovs, RandomOverSampler)],
+)
+def test_clone_modify_neighbors(oversampler):
+ """Test cloning and modification of oversampler.
+
+ Neighbors based oversamplers case.
+ """
+ class_label = 2
+ y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2])
+ n_minority_samples = Counter(y_in_cluster)[class_label]
+ cloned_oversampler = clone_modify(oversampler, class_label, y_in_cluster)
+ assert isinstance(cloned_oversampler, oversampler.__class__)
+ if hasattr(cloned_oversampler, 'k_neighbors'):
+ assert cloned_oversampler.k_neighbors == n_minority_samples - 1
+ if hasattr(cloned_oversampler, 'm_neighbors'):
+ assert cloned_oversampler.m_neighbors in (y_in_cluster.size - 1, 'deprecated')
+ if hasattr(cloned_oversampler, 'n_neighbors'):
+ assert cloned_oversampler.n_neighbors in (n_minority_samples - 1, 'deprecated')
+
+
+def test_extract_intra_data():
+ """Test extraction of in the clusters data.
+
+ Multiclass case.
+ """
+ X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]).reshape(-1, 1)
+ y = np.array([0, 0, 0, 0, 1, 2, 2, 2, 0])
+ cluster_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
+ intra_distribution = {(1, 1): 1.0, (1, 2): 0.8, (2, 2): 0.2}
+ sampling_strategy = OrderedDict({1: 4, 2: 2})
+ clusters_data = extract_intra_data(X, y, cluster_labels, intra_distribution, sampling_strategy)
+ cluster_sampling_strategies, Xs, ys = zip(*clusters_data, strict=True)
+ assert cluster_sampling_strategies == ({1: 4}, {2: 2})
+ assert [X.tolist() for X in Xs] == [[[4.0], [5.0]], [[4.0], [6.0]]]
+ assert [y.tolist() for y in ys] == [[0, 1], [0, 2]]
+
+
+def test_extract_inter_data():
+ """Test extraction of between clusters data.
+
+ Multiclass case.
+ """
+ X = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]).reshape(-1, 1)
+ y = np.array([1, 0, 0, 0, 1, 2, 2, 2, 0, 0, 1, 0])
+ cluster_labels = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 2])
+ inter_distribution = {
+ ((0, 1), (1, 1)): 0.5,
+ ((1, 1), (2, 1)): 0.5,
+ ((1, 2), (2, 2)): 1.0,
+ }
+ sampling_strategy = OrderedDict({1: 3, 2: 3})
+ random_state = check_random_state(RANDOM_STATE)
+ clusters_data = extract_inter_data(X, y, cluster_labels, inter_distribution, sampling_strategy, random_state)
+ cluster_sampling_strategies, Xs, ys = zip(*clusters_data, strict=True)
+ assert cluster_sampling_strategies == (
+ {1: 1},
+ {1: 1},
+ {1: 1},
+ {1: 1},
+ {2: 1},
+ {2: 1},
+ {2: 1},
+ )
+ assert [X.tolist() for X in Xs] == 2 * [[[1.0], [5.0], [2.0], [3.0], [4.0]]] + 2 * [
+ [[5.0], [11.0], [4.0], [9.0], [10.0], [12.0]],
+ ] + 2 * [[[6.0], [8.0], [4.0], [9.0], [10.0], [12.0]]] + [[[6.0], [7.0], [4.0], [9.0], [10.0], [12.0]]]
+ assert [y.tolist() for y in ys] == 2 * [[1, 1, 0, 0, 0]] + 2 * [[1, 1, 0, 0, 0, 0]] + 3 * [[2, 2, 0, 0, 0, 0]]
+
+
+@pytest.mark.parametrize('oversampler', OVERSAMPLERS)
+def test_generate_in_cluster(oversampler):
+ """Test generation in the clusters samples.
+
+ Multiclass case.
+ """
+ oversampler = clone(oversampler)
+
+ X_in_cluster = np.array([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).reshape(-1, 1)
+ y_in_cluster = np.array([0, 0, 0, 0, 1, 2, 2, 2])
+
+ # First class
+ cluster_sampling_strategy = {1: 5}
+ (class_label,) = cluster_sampling_strategy
+ X_new, y_new = generate_in_cluster(oversampler, None, cluster_sampling_strategy, X_in_cluster, y_in_cluster)
+ assert len(X_new) == len(y_new) <= cluster_sampling_strategy[1]
+ np.testing.assert_array_equal(np.unique(X_new), np.array([5.0]))
+ assert Counter(y_new)[class_label] == cluster_sampling_strategy[1]
+
+ # Second class
+ cluster_sampling_strategy = {2: 3}
+ (class_label,) = cluster_sampling_strategy
+ X_new, y_new = generate_in_cluster(oversampler, None, cluster_sampling_strategy, X_in_cluster, y_in_cluster)
+ assert len(X_new) == len(y_new) <= cluster_sampling_strategy[2]
+ assert Counter(y_new)[class_label] <= cluster_sampling_strategy[2]
+
+
+@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS)
+def test_fit(oversampler):
+ """Test fit method.
+
+ Multiclass case.
+ """
+ oversampler = clone(oversampler).fit(X, y)
+ y_count = Counter(y)
+ assert hasattr(oversampler, 'sampling_strategy_')
+ assert hasattr(oversampler, 'oversampler_')
+ assert hasattr(oversampler, 'clusterer_')
+ assert hasattr(oversampler, 'distributor_')
+ assert hasattr(oversampler, 'random_state_')
+ assert oversampler.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]})
+
+
+@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS)
+def test_fit_resample(oversampler):
+ """Test fit and resample method.
+
+ Multiclass case.
+ """
+ oversampler = clone(oversampler)
+ oversampler.fit_resample(X, y)
+ assert hasattr(oversampler, 'sampling_strategy_')
+ assert hasattr(oversampler, 'oversampler_')
+ assert hasattr(oversampler, 'clusterer_')
+ assert hasattr(oversampler, 'distributor_')
+ assert hasattr(oversampler, 'random_state_')
+ assert hasattr(oversampler.distributor_, 'intra_distribution_')
+ assert hasattr(oversampler.distributor_, 'inter_distribution_')
+
+
+@pytest.mark.parametrize(
+ ("X", "y", "oversampler"),
+ [
+ (
+ np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0), (4.0, 4.0)]),
+ np.array([0, 0, 1, 1, 1]),
+ ClusterOverSampler(oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE)),
+ ),
+ (
+ np.array([(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (3.0, 3.0), (4.0, 4.0)]),
+ np.array([0, 0, 1, 1, 1]),
+ ClusterOverSampler(
+ oversampler=SMOTE(k_neighbors=5, random_state=RANDOM_STATE),
+ clusterer=CLUSTERER.set_params(n_clusters=3),
+ random_state=RANDOM_STATE,
+ ),
+ ),
+ ],
+)
+def test_fit_resample_intra_corner_cases(X, y, oversampler):
+ """Test fit and resample method.
+
+ Corner cases.
+ """
+ X_res, y_res = oversampler.fit_resample(X, y)
+ y_count = Counter(y_res)
+ assert y_count[0] == y_count[1]
+ assert X.item(0, 0) <= X_res.item(-1, 0) <= X.item(1, 0)
+ assert X.item(0, 1) <= X_res.item(-1, 1) <= X.item(1, 1)
+
+
+@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS)
+def test_raise_error_fit_resample(oversampler):
+ """Test raise of error.
+
+ No samples are generated case.
+ """
+ oversampler = clone(oversampler)
+ oversampler.set_params(
+ clusterer=CLUSTERER.set_params(n_clusters=2),
+ distributor=DensityDistributor(filtering_threshold=0.1),
+ )
+ with pytest.raises(
+ ValueError,
+ match='No samples were generated. Try to modify the parameters of the clusterer or distributor.',
+ ):
+ oversampler.fit_resample(X, y)
+
+
+@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS)
+def test_display_warning_fit_resample(oversampler):
+ """Test display warning.
+
+ No samples are generated case.
+ """
+ oversampler = clone(oversampler)
+ oversampler.set_params(
+ clusterer=CLUSTERER.set_params(n_clusters=2),
+ distributor=DensityDistributor(filtering_threshold=0.1),
+ raise_error=False,
+ )
+ with pytest.warns(
+ FitFailedWarning,
+ match='No samples were generated. Try to modify the parameters of the clusterer or distributor.',
+ ):
+ oversampler.fit_resample(X, y)
+
+
+@pytest.mark.parametrize('oversampler', CLUSTER_OVERSAMPLERS)
+def test_two_majority_classes(oversampler):
+ """Test fit and resample method.
+
+ Two majority classes case.
+ """
+ oversampler = clone(oversampler)
+
+ label_mapper = {
+ 0: 13,
+ 1: 1,
+ 2: 5,
+ 3: 7,
+ 4: 3,
+ 5: 10,
+ 6: 6,
+ 7: 8,
+ 8: 9,
+ 9: 11,
+ 10: 4,
+ }
+
+ X, y = make_classification(
+ n_samples=19 * len(label_mapper),
+ n_classes=len(label_mapper),
+ n_informative=30,
+ n_features=145,
+ random_state=42,
+ )
+
+ y = np.array([label_mapper[i] for i in y])
+
+ oversampler.fit_resample(X, y)
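
From the `modify_nn` expectations near the top of this file, the rule appears to be — an inference from the tests, not the source — that the number of neighbors is capped at one less than the number of available samples:

```python
def modify_nn_rule(n_neighbors, n_samples):
    """Hypothetical reimplementation inferred from the test expectations."""
    return min(n_neighbors, n_samples - 1)


assert modify_nn_rule(5, 3) == 2  # capped: only 2 other samples exist
assert modify_nn_rule(3, 3) == 2
assert modify_nn_rule(2, 5) == 2  # already below the cap
```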
diff --git a/tests/clover/over_sampling/test_gsomo.py b/tests/clover/over_sampling/test_gsomo.py
new file mode 100644
index 0000000..db50a20
--- /dev/null
+++ b/tests/clover/over_sampling/test_gsomo.py
@@ -0,0 +1,181 @@
+"""Test the _gsomo module."""
+
+from collections import Counter, OrderedDict
+from math import sqrt
+
+import pytest
+from sklearn.base import clone
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.clusterer import SOM
+from imblearn_extra.clover.distribution import DensityDistributor
+from imblearn_extra.clover.over_sampling import GeometricSOMO
+from imblearn_extra.gsmote import GeometricSMOTE
+
+RANDOM_STATE = 11
+X, y = make_classification(
+ random_state=RANDOM_STATE,
+ n_classes=3,
+ n_samples=5000,
+ n_features=10,
+ n_clusters_per_class=2,
+ weights=[0.3, 0.45, 0.25],
+ n_informative=5,
+)
+GSOMO_OVERSAMPLER = GeometricSOMO(random_state=RANDOM_STATE)
+
+
+@pytest.mark.parametrize(
+ ('k_neighbors', 'imbalance_ratio_threshold', 'distances_exponent', 'distribution_ratio'),
+ [(3, 2.0, 'auto', 0.3), (5, 1.5, 8, 0.5), (8, 'auto', 10, 0.8)],
+)
+def test_fit(k_neighbors, imbalance_ratio_threshold, distances_exponent, distribution_ratio):
+ """Test fit method.
+
+ Multiple cases.
+ """
+ # Fit oversampler
+ params = {
+ 'k_neighbors': k_neighbors,
+ 'distribution_ratio': distribution_ratio,
+ 'distances_exponent': distances_exponent,
+ 'imbalance_ratio_threshold': imbalance_ratio_threshold,
+ }
+ gsomo = clone(GSOMO_OVERSAMPLER).set_params(**params).fit(X, y)
+ y_count = Counter(y)
+
+ # Assert random state
+ assert hasattr(gsomo, 'random_state_')
+
+ # Assert oversampler
+ assert isinstance(gsomo.oversampler_, GeometricSMOTE)
+ assert gsomo.oversampler_.k_neighbors == gsomo.k_neighbors == k_neighbors
+ assert gsomo.oversampler_.truncation_factor == gsomo.truncation_factor
+ assert gsomo.oversampler_.deformation_factor == gsomo.deformation_factor
+ assert gsomo.oversampler_.selection_strategy == gsomo.selection_strategy
+
+ # Assert clusterer
+ assert isinstance(gsomo.clusterer_, SOM)
+
+ # Assert distributor
+ assert isinstance(gsomo.distributor_, DensityDistributor)
+ assert gsomo.distributor_.filtering_threshold == gsomo.imbalance_ratio_threshold == imbalance_ratio_threshold
+ assert gsomo.distributor_.distances_exponent == gsomo.distances_exponent == distances_exponent
+ assert gsomo.distributor_.distribution_ratio == gsomo.distribution_ratio == distribution_ratio
+
+ # Assert sampling strategy
+ assert gsomo.oversampler_.sampling_strategy == gsomo.sampling_strategy
+ assert gsomo.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]})
+
+
+def test_fit_default():
+ """Test fit method.
+
+ Default case.
+ """
+ # Fit oversampler
+ gsomo = clone(GSOMO_OVERSAMPLER).fit(X, y)
+
+ # Create SOM instance with default parameters
+ som = SOM()
+
+ # Assert clusterer
+ assert isinstance(gsomo.clusterer_, SOM)
+ assert gsomo.clusterer_.n_rows == som.n_rows
+ assert gsomo.clusterer_.n_columns == som.n_columns
+
+
+@pytest.mark.parametrize('n_clusters', [5, 6, 12])
+def test_fit_number_of_clusters(n_clusters):
+ """Test clusterer of fit method.
+
+ Number of clusters case.
+ """
+ # Fit oversampler
+ gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=n_clusters).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(gsomo.clusterer_, SOM)
+ assert gsomo.clusterer_.n_rows == round(sqrt(gsomo.som_estimator))
+ assert gsomo.clusterer_.n_columns == round(sqrt(gsomo.som_estimator))
+
+
+@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0])
+def test_fit_proportion_of_samples(proportion):
+ """Test clusterer of fit method.
+
+ Proportion of samples case.
+ """
+ # Fit oversampler
+ gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=proportion).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(gsomo.clusterer_, SOM)
+ assert gsomo.clusterer_.n_rows == round(sqrt((X.shape[0] - 1) * gsomo.som_estimator + 1))
+ assert gsomo.clusterer_.n_columns == round(sqrt((X.shape[0] - 1) * gsomo.som_estimator + 1))
+
+
+def test_som_estimator():
+ """Test clusterer of fit method.
+
+ Clusterer case.
+ """
+ # Fit oversampler
+ gsomo = clone(GSOMO_OVERSAMPLER).set_params(som_estimator=SOM()).fit(X, y)
+
+ # Define som estimator
+ som = SOM()
+
+ # Assert clusterer
+ assert isinstance(gsomo.clusterer_, type(som))
+ assert gsomo.clusterer_.n_rows == som.n_rows
+ assert gsomo.clusterer_.n_columns == som.n_columns
+
+
+@pytest.mark.parametrize('som_estimator', [-3, 0])
+def test_raise_value_error_fit_integer(som_estimator):
+ """Test fit method.
+
+ Integer values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be >= 1.'):
+ clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('som_estimator', [-1.5, 2.0])
+def test_raise_value_error_fit_float(som_estimator):
+ """Test fit method.
+
+ Float values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be'):
+ clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('som_estimator', [AgglomerativeClustering, [3, 5]])
+def test_raise_type_error_fit(som_estimator):
+ """Test fit method.
+
+ Not SOM clusterer error case.
+ """
+ with pytest.raises(TypeError, match='Parameter `som_estimator` should be'):
+ clone(GSOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+def test_fit_resample():
+ """Test fit and resample method.
+
+ Default case.
+ """
+ # Fit oversampler
+ gsomo = clone(GSOMO_OVERSAMPLER)
+ _, y_res = gsomo.fit_resample(X, y)
+
+ # Assert clusterer is fitted
+ assert hasattr(gsomo.clusterer_, 'labels_')
+ assert hasattr(gsomo.clusterer_, 'neighbors_')
+
+ # Assert distributor is fitted
+ assert hasattr(gsomo.distributor_, 'intra_distribution_')
+ assert hasattr(gsomo.distributor_, 'inter_distribution_')
diff --git a/tests/clover/over_sampling/test_kmeans_smote.py b/tests/clover/over_sampling/test_kmeans_smote.py
new file mode 100644
index 0000000..ad2398b
--- /dev/null
+++ b/tests/clover/over_sampling/test_kmeans_smote.py
@@ -0,0 +1,169 @@
+"""Test the _kmeans_smote module."""
+
+from collections import Counter, OrderedDict
+
+import pytest
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.cluster import AgglomerativeClustering, KMeans, MiniBatchKMeans
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.distribution import DensityDistributor
+from imblearn_extra.clover.over_sampling import KMeansSMOTE
+
+RANDOM_STATE = 1
+X, y = make_classification(
+ random_state=RANDOM_STATE,
+ n_classes=3,
+ n_samples=5000,
+ n_features=10,
+ n_clusters_per_class=2,
+ weights=[0.25, 0.45, 0.3],
+ n_informative=5,
+)
+KMEANS_SMOTE_OVERSAMPLER = KMeansSMOTE(random_state=RANDOM_STATE)
+
+
+@pytest.mark.parametrize(
+ ('k_neighbors', 'imbalance_ratio_threshold', 'distances_exponent'),
+ [(3, 2.0, 'auto'), (5, 1.5, 8), (8, 'auto', 10)],
+)
+def test_fit(k_neighbors, imbalance_ratio_threshold, distances_exponent):
+ """Test fit method.
+
+ Multiple cases.
+ """
+ # Fit oversampler
+ params = {
+ 'k_neighbors': k_neighbors,
+ 'imbalance_ratio_threshold': imbalance_ratio_threshold,
+ 'distances_exponent': distances_exponent,
+ }
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(**params).fit(X, y)
+ y_count = Counter(y)
+
+ # Assert random state
+ assert hasattr(kmeans_smote, 'random_state_')
+
+ # Assert oversampler
+ assert isinstance(kmeans_smote.oversampler_, SMOTE)
+ assert kmeans_smote.oversampler_.k_neighbors == kmeans_smote.k_neighbors == k_neighbors
+
+ # Assert clusterer
+ assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans)
+
+ # Assert distributor
+ assert isinstance(kmeans_smote.distributor_, DensityDistributor)
+ assert (
+ kmeans_smote.distributor_.filtering_threshold
+ == kmeans_smote.imbalance_ratio_threshold
+ == imbalance_ratio_threshold
+ )
+ assert kmeans_smote.distributor_.distances_exponent == kmeans_smote.distances_exponent == distances_exponent
+
+ # Assert sampling strategy
+ assert kmeans_smote.oversampler_.sampling_strategy == kmeans_smote.sampling_strategy
+ assert kmeans_smote.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]})
+
+
+def test_fit_default():
+ """Test fit method.
+
+ Default case.
+ """
+ # Fit oversampler
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans)
+ assert kmeans_smote.clusterer_.n_clusters == MiniBatchKMeans().n_clusters
+
+
+@pytest.mark.parametrize('n_clusters', [5, 6, 12])
+def test_fit_number_of_clusters(n_clusters):
+ """Test fit method.
+
+ Number of clusters case.
+ """
+ # Fit oversampler
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=n_clusters).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans)
+ assert kmeans_smote.clusterer_.n_clusters == n_clusters
+
+
+@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0])
+def test_fit_proportion_of_samples(proportion):
+ """Test fit method.
+
+ Proportion of samples case.
+ """
+ # Fit oversampler
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=proportion).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(kmeans_smote.clusterer_, MiniBatchKMeans)
+ assert kmeans_smote.clusterer_.n_clusters == round((len(X) - 1) * proportion + 1)
+
+
+@pytest.mark.parametrize('kmeans_estimator', [KMeans(), MiniBatchKMeans()])
+def test_fit_kmeans_estimator(kmeans_estimator):
+ """Test fit method.
+
+ KMeans estimator case.
+ """
+ # Fit oversampler
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(kmeans_smote.clusterer_, type(kmeans_estimator))
+ assert kmeans_smote.clusterer_.n_clusters == kmeans_estimator.n_clusters
+
+
+@pytest.mark.parametrize('kmeans_estimator', [-3, 0])
+def test_raise_value_error_fit_integer(kmeans_estimator):
+ """Test fit method.
+
+ Integer values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'kmeans_estimator == {kmeans_estimator}, must be >= 1.'):
+ clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('kmeans_estimator', [-1.5, 2.0])
+def test_raise_value_error_fit_float(kmeans_estimator):
+ """Test fit method.
+
+ Float values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'kmeans_estimator == {kmeans_estimator}, must be'):
+ clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('kmeans_estimator', [AgglomerativeClustering(), [3, 5]])
+def test_raise_type_error_fit(kmeans_estimator):
+ """Test fit method.
+
+ Not KMeans clusterer error case.
+ """
+ with pytest.raises(TypeError, match='Parameter `kmeans_estimator` should be'):
+ clone(KMEANS_SMOTE_OVERSAMPLER).set_params(kmeans_estimator=kmeans_estimator).fit(X, y)
+
+
+def test_fit_resample():
+ """Test fit and resample method.
+
+ Default case.
+ """
+ # Fit oversampler
+ kmeans_smote = clone(KMEANS_SMOTE_OVERSAMPLER)
+ _, y_res = kmeans_smote.fit_resample(X, y)
+
+ # Assert clusterer is fitted
+ assert hasattr(kmeans_smote.clusterer_, 'labels_')
+ assert not hasattr(kmeans_smote.clusterer_, 'neighbors_')
+
+ # Assert distributor is fitted
+ assert hasattr(kmeans_smote.distributor_, 'intra_distribution_')
+ assert hasattr(kmeans_smote.distributor_, 'inter_distribution_')
diff --git a/tests/clover/over_sampling/test_somo.py b/tests/clover/over_sampling/test_somo.py
new file mode 100644
index 0000000..adedf43
--- /dev/null
+++ b/tests/clover/over_sampling/test_somo.py
@@ -0,0 +1,175 @@
+"""Test the _somo module."""
+
+from collections import Counter, OrderedDict
+from math import sqrt
+
+import pytest
+from imblearn.over_sampling import SMOTE
+from sklearn.base import clone
+from sklearn.cluster import AgglomerativeClustering
+from sklearn.datasets import make_classification
+
+from imblearn_extra.clover.clusterer import SOM
+from imblearn_extra.clover.distribution import DensityDistributor
+from imblearn_extra.clover.over_sampling import SOMO
+
+RANDOM_STATE = 2
+X, y = make_classification(
+ random_state=RANDOM_STATE,
+ n_classes=3,
+ n_samples=5000,
+ n_features=10,
+ n_clusters_per_class=2,
+ weights=[0.25, 0.45, 0.3],
+ n_informative=5,
+)
+SOMO_OVERSAMPLER = SOMO(random_state=RANDOM_STATE)
+
+
+@pytest.mark.parametrize(
+ ('k_neighbors', 'distribution_ratio'),
+ [(3, 0.2), (5, 0.5), (8, 0.6)],
+)
+def test_fit(k_neighbors, distribution_ratio):
+ """Test fit method.
+
+ Multiple cases.
+ """
+ # Fit oversampler
+ params = {'k_neighbors': k_neighbors, 'distribution_ratio': distribution_ratio}
+ somo = clone(SOMO_OVERSAMPLER).set_params(**params).fit(X, y)
+ y_count = Counter(y)
+
+ # Assert random state
+ assert hasattr(somo, 'random_state_')
+
+ # Assert oversampler
+ assert isinstance(somo.oversampler_, SMOTE)
+ assert somo.oversampler_.k_neighbors == somo.k_neighbors == k_neighbors
+
+ # Assert clusterer
+ assert isinstance(somo.clusterer_, SOM)
+
+ # Assert distributor
+ filtering_threshold = 1.0
+ distances_exponent = 2
+ assert isinstance(somo.distributor_, DensityDistributor)
+ assert somo.distributor_.filtering_threshold == filtering_threshold
+ assert somo.distributor_.distances_exponent == distances_exponent
+ assert somo.distributor_.distribution_ratio == somo.distribution_ratio == distribution_ratio
+
+ # Assert sampling strategy
+ assert somo.oversampler_.sampling_strategy == somo.sampling_strategy
+ assert somo.sampling_strategy_ == OrderedDict({0: y_count[1] - y_count[0], 2: y_count[1] - y_count[2]})
+
+
+def test_fit_default():
+ """Test fit method.
+
+ Default case.
+ """
+ # Fit oversampler
+ somo = clone(SOMO_OVERSAMPLER).fit(X, y)
+
+ # Create SOM instance with default parameters
+ som = SOM()
+
+ # Assert clusterer
+ assert isinstance(somo.clusterer_, SOM)
+ assert somo.clusterer_.n_rows == som.n_rows
+ assert somo.clusterer_.n_columns == som.n_columns
+
+
+@pytest.mark.parametrize('n_clusters', [5, 6, 12])
+def test_fit_number_of_clusters(n_clusters):
+ """Test fit method.
+
+ Number of clusters case.
+ """
+ # Fit oversampler
+ somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=n_clusters).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(somo.clusterer_, SOM)
+ assert somo.clusterer_.n_rows == round(sqrt(somo.som_estimator))
+ assert somo.clusterer_.n_columns == round(sqrt(somo.som_estimator))
+
+
+@pytest.mark.parametrize('proportion', [0.0, 0.5, 1.0])
+def test_fit_proportion_of_samples(proportion):
+ """Test fit method.
+
+ Proportion of samples case.
+ """
+ # Fit oversampler
+ somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=proportion).fit(X, y)
+
+ # Assert clusterer
+ assert isinstance(somo.clusterer_, SOM)
+ assert somo.clusterer_.n_rows == round(sqrt((X.shape[0] - 1) * somo.som_estimator + 1))
+ assert somo.clusterer_.n_columns == round(sqrt((X.shape[0] - 1) * somo.som_estimator + 1))
+
+
+def test_fit_som_estimator():
+ """Test fit method.
+
+ SOM estimator case.
+ """
+ # Fit oversampler
+ somo = clone(SOMO_OVERSAMPLER).set_params(som_estimator=SOM()).fit(X, y)
+
+ # Define som estimator
+ som = SOM()
+
+ # Assert clusterer
+ assert isinstance(somo.clusterer_, type(som))
+ assert somo.clusterer_.n_rows == som.n_rows
+ assert somo.clusterer_.n_columns == som.n_columns
+
+
+@pytest.mark.parametrize('som_estimator', [-3, 0])
+def test_raise_value_error_fit_integer(som_estimator):
+ """Test fit method.
+
+ Integer values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be >= 1.'):
+ clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('som_estimator', [-1.5, 2.0])
+def test_raise_value_error_fit_float(som_estimator):
+ """Test fit method.
+
+ Float values as estimators error case.
+ """
+ with pytest.raises(ValueError, match=f'som_estimator == {som_estimator}, must be'):
+ clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+@pytest.mark.parametrize('som_estimator', [AgglomerativeClustering(), [3, 5]])
+def test_raise_type_error_fit(som_estimator):
+ """Test fit method.
+
+ Not SOM clusterer error case.
+ """
+ with pytest.raises(TypeError, match='Parameter `som_estimator` should be'):
+ clone(SOMO_OVERSAMPLER).set_params(som_estimator=som_estimator).fit(X, y)
+
+
+def test_fit_resample():
+ """Test fit and resample method.
+
+ Default case.
+ """
+ # Fit oversampler
+ somo = clone(SOMO_OVERSAMPLER)
+ _, y_res = somo.fit_resample(X, y)
+
+ # Assert clusterer is fitted
+ assert hasattr(somo.clusterer_, 'labels_')
+ assert hasattr(somo.clusterer_, 'neighbors_')
+
+ # Assert distributor is fitted
+ assert hasattr(somo.distributor_, 'intra_distribution_')
+ assert hasattr(somo.distributor_, 'inter_distribution_')
diff --git a/tests/gsmote/__init__.py b/tests/gsmote/__init__.py
new file mode 100644
index 0000000..15928cd
--- /dev/null
+++ b/tests/gsmote/__init__.py
@@ -0,0 +1,7 @@
+"""Tests suite for `gsmote`."""
+
+from pathlib import Path
+
+TESTS_DIR = Path(__file__).parent
+TMP_DIR = TESTS_DIR / 'tmp'
+FIXTURES_DIR = TESTS_DIR / 'fixtures'
diff --git a/tests/test_geometric_smote.py b/tests/gsmote/test_geometric_smote.py
similarity index 99%
rename from tests/test_geometric_smote.py
rename to tests/gsmote/test_geometric_smote.py
index 16ccdc3..5ae28ad 100644
--- a/tests/test_geometric_smote.py
+++ b/tests/gsmote/test_geometric_smote.py
@@ -4,12 +4,13 @@
import numpy as np
import pytest
-from gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample
from numpy.linalg import norm
from scipy import sparse
from sklearn.datasets import make_classification
from sklearn.utils import check_random_state
+from imblearn_extra.gsmote import SELECTION_STRATEGIES, GeometricSMOTE, make_geometric_sample
+
RND_SEED = 0
RANDOM_STATE = check_random_state(RND_SEED)
CENTERS = [
@@ -123,7 +124,7 @@ def test_make_geometric_sample_half_hypersphere(surface_point, deformation_facto
('center', 'surface_point', 'truncation_factor'),
[
(center, surface_point, truncation_factor)
- for center, surface_point in zip(CENTERS, SURFACE_POINTS)
+ for center, surface_point in zip(CENTERS, SURFACE_POINTS, strict=False)
for truncation_factor in TRUNCATION_FACTORS
],
)