-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* FEAT Extract core points * Fix versioning
- Loading branch information
Showing
4 changed files
with
82 additions
and
69 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1 @@ | ||
from .copac import COPAC, copac | ||
|
||
__all__ = [ | ||
'COPAC', | ||
'copac', | ||
'__version__', | ||
] | ||
__version__ = "0.3.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
COPAC: Correlation Partition Clustering | ||
""" | ||
|
||
# Author: Roman Feldbauer <[email protected]> | ||
# Author: Roman Feldbauer <[email protected]> | ||
# Elisabeth Hartel | ||
# Jiri Mauritz <jirmauritz at gmail dot com> | ||
# Thomas Turic <[email protected]> | ||
|
@@ -16,10 +16,7 @@ | |
from scipy.spatial.distance import squareform | ||
|
||
from sklearn.base import BaseEstimator, ClusterMixin | ||
try: # for sklearn < 0.23 | ||
from sklearn.cluster.dbscan_ import dbscan | ||
except: # for sklearn >= 0.23 | ||
from sklearn.cluster._dbscan import dbscan | ||
from sklearn.cluster import DBSCAN | ||
from sklearn.neighbors import NearestNeighbors | ||
from sklearn.utils import check_array | ||
|
||
|
@@ -41,9 +38,12 @@ def _cdist(P, Q, Mhat_P): | |
return (PQ_diff @ Mhat_P * PQ_diff).sum(axis=1) | ||
|
||
|
||
def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | ||
metric_params=None, algorithm='auto', leaf_size=30, p=None, | ||
n_jobs=1, sample_weight=None): | ||
def copac(X: np.ndarray, *, | ||
k: int = 10, mu: int = 5, eps: float = 0.5, alpha: float = 0.85, | ||
metric: str = 'euclidean', metric_params=None, | ||
algorithm: str = 'auto', leaf_size: int = 30, p: float = None, | ||
n_jobs: int = 1, sample_weight: np.ndarray=None, | ||
return_core_pts: bool = False): | ||
"""Perform COPAC clustering from vector array. | ||
Parameters | ||
|
@@ -84,12 +84,16 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | |
n_jobs : int, optional, default=1 | ||
Number of parallel processes. Use all cores with n_jobs=-1. | ||
sample_weight : None | ||
Currently ignored | ||
Sample weights | ||
return_core_pts : bool | ||
Return cluster labels and core point indices for each correlation dimension. | ||
Returns | ||
------- | ||
labels : array [n_samples] | ||
Cluster labels for each point. Noisy samples are given the label -1. | ||
core_pts_ind : dict[int, array] | ||
Indices of core points for each correlation dimension (only if ``return_core_pts=True``). | ||
References | ||
---------- | ||
|
@@ -99,9 +103,9 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | |
Conference on Data Mining, April 26-28, 2007, Minneapolis, | ||
Minnesota, USA (2007), pp. 413–418. | ||
""" | ||
X = check_array(X) | ||
n, d = X.shape | ||
y = -np.ones(n, dtype=np.int) | ||
data_dtype = X.dtype | ||
y = -np.ones(n, dtype=int) | ||
if n_jobs == -1: | ||
n_jobs = cpu_count() | ||
|
||
|
@@ -141,10 +145,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | |
# Loop over partitions according to local corr. dim. | ||
max_label = 0 | ||
used_y = np.zeros_like(y, dtype=int) | ||
for D in Ds: | ||
core_pts = {} | ||
for dim, D in enumerate(Ds, start=1): | ||
n_D = D.shape[0] | ||
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=np.float) | ||
cdist_Q = -np.ones((n_D, n_D), dtype=np.float) | ||
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=data_dtype) | ||
cdist_Q = -np.ones((n_D, n_D), dtype=data_dtype) | ||
start = 0 | ||
# Calculate triu part of distance matrix | ||
for i in range(0, n_D - 1): | ||
|
@@ -168,9 +173,10 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | |
|
||
# Perform DBSCAN with full distance matrix | ||
cdist = squareform(cdist) | ||
clust = dbscan(X=cdist, eps=eps, min_samples=mu, | ||
metric='precomputed', n_jobs=n_jobs) | ||
_, labels = clust | ||
dbscan = DBSCAN(eps=eps, min_samples=mu, metric="precomputed", n_jobs=n_jobs) | ||
labels = dbscan.fit_predict(X=cdist, sample_weight=sample_weight) | ||
core_pts[dim] = dbscan.core_sample_indices_ | ||
|
||
# Each DBSCAN run is unaware of previous ones, | ||
# so we need to keep track of previous copac IDs | ||
y_D = labels + max_label | ||
|
@@ -180,7 +186,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean', | |
y[D] = y_D | ||
used_y[D] += 1 | ||
assert np.all(used_y == 1), "Not all samples were handled exactly once!" | ||
return y | ||
|
||
if return_core_pts: | ||
return y, core_pts | ||
else: | ||
return y | ||
|
||
|
||
class COPAC(BaseEstimator, ClusterMixin): | ||
|
@@ -254,7 +264,7 @@ def __init__(self, k=10, mu=5, eps=0.5, alpha=0.85, | |
self.p = p | ||
self.n_jobs = n_jobs | ||
|
||
def fit(self, X, y=None, sample_weight=None): | ||
def fit(self, X, y=None, sample_weight=None, return_core_pts=False): | ||
"""Perform COPAC clustering from features. | ||
Parameters | ||
|
@@ -268,14 +278,25 @@ def fit(self, X, y=None, sample_weight=None): | |
Note that weights are absolute, and default to 1. | ||
CURRENTLY IGNORED. | ||
y : Ignored | ||
return_core_pts : bool | ||
Return cluster labels and core points per correlation dimension | ||
""" | ||
X = check_array(X) | ||
clust = copac(X, sample_weight=sample_weight, | ||
**self.get_params()) | ||
X: np.ndarray = check_array(X) | ||
result = copac( | ||
X=X, | ||
sample_weight=sample_weight, | ||
return_core_pts=return_core_pts, | ||
**self.get_params(), | ||
) | ||
if return_core_pts: | ||
clust, core_pts = result | ||
self.core_point_indices_ = core_pts | ||
else: | ||
clust = result | ||
self.labels_ = clust | ||
return self | ||
|
||
def fit_predict(self, X, y=None, sample_weight=None): | ||
def fit_predict(self, X, y=None, sample_weight=None, return_core_pts=False): | ||
"""Performs clustering on X and returns copac labels. | ||
Parameters | ||
|
@@ -289,11 +310,16 @@ def fit_predict(self, X, y=None, sample_weight=None): | |
Note that weights are absolute, and default to 1. | ||
CURRENTLY IGNORED. | ||
y : Ignored | ||
return_core_pts : bool | ||
Return cluster labels and core points per correlation dimension | ||
Returns | ||
------- | ||
y : ndarray, shape (n_samples,) | ||
copac labels | ||
""" | ||
self.fit(X, sample_weight=sample_weight) | ||
return self.labels_ | ||
self.fit(X, sample_weight=sample_weight, return_core_pts=return_core_pts) | ||
if return_core_pts: | ||
return self.labels_, self.core_point_indices_ | ||
else: | ||
return self.labels_ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,46 +1,40 @@ | ||
""" | ||
Testing for Clustering methods | ||
""" | ||
import unittest | ||
import pytest | ||
|
||
import numpy as np | ||
|
||
from sklearn.metrics.cluster import v_measure_score | ||
from sklearn.utils.testing import assert_equal, assert_array_equal | ||
from sklearn.datasets.samples_generator import make_blobs | ||
from sklearn.datasets import make_blobs | ||
|
||
from ..copac import COPAC | ||
|
||
|
||
class TestCopac(unittest.TestCase): | ||
|
||
def setUp(self): | ||
""" Set up very simple data set """ | ||
self.n_clusters = 2 | ||
self.centers = np.array([[3, 3], [-3, -3]]) + 10 | ||
self.X, self.y = make_blobs(n_samples=60, n_features=2, | ||
centers=self.centers, cluster_std=0.4, | ||
shuffle=True, random_state=0) | ||
self.v = v_measure_score(self.y, self.y) | ||
|
||
def tearDown(self): | ||
del self.n_clusters, self.centers, self.X | ||
|
||
def test_copac(self): | ||
""" Minimal test that COPAC runs at all. """ | ||
k = 40 | ||
mu = 10 | ||
eps = 2 | ||
alpha = 0.85 | ||
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha) | ||
y_pred = copac.fit_predict(self.X) | ||
v = v_measure_score(self.y, y_pred) | ||
# Must score perfectly on very simple data | ||
assert_equal(self.v, v) | ||
# Check correct labels_ attribute | ||
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha) | ||
copac.fit(self.X) | ||
assert_array_equal(copac.labels_, y_pred) | ||
|
||
if __name__ == "__main__": | ||
unittest.main() | ||
@pytest.mark.parametrize("return_core_pts", [True, False]) | ||
def test_copac(return_core_pts): | ||
""" Minimal test that COPAC runs at all. """ | ||
# Set up very simple data set | ||
n_clusters = 2 | ||
centers = np.array([[3, 3], [-3, -3]]) + 10 | ||
X, y = make_blobs(n_samples=60, n_features=2, | ||
centers=centers, cluster_std=0.4, | ||
shuffle=True, random_state=0) | ||
v_true = v_measure_score(y, y) | ||
|
||
k = 40 | ||
mu = 10 | ||
eps = 2 | ||
alpha = 0.85 | ||
|
||
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha) | ||
y_pred = copac.fit_predict(X, return_core_pts=return_core_pts) | ||
if return_core_pts: | ||
y_pred, core_pts_ind = y_pred | ||
assert isinstance(core_pts_ind, dict) | ||
v_pred = v_measure_score(y, y_pred) | ||
# Must score perfectly on very simple data | ||
np.testing.assert_equal(v_true, v_pred) | ||
# Check correct labels_ attribute | ||
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha) | ||
copac.fit(X) | ||
np.testing.assert_array_equal(copac.labels_, y_pred) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,7 +3,7 @@ universal = true | |
|
||
[metadata] | ||
name = COPAC | ||
version = 0.3.0 | ||
version = attr: copac.__init__.__version__ | ||
author = Roman Feldbauer | ||
author_email = [email protected] | ||
url = https://github.com/VarIr/copac | ||
|