Skip to content

Commit

Permalink
FEAT Extract core points (#17)
Browse files Browse the repository at this point in the history
* FEAT Extract core points

* Fix versioning
  • Loading branch information
VarIr authored Sep 15, 2023
1 parent 7c7adc1 commit 8326af8
Show file tree
Hide file tree
Showing 4 changed files with 82 additions and 69 deletions.
7 changes: 0 additions & 7 deletions copac/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1 @@
from .copac import COPAC, copac

__all__ = [
'COPAC',
'copac',
'__version__',
]
__version__ = "0.3.0"
76 changes: 51 additions & 25 deletions copac/copac.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
COPAC: Correlation Partition Clustering
"""

# Author: Roman Feldbauer <[email protected]>
# Author: Roman Feldbauer <[email protected]>
# Elisabeth Hartel
# Jiri Mauritz <jirmauritz at gmail dot com>
# Thomas Turic <[email protected]>
Expand All @@ -16,10 +16,7 @@
from scipy.spatial.distance import squareform

from sklearn.base import BaseEstimator, ClusterMixin
try: # for sklearn < 0.23
from sklearn.cluster.dbscan_ import dbscan
except: # for sklearn >= 0.23
from sklearn.cluster._dbscan import dbscan
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array

Expand All @@ -41,9 +38,12 @@ def _cdist(P, Q, Mhat_P):
return (PQ_diff @ Mhat_P * PQ_diff).sum(axis=1)


def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
metric_params=None, algorithm='auto', leaf_size=30, p=None,
n_jobs=1, sample_weight=None):
def copac(X: np.ndarray, *,
k: int = 10, mu: int = 5, eps: float = 0.5, alpha: float = 0.85,
metric: str = 'euclidean', metric_params=None,
algorithm: str = 'auto', leaf_size: int = 30, p: float = None,
n_jobs: int = 1, sample_weight: np.ndarray=None,
return_core_pts: bool = False):
"""Perform COPAC clustering from vector array.
Parameters
Expand Down Expand Up @@ -84,12 +84,16 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
n_jobs : int, optional, default=1
Number of parallel processes. Use all cores with n_jobs=-1.
sample_weight : None
Currently ignored
Sample weights
return_core_pts : bool
Return clusters labels and core point indices for each correlation dimension.
Returns
-------
labels : array [n_samples]
Cluster labels for each point. Noisy samples are given the label -1.
core_pts_ind : dict[int, array]
Indices of core points for each correlation dimension (only if ``return_core_pts=True``).
References
----------
Expand All @@ -99,9 +103,9 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
Conference on Data Mining, April 26-28, 2007, Minneapolis,
Minnesota, USA (2007), pp. 413–418.
"""
X = check_array(X)
n, d = X.shape
y = -np.ones(n, dtype=np.int)
data_dtype = X.dtype
y = -np.ones(n, dtype=int)
if n_jobs == -1:
n_jobs = cpu_count()

Expand Down Expand Up @@ -141,10 +145,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
# Loop over partitions according to local corr. dim.
max_label = 0
used_y = np.zeros_like(y, dtype=int)
for D in Ds:
core_pts = {}
for dim, D in enumerate(Ds, start=1):
n_D = D.shape[0]
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=np.float)
cdist_Q = -np.ones((n_D, n_D), dtype=np.float)
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=data_dtype)
cdist_Q = -np.ones((n_D, n_D), dtype=data_dtype)
start = 0
# Calculate triu part of distance matrix
for i in range(0, n_D - 1):
Expand All @@ -168,9 +173,10 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',

# Perform DBSCAN with full distance matrix
cdist = squareform(cdist)
clust = dbscan(X=cdist, eps=eps, min_samples=mu,
metric='precomputed', n_jobs=n_jobs)
_, labels = clust
dbscan = DBSCAN(eps=eps, min_samples=mu, metric="precomputed", n_jobs=n_jobs)
labels = dbscan.fit_predict(X=cdist, sample_weight=sample_weight)
core_pts[dim] = dbscan.core_sample_indices_

# Each DBSCAN run is unaware of previous ones,
# so we need to keep track of previous copac IDs
y_D = labels + max_label
Expand All @@ -180,7 +186,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
y[D] = y_D
used_y[D] += 1
assert np.all(used_y == 1), "Not all samples were handled exactly once!"
return y

if return_core_pts:
return y, core_pts
else:
return y


class COPAC(BaseEstimator, ClusterMixin):
Expand Down Expand Up @@ -254,7 +264,7 @@ def __init__(self, k=10, mu=5, eps=0.5, alpha=0.85,
self.p = p
self.n_jobs = n_jobs

def fit(self, X, y=None, sample_weight=None):
def fit(self, X, y=None, sample_weight=None, return_core_pts=False):
"""Perform COPAC clustering from features.
Parameters
Expand All @@ -268,14 +278,25 @@ def fit(self, X, y=None, sample_weight=None):
Note that weights are absolute, and default to 1.
Forwarded to the DBSCAN run on each correlation-dimension partition.
y : Ignored
return_core_pts : bool
Return cluster labels and core points per correlation dimension
"""
X = check_array(X)
clust = copac(X, sample_weight=sample_weight,
**self.get_params())
X: np.ndarray = check_array(X)
result = copac(
X=X,
sample_weight=sample_weight,
return_core_pts=return_core_pts,
**self.get_params(),
)
if return_core_pts:
clust, core_pts = result
self.core_point_indices_ = core_pts
else:
clust = result
self.labels_ = clust
return self

def fit_predict(self, X, y=None, sample_weight=None):
def fit_predict(self, X, y=None, sample_weight=None, return_core_pts=False):
"""Performs clustering on X and returns copac labels.
Parameters
Expand All @@ -289,11 +310,16 @@ def fit_predict(self, X, y=None, sample_weight=None):
Note that weights are absolute, and default to 1.
Forwarded via ``fit`` to the DBSCAN run on each correlation-dimension partition.
y : Ignored
return_core_pts : bool
Return cluster labels and core points per correlation dimension
Returns
-------
y : ndarray, shape (n_samples,)
copac labels
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_
self.fit(X, sample_weight=sample_weight, return_core_pts=return_core_pts)
if return_core_pts:
return self.labels_, self.core_point_indices_
else:
return self.labels_
66 changes: 30 additions & 36 deletions copac/tests/test_copac.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,40 @@
"""
Testing for Clustering methods
"""
import unittest
import pytest

import numpy as np

from sklearn.metrics.cluster import v_measure_score
from sklearn.utils.testing import assert_equal, assert_array_equal
from sklearn.datasets.samples_generator import make_blobs
from sklearn.datasets import make_blobs

from ..copac import COPAC


class TestCopac(unittest.TestCase):
    """Smoke test for the COPAC estimator on a trivial two-blob data set."""

    def setUp(self):
        """Create two tight blobs and the reference V-measure score."""
        self.n_clusters = 2
        self.centers = np.array([[3, 3], [-3, -3]]) + 10
        blobs = make_blobs(n_samples=60, n_features=2,
                           centers=self.centers, cluster_std=0.4,
                           shuffle=True, random_state=0)
        self.X, self.y = blobs
        # Score of the ground truth against itself is the target value.
        self.v = v_measure_score(self.y, self.y)

    def tearDown(self):
        """Drop the fixture attributes created in ``setUp``."""
        del self.n_clusters
        del self.centers
        del self.X

    def test_copac(self):
        """Minimal test that COPAC runs at all."""
        params = {"k": 40, "mu": 10, "eps": 2, "alpha": 0.85}

        predicted = COPAC(**params).fit_predict(self.X)
        score = v_measure_score(self.y, predicted)
        # Must score perfectly on very simple data
        assert_equal(self.v, score)

        # A freshly fitted estimator must expose the same labels_
        refit = COPAC(**params)
        refit.fit(self.X)
        assert_array_equal(refit.labels_, predicted)

if __name__ == "__main__":
unittest.main()
@pytest.mark.parametrize("return_core_pts", [True, False])
def test_copac(return_core_pts):
    """Minimal test that COPAC runs at all and labels simple data perfectly.

    Parameters
    ----------
    return_core_pts : bool
        Forwarded to ``COPAC.fit_predict``. When true, the call returns a
        ``(labels, core_point_indices)`` pair instead of the labels alone.
    """
    # Set up very simple data set: two tight blobs
    centers = np.array([[3, 3], [-3, -3]]) + 10
    X, y = make_blobs(n_samples=60, n_features=2,
                      centers=centers, cluster_std=0.4,
                      shuffle=True, random_state=0)
    # Score of the ground truth against itself is the target value.
    v_true = v_measure_score(y, y)

    params = dict(k=40, mu=10, eps=2, alpha=0.85)

    result = COPAC(**params).fit_predict(X, return_core_pts=return_core_pts)
    if return_core_pts:
        y_pred, core_pts_ind = result
        assert isinstance(core_pts_ind, dict)
    else:
        y_pred = result

    v_pred = v_measure_score(y, y_pred)
    # Must score perfectly on very simple data
    np.testing.assert_equal(v_true, v_pred)

    # Check correct labels_ attribute on a freshly fitted estimator
    refit = COPAC(**params)
    refit.fit(X)
    np.testing.assert_array_equal(refit.labels_, y_pred)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ universal = true

[metadata]
name = COPAC
version = 0.3.0
version = attr: copac.__init__.__version__
author = Roman Feldbauer
author_email = [email protected]
url = https://github.com/VarIr/copac
Expand Down

0 comments on commit 8326af8

Please sign in to comment.