Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FEAT Extract core points #17

Merged
merged 7 commits into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions copac/__init__.py
Original file line number Diff line number Diff line change
@@ -1,8 +1 @@
from .copac import COPAC, copac

__all__ = [
'COPAC',
'copac',
'__version__',
]
__version__ = "0.3.0"
76 changes: 51 additions & 25 deletions copac/copac.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
COPAC: Correlation Partition Clustering
"""

# Author: Roman Feldbauer <[email protected]>
# Author: Roman Feldbauer <[email protected]>
# Elisabeth Hartel
# Jiri Mauritz <jirmauritz at gmail dot com>
# Thomas Turic <[email protected]>
Expand All @@ -16,10 +16,7 @@
from scipy.spatial.distance import squareform

from sklearn.base import BaseEstimator, ClusterMixin
try: # for sklearn < 0.23
from sklearn.cluster.dbscan_ import dbscan
except: # for sklearn >= 0.23
from sklearn.cluster._dbscan import dbscan
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import check_array

Expand All @@ -41,9 +38,12 @@ def _cdist(P, Q, Mhat_P):
return (PQ_diff @ Mhat_P * PQ_diff).sum(axis=1)


def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
metric_params=None, algorithm='auto', leaf_size=30, p=None,
n_jobs=1, sample_weight=None):
def copac(X: np.ndarray, *,
k: int = 10, mu: int = 5, eps: float = 0.5, alpha: float = 0.85,
metric: str = 'euclidean', metric_params=None,
algorithm: str = 'auto', leaf_size: int = 30, p: float = None,
n_jobs: int = 1, sample_weight: np.ndarray=None,
return_core_pts: bool = False):
"""Perform COPAC clustering from vector array.

Parameters
Expand Down Expand Up @@ -84,12 +84,16 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
n_jobs : int, optional, default=1
Number of parallel processes. Use all cores with n_jobs=-1.
sample_weight : None
Currently ignored
Sample weights
return_core_pts : bool
If True, return the cluster labels together with the core point indices for each correlation dimension.

Returns
-------
labels : array [n_samples]
Cluster labels for each point. Noisy samples are given the label -1.
core_pts_ind : dict[int, array]
Indices of core points for each correlation dimension (only if ``return_core_pts=True``).

References
----------
Expand All @@ -99,9 +103,9 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
Conference on Data Mining, April 26-28, 2007, Minneapolis,
Minnesota, USA (2007), pp. 413–418.
"""
X = check_array(X)
n, d = X.shape
y = -np.ones(n, dtype=np.int)
data_dtype = X.dtype
y = -np.ones(n, dtype=int)
if n_jobs == -1:
n_jobs = cpu_count()

Expand Down Expand Up @@ -141,10 +145,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
# Loop over partitions according to local corr. dim.
max_label = 0
used_y = np.zeros_like(y, dtype=int)
for D in Ds:
core_pts = {}
for dim, D in enumerate(Ds, start=1):
n_D = D.shape[0]
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=np.float)
cdist_Q = -np.ones((n_D, n_D), dtype=np.float)
cdist_P = -np.ones(n_D * (n_D - 1) // 2, dtype=data_dtype)
cdist_Q = -np.ones((n_D, n_D), dtype=data_dtype)
start = 0
# Calculate triu part of distance matrix
for i in range(0, n_D - 1):
Expand All @@ -168,9 +173,10 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',

# Perform DBSCAN with full distance matrix
cdist = squareform(cdist)
clust = dbscan(X=cdist, eps=eps, min_samples=mu,
metric='precomputed', n_jobs=n_jobs)
_, labels = clust
dbscan = DBSCAN(eps=eps, min_samples=mu, metric="precomputed", n_jobs=n_jobs)
labels = dbscan.fit_predict(X=cdist, sample_weight=sample_weight)
core_pts[dim] = dbscan.core_sample_indices_

# Each DBSCAN run is unaware of previous ones,
# so we need to keep track of previous copac IDs
y_D = labels + max_label
Expand All @@ -180,7 +186,11 @@ def copac(X, k=10, mu=5, eps=0.5, alpha=0.85, metric='euclidean',
y[D] = y_D
used_y[D] += 1
assert np.all(used_y == 1), "Not all samples were handled exactly once!"
return y

if return_core_pts:
return y, core_pts
else:
return y


class COPAC(BaseEstimator, ClusterMixin):
Expand Down Expand Up @@ -254,7 +264,7 @@ def __init__(self, k=10, mu=5, eps=0.5, alpha=0.85,
self.p = p
self.n_jobs = n_jobs

def fit(self, X, y=None, sample_weight=None):
def fit(self, X, y=None, sample_weight=None, return_core_pts=False):
"""Perform COPAC clustering from features.

Parameters
Expand All @@ -268,14 +278,25 @@ def fit(self, X, y=None, sample_weight=None):
Note that weights are absolute, and default to 1.
Forwarded to the DBSCAN run on each correlation-dimension partition.
y : Ignored
return_core_pts : bool
If True, additionally store the core point indices per correlation dimension in ``core_point_indices_``.
"""
X = check_array(X)
clust = copac(X, sample_weight=sample_weight,
**self.get_params())
X: np.ndarray = check_array(X)
result = copac(
X=X,
sample_weight=sample_weight,
return_core_pts=return_core_pts,
**self.get_params(),
)
if return_core_pts:
clust, core_pts = result
self.core_point_indices_ = core_pts
else:
clust = result
self.labels_ = clust
return self

def fit_predict(self, X, y=None, sample_weight=None):
def fit_predict(self, X, y=None, sample_weight=None, return_core_pts=False):
"""Performs clustering on X and returns copac labels.

Parameters
Expand All @@ -289,11 +310,16 @@ def fit_predict(self, X, y=None, sample_weight=None):
Note that weights are absolute, and default to 1.
Forwarded to the DBSCAN run on each correlation-dimension partition.
y : Ignored
return_core_pts : bool
If True, return a tuple ``(labels, core_point_indices)`` instead of the labels alone; the core point indices are given per correlation dimension.

Returns
-------
y : ndarray, shape (n_samples,)
copac labels
"""
self.fit(X, sample_weight=sample_weight)
return self.labels_
self.fit(X, sample_weight=sample_weight, return_core_pts=return_core_pts)
if return_core_pts:
return self.labels_, self.core_point_indices_
else:
return self.labels_
66 changes: 30 additions & 36 deletions copac/tests/test_copac.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,40 @@
"""
Testing for Clustering methods
"""
import unittest
import pytest

import numpy as np

from sklearn.metrics.cluster import v_measure_score
from sklearn.utils.testing import assert_equal, assert_array_equal
from sklearn.datasets.samples_generator import make_blobs
from sklearn.datasets import make_blobs

from ..copac import COPAC


class TestCopac(unittest.TestCase):

def setUp(self):
""" Set up very simple data set """
self.n_clusters = 2
self.centers = np.array([[3, 3], [-3, -3]]) + 10
self.X, self.y = make_blobs(n_samples=60, n_features=2,
centers=self.centers, cluster_std=0.4,
shuffle=True, random_state=0)
self.v = v_measure_score(self.y, self.y)

def tearDown(self):
del self.n_clusters, self.centers, self.X

def test_copac(self):
""" Minimal test that COPAC runs at all. """
k = 40
mu = 10
eps = 2
alpha = 0.85
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha)
y_pred = copac.fit_predict(self.X)
v = v_measure_score(self.y, y_pred)
# Must score perfectly on very simple data
assert_equal(self.v, v)
# Check correct labels_ attribute
copac = COPAC(k=k, mu=mu, eps=eps, alpha=alpha)
copac.fit(self.X)
assert_array_equal(copac.labels_, y_pred)

if __name__ == "__main__":
unittest.main()
@pytest.mark.parametrize("return_core_pts", [True, False])
def test_copac(return_core_pts):
    """Smoke-test COPAC on a tiny, well-separated two-blob data set.

    The test runs once per value of ``return_core_pts`` and checks that

    * ``fit_predict`` yields labels whose V-measure against the ground
      truth equals the V-measure of the ground truth against itself,
    * with ``return_core_pts=True`` the second returned item is a dict,
    * a freshly fitted estimator exposes the same labels via ``labels_``.
    """
    # Very simple data set: two tight blobs around (13, 13) and (7, 7).
    n_clusters = 2
    blob_centers = np.array([[3, 3], [-3, -3]]) + 10
    X, y = make_blobs(
        n_samples=60,
        n_features=2,
        centers=blob_centers,
        cluster_std=0.4,
        shuffle=True,
        random_state=0,
    )
    v_true = v_measure_score(y, y)

    # Hyperparameters shared by both estimator instances below.
    params = dict(k=40, mu=10, eps=2, alpha=0.85)

    result = COPAC(**params).fit_predict(X, return_core_pts=return_core_pts)
    if return_core_pts:
        # With the flag set, labels and core point indices come as a pair.
        y_pred, core_pts_ind = result
        assert isinstance(core_pts_ind, dict)
    else:
        y_pred = result

    # Must score perfectly on very simple data
    v_pred = v_measure_score(y, y_pred)
    np.testing.assert_equal(v_true, v_pred)

    # Check correct labels_ attribute on an independently fitted estimator
    refit = COPAC(**params)
    refit.fit(X)
    np.testing.assert_array_equal(refit.labels_, y_pred)
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ universal = true

[metadata]
name = COPAC
version = 0.3.0
version = attr: copac.__init__.__version__
author = Roman Feldbauer
author_email = [email protected]
url = https://github.com/VarIr/copac
Expand Down