diff --git a/Orange/clustering/louvain.py b/Orange/clustering/louvain.py
new file mode 100644
index 00000000000..b565b1d1fbe
--- /dev/null
+++ b/Orange/clustering/louvain.py
@@ -0,0 +1,123 @@
+"""Python port of the Louvain clustering algorithm, available at
+https://github.com/taynaud/python-louvain
+
+Original C++ implementation available at
+https://sites.google.com/site/findcommunities/
+
+"""
+
+import numpy as np
+import networkx as nx
+from community import best_partition
+from sklearn.neighbors import NearestNeighbors
+
+import Orange
+from Orange.data import Table
+
+
+def jaccard(x, y):
+    # type: (set, set) -> float
+    """Compute the Jaccard similarity between two sets."""
+    return len(x & y) / len(x | y)
+
+
+def table_to_knn_graph(data, k_neighbors, metric, progress_callback=None):
+    """Convert tabular data to a graph using a nearest-neighbors approach with
+    the Jaccard similarity as the edge weights.
+
+    Parameters
+    ----------
+    data : Union[Table, np.ndarray]
+    k_neighbors : int
+    metric : str
+        A distance metric supported by sklearn.
+    progress_callback : Callable[[float], None]
+
+    Returns
+    -------
+    nx.Graph
+
+    """
+    # Accept either an Orange Table or a raw numpy array
+    if isinstance(data, Table):
+        data = data.X
+    # Note: each point's nearest neighbor is itself, so the query point
+    # appears among the returned neighbors
+    knn = NearestNeighbors(n_neighbors=k_neighbors, metric=metric).fit(data)
+    nearest_neighbors = knn.kneighbors(data, return_distance=False)
+    # Convert to list of sets so jaccard can be computed efficiently
+    nearest_neighbors = list(map(set, nearest_neighbors))
+    num_nodes = len(nearest_neighbors)
+
+    # Create an empty graph and add all the data ids as nodes for easy mapping
+    graph = nx.Graph()
+    graph.add_nodes_from(range(len(data)))
+
+    for idx, node in enumerate(graph.nodes):
+        if progress_callback:
+            progress_callback(idx / num_nodes)
+
+        for neighbour in nearest_neighbors[node]:
+            graph.add_edge(node, neighbour, weight=jaccard(
+                nearest_neighbors[node], nearest_neighbors[neighbour]))
+
+    return graph
+
+
+class Louvain:
+    preprocessors = [Orange.preprocess.Continuize(),
+                     Orange.preprocess.SklImpute()]
+
+    def __init__(self, k_neighbors=30, metric='l2', resolution=1., preprocessors=None):
+        """Louvain clustering for community detection in graphs.
+
+        Louvain clustering is a community detection algorithm for detecting
+        clusters or "communities" in graphs. As such, tabular data must first
+        be converted into graph form. This is typically done by computing the
+        KNN graph on the input data.
+
+        Parameters
+        ----------
+        k_neighbors : Optional[int]
+            The number of nearest neighbors to use for the KNN graph if
+            tabular data is passed.
+        metric : Optional[str]
+            The metric to use to compute the nearest neighbors.
+        resolution : Optional[float]
+            The resolution is a parameter of the Louvain method that affects
+            the size of the recovered clusters.
+
+        """
+        if preprocessors is None:
+            preprocessors = type(self).preprocessors
+        self.preprocessors = tuple(preprocessors)
+
+        self.k_neighbors = k_neighbors
+        self.metric = metric
+        self.resolution = resolution
+
+        self.labels = None
+
+    def __call__(self, data):
+        data = self.preprocess(data)
+        return self.fit_predict(data.X, data.Y)
+
+    def preprocess(self, data):
+        for pp in self.preprocessors:
+            data = pp(data)
+        return data
+
+    def fit(self, X, y=None):
+        # If we are given a table, we have to convert it to a graph first
+        if isinstance(X, Table):
+            graph = table_to_knn_graph(X.X, metric=self.metric, k_neighbors=self.k_neighbors)
+        # Same goes for a matrix
+        elif isinstance(X, np.ndarray):
+            graph = table_to_knn_graph(X, metric=self.metric, k_neighbors=self.k_neighbors)
+        elif isinstance(X, nx.Graph):
+            graph = X
+
+        # best_partition maps node id -> community id; sort by node id and
+        # keep only the community labels
+        partition = best_partition(graph, resolution=self.resolution)
+        partition = np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int)
+
+        self.labels = partition
+
+    def fit_predict(self, X, y=None):
+        self.fit(X, y)
+        return self.labels
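The module above can also be used on its own, outside the widget. A minimal usage sketch, assuming the patch is applied and the new dependencies (networkx, python-louvain) are installed; the dataset and variable names are illustrative, not part of the patch:

    from Orange.data import Table
    from Orange.clustering.louvain import Louvain, table_to_knn_graph

    data = Table('iris')

    # High-level API: preprocess, build the k-NN graph, detect communities
    louvain = Louvain(k_neighbors=30, metric='l2', resolution=1.)
    labels = louvain(data)          # one integer community label per row

    # Lower-level API: build the Jaccard-weighted k-NN graph explicitly
    graph = table_to_knn_graph(data, k_neighbors=30, metric='l2')
    labels = louvain.fit_predict(graph)
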
+ + """ + if preprocessors is None: + preprocessors = type(self).preprocessors + self.preprocessors = tuple(preprocessors) + + self.k_neighbors = k_neighbors + self.metric = metric + self.resolution = resolution + + self.labels = None + + def __call__(self, data): + data = self.preprocess(data) + return self.fit_predict(data.X, data.Y) + + def preprocess(self, data): + for pp in self.preprocessors: + data = pp(data) + return data + + def fit(self, X, y=None): + # If we are given a table, we have to convert it to a graph first + if isinstance(X, Table): + graph = table_to_knn_graph(X.X, metric=self.metric, k_neighbors=self.k_neighbors) + # Same goes for a matrix + elif isinstance(X, np.ndarray): + graph = table_to_knn_graph(X, metric=self.metric, k_neighbors=self.k_neighbors) + elif isinstance(X, nx.Graph): + graph = X + + partition = best_partition(graph, resolution=self.resolution) + partition = np.fromiter(list(zip(*sorted(partition.items())))[1], dtype=int) + + self.labels = partition + + def fit_predict(self, X, y=None): + self.fit(X, y) + return self.labels diff --git a/Orange/widgets/unsupervised/icons/LouvainClustering.svg b/Orange/widgets/unsupervised/icons/LouvainClustering.svg new file mode 100644 index 00000000000..aec85698153 --- /dev/null +++ b/Orange/widgets/unsupervised/icons/LouvainClustering.svg @@ -0,0 +1,25 @@ + + + + + + + + + + + + + + + + + + diff --git a/Orange/widgets/unsupervised/owlouvainclustering.py b/Orange/widgets/unsupervised/owlouvainclustering.py new file mode 100644 index 00000000000..2bd4c923e90 --- /dev/null +++ b/Orange/widgets/unsupervised/owlouvainclustering.py @@ -0,0 +1,376 @@ +from collections import deque +from concurrent.futures import Future # pylint: disable=unused-import +from types import SimpleNamespace as namespace +from typing import Optional # pylint: disable=unused-import + +import numpy as np +import networkx as nx # pylint: disable=unused-import +from AnyQt.QtCore import Qt, pyqtSignal as Signal, QObject +from AnyQt.QtWidgets import QSlider, QCheckBox, QWidget # pylint: disable=unused-import + +from Orange.clustering.louvain import table_to_knn_graph, Louvain +from Orange.data import Table, DiscreteVariable +from Orange.projection import PCA +from Orange.widgets import widget, gui, report +from Orange.widgets.settings import DomainContextHandler, ContextSetting, \ + Setting +from Orange.widgets.utils.annotated_data import get_next_name, add_columns, \ + ANNOTATED_DATA_SIGNAL_NAME +from Orange.widgets.utils.concurrent import ThreadExecutor +from Orange.widgets.utils.signals import Input, Output +from Orange.widgets.widget import Msg + +try: + from orangecontrib.network.network import Graph +except ImportError: + Graph = None + + +_MAX_PCA_COMPONENTS = 50 +_DEFAULT_PCA_COMPONENTS = 25 +_MAX_K_NEIGBOURS = 200 +_DEFAULT_K_NEIGHBORS = 30 + + +METRICS = [('Euclidean', 'l2'), ('Manhattan', 'l1')] + + +class TaskQueue(QObject): + """Not really a task queue `per-se`. 
+
+
+class OWLouvainClustering(widget.OWWidget):
+    name = 'Louvain Clustering'
+    description = 'Detects communities in a network of nearest neighbors.'
+    icon = 'icons/LouvainClustering.svg'
+    priority = 2110
+
+    want_main_area = False
+
+    settingsHandler = DomainContextHandler()
+
+    class Inputs:
+        data = Input('Data', Table, default=True)
+
+    if Graph is not None:
+        class Outputs:
+            annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table, default=True)
+            graph = Output('Network', Graph)
+    else:
+        class Outputs:
+            annotated_data = Output(ANNOTATED_DATA_SIGNAL_NAME, Table, default=True)
+
+    apply_pca = ContextSetting(True)
+    pca_components = ContextSetting(_DEFAULT_PCA_COMPONENTS)
+    metric_idx = ContextSetting(0)
+    k_neighbors = ContextSetting(_DEFAULT_K_NEIGHBORS)
+    resolution = ContextSetting(1.)
+    auto_commit = Setting(True)
+
+    class Error(widget.OWWidget.Error):
+        empty_dataset = Msg('No features in data')
+        general_error = Msg('Error occurred during clustering\n{}')
+
+    def __init__(self):
+        super().__init__()
+
+        self.data = None  # type: Optional[Table]
+        self.preprocessed_data = None  # type: Optional[Table]
+        self.pca_projection = None  # type: Optional[Table]
+        self.graph = None  # type: Optional[nx.Graph]
+        self.partition = None  # type: Optional[np.ndarray]
+
+        self.__executor = ThreadExecutor(parent=self)
+        self.__future = None  # type: Optional[Future]
+        self.__queue = None  # type: Optional[TaskQueue]
+
+        pca_box = gui.vBox(self.controlArea, 'PCA Preprocessing')
+        self.apply_pca_cbx = gui.checkBox(
+            pca_box, self, 'apply_pca', label='Apply PCA preprocessing',
+            callback=self._update_graph,
+        )  # type: QCheckBox
+        self.pca_components_slider = gui.hSlider(
+            pca_box, self, 'pca_components', label='Components: ', minValue=2,
+            maxValue=_MAX_PCA_COMPONENTS, callback=self._update_pca_components,
+            tracking=False
+        )  # type: QSlider
+
+        graph_box = gui.vBox(self.controlArea, 'Graph parameters')
+        self.metric_combo = gui.comboBox(
+            graph_box, self, 'metric_idx', label='Distance metric',
+            items=[m[0] for m in METRICS], callback=self._update_graph,
+            orientation=Qt.Horizontal,
+        )  # type: gui.OrangeComboBox
+        self.k_neighbors_spin = gui.spin(
+            graph_box, self, 'k_neighbors', minv=1, maxv=_MAX_K_NEIGBOURS,
+            label='k neighbors', controlWidth=80, alignment=Qt.AlignRight,
+            callback=self._update_graph,
+        )  # type: gui.SpinBoxWFocusOut
+        self.resolution_spin = gui.hSlider(
+            graph_box, self, 'resolution', minValue=0, maxValue=5., step=1e-1,
+            label='Resolution', intOnly=False, labelFormat='%.1f',
+            callback=self._update_resolution, tracking=False,
+        )  # type: QSlider
+        self.resolution_spin.parent().setToolTip(
+            'The resolution parameter affects the number of clusters to find. '
+            'Smaller values tend to produce more clusters and larger values '
+            'retrieve fewer clusters.'
+        )
+
+        self.apply_button = gui.auto_commit(
+            self.controlArea, self, 'auto_commit', 'Apply', box=None,
+            commit=self.commit,
+        )  # type: QWidget
+
+    def _update_graph(self):
+        self._invalidate_graph()
+        self.commit()
+
+    def _update_pca_components(self):
+        self._invalidate_pca_projection()
+        self.commit()
+
+    def _update_resolution(self):
+        self._invalidate_partition()
+        self.commit()
+
+    def _compute_pca_projection(self):
+        if self.pca_projection is None and self.apply_pca:
+            self.setStatusMessage('Computing PCA...')
+
+            pca = PCA(n_components=self.pca_components, random_state=0)
+            model = pca(self.preprocessed_data)
+            self.pca_projection = model(self.preprocessed_data)
+
+    def _compute_graph(self, progress_callback=None):
+        if self.graph is None:
+            self.setStatusMessage('Building graph...')
+
+            data = self.pca_projection if self.apply_pca else self.preprocessed_data
+
+            self.graph = table_to_knn_graph(
+                data, k_neighbors=self.k_neighbors,
+                metric=METRICS[self.metric_idx][1],
+                progress_callback=progress_callback,
+            )
+
+    def _compute_partition(self):
+        if self.partition is None:
+            self.setStatusMessage('Detecting communities...')
+            self.setBlocking(True)
+
+            louvain = Louvain(resolution=self.resolution)
+            self.partition = louvain.fit_predict(self.graph)
+
+    def _processing_complete(self):
+        self.setStatusMessage('')
+        self.setBlocking(False)
+        self.progressBarFinished()
+
+    def _handle_exceptions(self, ex):
+        self.Error.general_error(str(ex))
+
+    def cancel(self):
+        """Cancel any running jobs."""
+        if self.__future is not None:
+            assert self.__queue is not None
+            self.__queue.cancel()
+            self.__queue = None
+            self.__future.cancel()
+            self.__future = None
+
+    def commit(self):
+        self.Error.clear()
+        # Kill any running jobs
+        self.cancel()
+
+        if self.data is None:
+            return
+
+        # Make sure the dataset is ok
+        if len(self.data.domain.attributes) < 1:
+            self.Error.empty_dataset()
+            return
+
+        # Preprocess the dataset
+        if self.preprocessed_data is None:
+            louvain = Louvain()
+            self.preprocessed_data = louvain.preprocess(self.data)
+
+        # Prepare the tasks to run
+        queue = TaskQueue(parent=self)
+
+        if self.pca_projection is None and self.apply_pca:
+            queue.push(namespace(task=self._compute_pca_projection))
+
+        if self.graph is None:
+            queue.push(namespace(task=self._compute_graph, progress_callback=True))
+
+        if self.partition is None:
+            queue.push(namespace(task=self._compute_partition))
+
+        # Prepare callbacks
+        queue.on_progress.connect(lambda val: self.progressBarSet(100 * val))
+        queue.on_complete.connect(self._processing_complete)
+        queue.on_complete.connect(self._send_data)
+        queue.on_exception.connect(self._handle_exceptions)
+        self.__queue = queue
+
+        # Run the task queue
+        self.progressBarInit()
+        self.setBlocking(True)
+        self.__future = self.__executor.submit(queue.start)
+
+    def _send_data(self):
+        if self.partition is None or self.data is None:
+            return
+        domain = self.data.domain
+        # Compute the frequency of each cluster index
+        counts = np.bincount(self.partition)
+        # Relabel the clusters so that they are numbered by decreasing size
+        indices = np.argsort(counts)[::-1]
+        index_map = {n: o for n, o in zip(indices, range(len(indices)))}
+        new_partition = list(map(index_map.get, self.partition))
+
+        cluster_var = DiscreteVariable(
+            get_next_name(domain, 'Cluster'),
+            values=['C%d' % (i + 1) for i, _ in enumerate(np.unique(new_partition))]
+        )
+
+        new_domain = add_columns(domain, metas=[cluster_var])
+        new_table = self.data.transform(new_domain)
+        new_table.get_column_view(cluster_var)[0][:] = new_partition
+
+        self.Outputs.annotated_data.send(new_table)
+
+        if Graph is not None:
+            graph = Graph(self.graph)
+            graph.set_items(new_table)
+            self.Outputs.graph.send(graph)
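To make the relabelling in `_send_data` concrete, here is a small standalone sketch with made-up numbers (not taken from the patch):

    import numpy as np

    partition = np.array([2, 0, 2, 2, 1, 1])   # raw community ids from Louvain
    counts = np.bincount(partition)            # [1, 2, 3]
    indices = np.argsort(counts)[::-1]         # [2, 1, 0] -- biggest community first
    index_map = {n: o for n, o in zip(indices, range(len(indices)))}
    new_partition = list(map(index_map.get, partition))
    print(new_partition)                       # [0, 2, 0, 0, 1, 1] -> C1 is the largest cluster
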
+
+    def _invalidate_pca_projection(self):
+        self.pca_projection = None
+        self._invalidate_graph()
+
+    def _invalidate_graph(self):
+        self.graph = None
+        self._invalidate_partition()
+
+    def _invalidate_partition(self):
+        self.partition = None
+
+    @Inputs.data
+    def set_data(self, data):
+        self.closeContext()
+        self.Error.clear()
+
+        prev_data, self.data = self.data, data
+        self.openContext(self.data)
+
+        # If X hasn't changed, there's no reason to recompute clusters
+        if prev_data and self.data and np.array_equal(self.data.X, prev_data.X):
+            if self.auto_commit:
+                self._send_data()
+            return
+
+        # Clear the outputs
+        self.Outputs.annotated_data.send(None)
+        if Graph is not None:
+            self.Outputs.graph.send(None)
+
+        # Clear internal state
+        self.preprocessed_data = None
+        self._invalidate_pca_projection()
+        if self.data is None:
+            return
+
+        # Can't have more PCA components than the number of attributes
+        n_attrs = len(data.domain.attributes)
+        self.pca_components_slider.setMaximum(min(_MAX_PCA_COMPONENTS, n_attrs))
+        self.pca_components_slider.setValue(min(_DEFAULT_PCA_COMPONENTS, n_attrs))
+        # Can't have more k neighbors than there are data points
+        self.k_neighbors_spin.setMaximum(min(_MAX_K_NEIGBOURS, len(data) - 1))
+        self.k_neighbors_spin.setValue(min(_DEFAULT_K_NEIGHBORS, len(data) - 1))
+
+        self.commit()
+
+    def onDeleteWidget(self):
+        self.cancel()
+        super().onDeleteWidget()
+
+    def send_report(self):
+        pca = report.bool_str(self.apply_pca)
+        if self.apply_pca:
+            pca += report.plural(', {number} component{s}', self.pca_components)
+
+        self.report_items((
+            ('PCA preprocessing', pca),
+            ('Metric', METRICS[self.metric_idx][0]),
+            ('k neighbors', self.k_neighbors),
+            ('Resolution', self.resolution),
+        ))
+
+
+if __name__ == '__main__':
+    from AnyQt.QtWidgets import QApplication  # pylint: disable=ungrouped-imports
+    import sys
+
+    app = QApplication(sys.argv)
+    ow = OWLouvainClustering()
+    ow.resetSettings()
+
+    ow.set_data(Table(sys.argv[1] if len(sys.argv) > 1 else 'iris'))
+    ow.show()
+    app.exec_()
diff --git a/Orange/widgets/unsupervised/tests/test_owlouvain.py b/Orange/widgets/unsupervised/tests/test_owlouvain.py
new file mode 100644
index 00000000000..f73832a8903
--- /dev/null
+++ b/Orange/widgets/unsupervised/tests/test_owlouvain.py
@@ -0,0 +1,96 @@
+from unittest.mock import patch
+
+import numpy as np
+
+from Orange.data import Table, Domain, ContinuousVariable
+from Orange.widgets.tests.base import WidgetTest
+from Orange.widgets.unsupervised.owlouvainclustering import OWLouvainClustering
+
+# Deterministic tests
+np.random.seed(42)
+
+
+class TestOWLouvain(WidgetTest):
+    def setUp(self):
+        self.widget = self.create_widget(
+            OWLouvainClustering, stored_settings={'auto_commit': False}
+        )
+        self.iris = Table('iris')
+
+    def tearDown(self):
+        self.widget.onDeleteWidget()
+        super().tearDown()
+
+    def test_removing_data(self):
+        self.send_signal(self.widget.Inputs.data, self.iris)
+        self.commit_and_wait()
+        self.send_signal(self.widget.Inputs.data, None)
+        self.commit_and_wait()
+
+    def test_clusters_ordered_by_size(self):
+        """Cluster names should be sorted based on the number of instances."""
+        x1 = np.array([[0, 0]] * 20)
+        x2 = np.array([[1, 0]] * 15)
+        x3 = np.array([[0, 1]] * 10)
+        x4 = np.array([[1, 1]] * 5)
+        data = np.vstack((x1, x2, x3, x4))
+        # Remove any order dependence in the data, not that it should matter
+        np.random.shuffle(data)
+
+        table = Table.from_numpy(domain=Domain.from_numpy(X=data), X=data)
+
+        self.send_signal(self.widget.Inputs.data, table)
+        self.widget.k_neighbors = 4
+        self.commit_and_wait()
+        output = self.get_output(self.widget.Outputs.annotated_data)
+
+        clustering = output.get_column_view('Cluster')[0].astype(int)
+        counts = np.bincount(clustering)
+        np.testing.assert_equal(counts, sorted(counts, reverse=True))
+
+    def test_empty_dataset(self):
+        # Prepare a table with 5 rows with only meta attributes
+        meta = np.array([0] * 5)
+        meta_var = ContinuousVariable(name='meta_var')
+        table = Table.from_domain(domain=Domain([], metas=[meta_var]), n_rows=5)
+        table.get_column_view(meta_var)[0][:] = meta
+
+        self.send_signal(self.widget.Inputs.data, table)
+        self.commit_and_wait()
+        self.assertTrue(self.widget.Error.empty_dataset.is_shown())
+
+    def test_do_not_recluster_on_same_data(self):
+        """Do not recluster data points when only targets or metas change."""
+
+        # Prepare some dummy data
+        x = np.eye(5)
+        y1, y2 = np.ones((5, 1)), np.ones((5, 2))
+        meta1, meta2 = np.ones((5, 1)), np.ones((5, 2))
+
+        table1 = Table.from_numpy(
+            domain=Domain.from_numpy(X=x, Y=y1, metas=meta1),
+            X=x, Y=y1, metas=meta1,
+        )
+        # X is the same, should not cause an update
+        table2 = Table.from_numpy(
+            domain=Domain.from_numpy(X=x, Y=y2, metas=meta2),
+            X=x, Y=y2, metas=meta2,
+        )
+        # X is different, should cause an update
+        table3 = table1.copy()
+        table3.X[:, 0] = 1
+
+        with patch.object(self.widget, 'commit') as commit:
+            self.send_signal(self.widget.Inputs.data, table1)
+            self.commit_and_wait()
+            call_count = commit.call_count
+
+            # Sending data with the same X should not recompute the clustering
+            self.send_signal(self.widget.Inputs.data, table2)
+            self.commit_and_wait()
+            self.assertEqual(call_count, commit.call_count)
+
+            # Sending data with a different X should recompute the clustering
+            self.send_signal(self.widget.Inputs.data, table3)
+            self.commit_and_wait()
+            self.assertEqual(call_count + 1, commit.call_count)
diff --git a/doc/visual-programming/source/index.rst b/doc/visual-programming/source/index.rst
index d8d558fa5b0..54f572c8033 100644
--- a/doc/visual-programming/source/index.rst
+++ b/doc/visual-programming/source/index.rst
@@ -112,6 +112,7 @@ Unsupervised
    widgets/unsupervised/savedistancematrix
    widgets/unsupervised/hierarchicalclustering
    widgets/unsupervised/kmeansclustering
+   widgets/unsupervised/louvainclustering
    widgets/unsupervised/mds
    widgets/unsupervised/manifoldlearning
diff --git a/doc/visual-programming/source/widgets/unsupervised/images/Louvain-Example.png b/doc/visual-programming/source/widgets/unsupervised/images/Louvain-Example.png
new file mode 100644
index 00000000000..fbb98caa6b1
Binary files /dev/null and b/doc/visual-programming/source/widgets/unsupervised/images/Louvain-Example.png differ
diff --git a/doc/visual-programming/source/widgets/unsupervised/images/Louvain-stamped.png b/doc/visual-programming/source/widgets/unsupervised/images/Louvain-stamped.png
new file mode 100644
index 00000000000..ec249c930cd
Binary files /dev/null and b/doc/visual-programming/source/widgets/unsupervised/images/Louvain-stamped.png differ
diff --git a/doc/visual-programming/source/widgets/unsupervised/louvainclustering.rst b/doc/visual-programming/source/widgets/unsupervised/louvainclustering.rst
new file mode 100644
index 00000000000..2849ec89f3d
--- /dev/null
+++ b/doc/visual-programming/source/widgets/unsupervised/louvainclustering.rst
@@ -0,0 +1,45 @@
+Louvain Clustering
+==================
+
+Groups items using the Louvain clustering algorithm.
+
+Inputs
+    Data
+        input dataset
+
+Outputs
+    Data
+        dataset with the cluster index as a meta attribute
+    Graph (with the Network add-on)
+        the weighted k-nearest neighbor graph
+
+
+The widget first converts the input data into a k-nearest neighbor graph. To preserve the notion of distance, the Jaccard index for the number of shared neighbors is used to weight the edges. Finally, a `modularity optimization <https://en.wikipedia.org/wiki/Louvain_method>`_ community detection algorithm is applied to the graph to retrieve clusters of highly interconnected nodes. The widget outputs a new dataset in which the cluster index is used as a meta attribute.
+
+
+.. figure:: images/Louvain-stamped.png
+
+1. PCA preprocessing is typically applied to the original data to remove noise.
+2. The distance metric used for finding the specified number of nearest
+   neighbors.
+3. The number of nearest neighbors used to form the KNN graph.
+4. Resolution is a parameter of the Louvain community detection algorithm that
+   affects the size of the recovered clusters. Smaller resolutions recover
+   smaller clusters, and therefore more of them; conversely, larger values
+   recover clusters containing more data points.
+5. When *Apply Automatically* is ticked, the widget automatically
+   communicates all changes. Alternatively, click *Apply*.
+
+Example
+-------
+
+*Louvain Clustering* converts the dataset into a graph, in which it then finds clusters of highly interconnected nodes. We can visualize the graph itself using the *Network Explorer* from the Network add-on.
+
+.. figure:: images/Louvain-Example.png
+
+References
+----------
+
+Blondel, Vincent D., et al. "Fast unfolding of communities in large networks." Journal of Statistical Mechanics: Theory and Experiment 2008.10 (2008): P10008.
+
+Lambiotte, Renaud, J-C. Delvenne, and Mauricio Barahona. "Laplacian dynamics and multiscale modular structure in networks." arXiv preprint arXiv:0812.1770 (2008).
diff --git a/requirements-core.txt b/requirements-core.txt
index 449150e016c..dacd09dd2ba 100644
--- a/requirements-core.txt
+++ b/requirements-core.txt
@@ -13,3 +13,5 @@ keyring
 keyrings.alt # for alternative keyring implementations
 setuptools>=36.3
 serverfiles # for Data Sets synchronization
+networkx
+python-louvain
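The pipeline described in the Louvain Clustering documentation added above — a k-nearest-neighbor graph with Jaccard-weighted edges followed by modularity-based community detection — can also be sketched directly against scikit-learn, networkx and python-louvain, independently of the new Orange module; this is an illustrative sketch and the dataset, variable names and parameter values are assumptions, not part of the patch:

    import numpy as np
    import networkx as nx
    from community import best_partition
    from sklearn.datasets import load_iris
    from sklearn.neighbors import NearestNeighbors

    X = load_iris().data
    k = 30

    # 1. k-nearest-neighbor search
    nn = NearestNeighbors(n_neighbors=k, metric='l2').fit(X)
    neighbors = [set(row) for row in nn.kneighbors(X, return_distance=False)]

    # 2. Build the graph with Jaccard-weighted edges (shared-neighbor overlap)
    graph = nx.Graph()
    graph.add_nodes_from(range(len(X)))
    for i, hood in enumerate(neighbors):
        for j in hood:
            weight = len(hood & neighbors[j]) / len(hood | neighbors[j])
            graph.add_edge(i, j, weight=weight)

    # 3. Modularity optimization (Louvain) to recover communities
    partition = best_partition(graph, resolution=1.)
    labels = np.array([partition[node] for node in range(len(X))])
    print(np.bincount(labels))   # cluster sizes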