
Commit

Fix to sort_by_size:
Faster and more efficient sorting of clusters
awnimo committed Apr 29, 2020
1 parent 696747c commit 3128fc9
Showing 4 changed files with 87 additions and 20 deletions.
39 changes: 38 additions & 1 deletion README.md
@@ -49,6 +49,10 @@ If you use PhenoGraph in work you publish, please cite our publication:
Release Notes
-------------

### Version 1.5.4

* Faster and more efficient sorting of clusters by size for large nearest-neighbour graphs, using multiprocessing and faster sorting methods.

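A minimal sketch of the chunked, multiprocessing-based size counting this release introduces (illustrative only; the helper names here are hypothetical, and the actual implementation is `sort_by_size` in `phenograph/cluster.py`, shown in the diff below):

```python
import multiprocessing as mp

import numpy as np


def chunk_labels(labels, chunk_size=5000):
    # Yield the unique cluster labels in bounded chunks so each worker
    # handles a limited slice of the label space.
    uniq = np.unique(labels)
    for i in range(0, uniq.size, chunk_size):
        yield uniq[i : i + chunk_size]


def count_chunk(labels, chunk):
    # Count how many cells carry each label in this chunk.
    return [np.count_nonzero(labels == c) for c in chunk]


if __name__ == "__main__":
    labels = np.random.randint(0, 50, size=1_000_000)
    with mp.Pool(mp.cpu_count()) as pool:
        tasks = [pool.apply_async(count_chunk, (labels, ch)) for ch in chunk_labels(labels)]
        sizes = [s for t in tasks for s in t.get()]
    order = np.argsort(sizes)[::-1]  # cluster indices, largest first
```
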
### Version 1.5.3

* PhenoGraph now supports the [**Leiden**](https://www.nature.com/articles/s41598-019-41695-z) algorithm for community detection.
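
A minimal usage sketch (assuming the `clustering_algo` keyword is the switch that selects the community-detection backend; check the `phenograph.cluster` docstring for the exact signature):

```python
import numpy as np
import phenograph

data = np.random.rand(1000, 20)  # cells x features, e.g. principal components
communities, graph, Q = phenograph.cluster(data, clustering_algo="leiden", k=30)
```
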
@@ -82,4 +86,37 @@ Release Notes

### Version 1.3

* Proper support for Linux.
* Proper support for Linux.

---
Troubleshooting
---------------

### Notebook freezes after repeated runs of PhenoGraph in Jupyter Notebook

* Running `PhenoGraph` repeatedly from a Jupyter Notebook on macOS Catalina (but not Mojave), using Python 3.7.6, causes a hang and the notebook becomes unresponsive, even for a basic matrix of nearest neighbors. The issue was not reproducible from the command-line `Python` interpreter on either Catalina or Mojave, outside of Jupyter Notebook.

It was found that plotting principal components with `matplotlib.pyplot.scatter` in a Jupyter Notebook causes the freeze, and `PhenoGraph` becomes unresponsive unless the kernel is restarted. When this line of code is removed, everything returns to normal and the Jupyter Notebook stops crashing on repeated runs of `PhenoGraph`.
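
For reference, a minimal sketch of the pattern described above (hypothetical data and variable names; the freeze was reported only inside Jupyter on Catalina):

```python
import matplotlib.pyplot as plt
import numpy as np
import phenograph
from sklearn.decomposition import PCA

data = np.random.rand(5000, 50)                # cells x features
pcs = PCA(n_components=2).fit_transform(data)  # principal components
communities, graph, Q = phenograph.cluster(pcs, k=30)

# Plotting the principal components was the step reported to trigger the hang.
plt.scatter(pcs[:, 0], pcs[:, 1], c=communities, s=2)
plt.show()
```
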
### Architecture-related error

* When attempting to process a very large nearest-neighbour graph, _e.g._ a 2,000,000 `x` 2,000,000 kNN graph matrix with 300 nearest neighbours, a `struct.error` is raised:
```python
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
```
This issue was reported on [stackoverflow](https://stackoverflow.com/questions/47776486/python-struct-error-i-format-requires-2147483648-number-2147483647) and is related to multiprocessing while building the Jaccard object. The `struct.error` has been fixed in Python >= 3.8.0.
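
A minimal guard one might add to a workflow (hypothetical, not part of the package) to flag this limit on older interpreters:

```python
import sys

# multiprocessing pickles the Jaccard payload; payloads over ~2 GiB overflow the
# 32-bit length field and raise struct.error on Python < 3.8.
if sys.version_info < (3, 8):
    print(
        "Warning: very large kNN graphs may raise struct.error during "
        "multiprocessing; consider upgrading to Python >= 3.8."
    )
```
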
### `leidenalg` inside a conda environment

* When running `PhenoGraph` inside a conda environment, `leiden` takes longer to complete on larger samples than it does with the system Python.

64 changes: 47 additions & 17 deletions phenograph/cluster.py
@@ -1,7 +1,12 @@
import multiprocessing as mp
import os
import re
import time
import uuid
from typing import Union, Optional, Type

import leidenalg
import igraph as ig
import leidenalg
import numpy as np
from leidenalg.VertexPartition import MutableVertexPartition
from scipy import sparse as sp
@@ -16,10 +21,21 @@
graph2binary,
runlouvain,
)
import time
import re
import os
import uuid


def chunk_clusters(cl):
for i in range(0, np.unique(cl).size, 5000):
yield np.unique(cl)[i : i + 5000]


def yield_clusters(cl, ch):
for i in ch:
yield cl == i


def get_sizes(func, args):
results = func(*args)
return [np.count_nonzero(res) for res in results]


def sort_by_size(clusters, min_size):
@@ -31,14 +47,20 @@ def sort_by_size(clusters, min_size):
:param min_size:
:return: relabeled
"""
relabeled = np.zeros(clusters.shape, dtype=np.int)
sizes = [sum(clusters == x) for x in np.unique(clusters)]
p = mp.Pool(mp.cpu_count())
sizes = []
ch_clust = chunk_clusters(clusters)
TASKS = [(yield_clusters, (clusters, i)) for i in ch_clust]
results = [p.apply_async(get_sizes, t) for t in TASKS]
for res in results:
sizes.extend(res.get())

o = np.argsort(sizes)[::-1]
for i, c in enumerate(o):
if sizes[c] > min_size:
relabeled[clusters == c] = i
else:
relabeled[clusters == c] = -1
my_dict = {c: i for i, c in enumerate(o) if sizes[c] > min_size}
my_dict.update({c: -1 for i, c in enumerate(o) if sizes[c] <= min_size})

relabeled = np.vectorize(my_dict.get)(clusters)

return relabeled

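As a quick sanity check (illustrative only, not part of the commit), the relabeling maps the largest cluster to 0, the next largest to 1, and clusters of size `min_size` or smaller to -1; this assumes the labels are the consecutive integers produced by Louvain/Leiden:

```python
import numpy as np
from phenograph.cluster import sort_by_size

labels = np.array([2, 2, 2, 2, 0, 0, 1])  # cluster 2 has 4 cells, 0 has 2, 1 has 1
print(sort_by_size(labels, min_size=1))   # -> [ 0  0  0  0  1  1 -1]
```
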

@@ -77,8 +99,7 @@ def cluster(
The graph construction process produces a directed graph, which is symmetrized
by one of two methods (see below)
:param prune: Whether to symmetrize by taking the average (prune=False) or product
(prune=True) between the graph
and its transpose
(prune=True) between the graph and its transpose
:param min_cluster_size: Cells that end up in a cluster smaller than
min_cluster_size are considered outliers and are assigned to -1 in the cluster
labels
@@ -183,8 +204,12 @@ def cluster(
uid = uuid.uuid1().hex
graph2binary(uid, graph)
communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

print("Sorting communities by size, please wait ...", flush=True)
communities = sort_by_size(communities, min_cluster_size)

print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

# clean up
for f in os.listdir():
if re.search(uid, f):
@@ -196,7 +221,7 @@
edgelist = np.vstack(graph.nonzero()).T.tolist()
g = ig.Graph(max(graph.shape), edgelist, directed=directed)
# set vertices as weights
g.es["weights"] = graph.toarray()[graph.nonzero()]
g.es["weights"] = graph.data

kargs = dict()
if not partition_type:
@@ -218,9 +243,14 @@
"Leiden completed in {} seconds".format(time.time() - tic_), flush=True,
)
communities = np.asarray(communities.membership)
print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

print("Sorting communities by size, please wait ...", flush=True)
communities = sort_by_size(communities, min_cluster_size)

print(
"PhenoGraph completed in {} seconds".format(time.time() - tic), flush=True
)

else:
# return only graph object
pass
2 changes: 1 addition & 1 deletion phenograph/version.py
@@ -1,3 +1,3 @@
__version__ = "1.5.3"
__version__ = "1.5.4"
__author__ = "Jacob Levine"
__email__ = "[email protected]"
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
leidenalg >= 0.7.0
setuptools >= 18.0.1
numpy >= 1.9.2
numpy >= 1.12
scipy >= 0.16.0
scikit_learn >= 0.17
psutil > 4
