
Commit

Fix to sort_by_size:
Faster and more efficient sorting of clusters
awnimo committed Apr 29, 2020
1 parent 696747c commit 3128fc9
Showing 4 changed files with 87 additions and 20 deletions.
39 changes: 38 additions & 1 deletion README.md
@@ -49,6 +49,10 @@ If you use PhenoGraph in work you publish, please cite our publication:
Release Notes
-------------

### Version 1.5.4

* Faster and more efficient sorting of clusters by size for large nearest-neighbour graphs, using multiprocessing and faster sorting methods.

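A minimal sketch of the chunked, multiprocessing-based size counting this release introduces (illustrative only; the helper names here are hypothetical, and the actual implementation is `sort_by_size` in `phenograph/cluster.py`, shown in the diff below):

```python
import multiprocessing as mp

import numpy as np


def chunk_labels(labels, chunk_size=5000):
    # Yield the unique cluster labels in bounded chunks so each worker
    # handles a limited slice of the label space.
    uniq = np.unique(labels)
    for i in range(0, uniq.size, chunk_size):
        yield uniq[i : i + chunk_size]


def count_chunk(labels, chunk):
    # Count how many cells carry each label in this chunk.
    return [np.count_nonzero(labels == c) for c in chunk]


if __name__ == "__main__":
    labels = np.random.randint(0, 50, size=1_000_000)
    with mp.Pool(mp.cpu_count()) as pool:
        tasks = [pool.apply_async(count_chunk, (labels, ch)) for ch in chunk_labels(labels)]
        sizes = [s for t in tasks for s in t.get()]
    order = np.argsort(sizes)[::-1]  # cluster indices, largest first
```
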
### Version 1.5.3

* PhenoGraph now supports the [**Leiden**](https://www.nature.com/articles/s41598-019-41695-z) algorithm for community detection.
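
A minimal usage sketch (assuming the `clustering_algo` keyword is the switch that selects the community-detection backend; check the `phenograph.cluster` docstring for the exact signature):

```python
import numpy as np
import phenograph

data = np.random.rand(1000, 20)  # cells x features, e.g. principal components
communities, graph, Q = phenograph.cluster(data, clustering_algo="leiden", k=30)
```
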
@@ -82,4 +86,37 @@ Release Notes

### Version 1.3

* Proper support for Linux.
* Proper support for Linux.

---
Troubleshooting
---------------

### Notebook freezes after repeated runs of PhenoGraph in Jupyter Notebook

* Running `PhenoGraph` repeatedly from a Jupyter Notebook on macOS Catalina (but not Mojave), using Python 3.7.6, causes a hang and the notebook becomes unresponsive, even for a basic matrix of nearest neighbors. The issue was not reproducible from the command-line `Python` interpreter on either Catalina or Mojave, outside of Jupyter Notebook.

It was found that plotting principal components with `matplotlib.pyplot.scatter` in a Jupyter Notebook causes the freeze, and `PhenoGraph` becomes unresponsive unless the kernel is restarted. When this line of code is removed, everything returns to normal and the Jupyter Notebook stops crashing on repeated runs of `PhenoGraph`.
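
For reference, a minimal sketch of the pattern described above (hypothetical data and variable names; the freeze was reported only inside Jupyter on Catalina):

```python
import matplotlib.pyplot as plt
import numpy as np
import phenograph
from sklearn.decomposition import PCA

data = np.random.rand(5000, 50)                # cells x features
pcs = PCA(n_components=2).fit_transform(data)  # principal components
communities, graph, Q = phenograph.cluster(pcs, k=30)

# Plotting the principal components was the step reported to trigger the hang.
plt.scatter(pcs[:, 0], pcs[:, 1], c=communities, s=2)
plt.show()
```
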
### Architecture-related error

* When attempting to process a very large nearest-neighbour graph, _e.g._ a 2,000,000 `x` 2,000,000 kNN graph matrix with 300 nearest neighbours, a `struct.error` is raised:
```python
struct.error: 'i' format requires -2147483648 <= number <= 2147483647
```
This issue was reported on [stackoverflow](https://stackoverflow.com/questions/47776486/python-struct-error-i-format-requires-2147483648-number-2147483647) and is related to multiprocessing while building the Jaccard object. The `struct.error` has been fixed in Python >= 3.8.0.
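
A minimal guard one might add to a workflow (hypothetical, not part of the package) to flag this limit on older interpreters:

```python
import sys

# multiprocessing pickles the Jaccard payload; payloads over ~2 GiB overflow the
# 32-bit length field and raise struct.error on Python < 3.8.
if sys.version_info < (3, 8):
    print(
        "Warning: very large kNN graphs may raise struct.error during "
        "multiprocessing; consider upgrading to Python >= 3.8."
    )
```
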
### `leidenalg` inside a conda environment

* When running `PhenoGraph` inside a conda environment, `leiden` takes longer to complete on larger samples than it does with the system Python.

64 changes: 47 additions & 17 deletions phenograph/cluster.py
@@ -1,7 +1,12 @@
import multiprocessing as mp
import os
import re
import time
import uuid
from typing import Union, Optional, Type

import leidenalg
import igraph as ig
import leidenalg
import numpy as np
from leidenalg.VertexPartition import MutableVertexPartition
from scipy import sparse as sp
@@ -16,10 +21,21 @@
graph2binary,
runlouvain,
)
import time
import re
import os
import uuid


def chunk_clusters(cl):
for i in range(0, np.unique(cl).size, 5000):
yield np.unique(cl)[i : i + 5000]


def yield_clusters(cl, ch):
for i in ch:
yield cl == i


def get_sizes(func, args):
results = func(*args)
return [np.count_nonzero(res) for res in results]


def sort_by_size(clusters, min_size):
@@ -31,14 +47,20 @@ def sort_by_size(clusters, min_size):
:param min_size:
:return: relabeled
"""
relabeled = np.zeros(clusters.shape, dtype=np.int)
sizes = [sum(clusters == x) for x in np.unique(clusters)]
p = mp.Pool(mp.cpu_count())
sizes = []
ch_clust = chunk_clusters(clusters)
TASKS = [(yield_clusters, (clusters, i)) for i in ch_clust]
results = [p.apply_async(get_sizes, t) for t in TASKS]
for res in results:
sizes.extend(res.get())

o = np.argsort(sizes)[::-1]
for i, c in enumerate(o):
if sizes[c] > min_size:
relabeled[clusters == c] = i
else:
relabeled[clusters == c] = -1
my_dict = {c: i for i, c in enumerate(o) if sizes[c] > min_size}
my_dict.update({c: -1 for i, c in enumerate(o) if sizes[c] <= min_size})

relabeled = np.vectorize(my_dict.get)(clusters)

return relabeled

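As a quick sanity check (illustrative only, not part of the commit), the relabeling maps the largest cluster to 0, the next largest to 1, and clusters of size `min_size` or smaller to -1; this assumes the labels are the consecutive integers produced by Louvain/Leiden:

```python
import numpy as np
from phenograph.cluster import sort_by_size

labels = np.array([2, 2, 2, 2, 0, 0, 1])  # cluster 2 has 4 cells, 0 has 2, 1 has 1
print(sort_by_size(labels, min_size=1))   # -> [ 0  0  0  0  1  1 -1]
```
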

@@ -77,8 +99,7 @@ def cluster(
The graph construction process produces a directed graph, which is symmetrized
by one of two methods (see below)
:param prune: Whether to symmetrize by taking the average (prune=False) or product
(prune=True) between the graph
and its transpose
(prune=True) between the graph and its transpose
:param min_cluster_size: Cells that end up in a cluster smaller than
min_cluster_size are considered outliers and are assigned to -1 in the cluster
labels
@@ -183,8 +204,12 @@ def cluster(
uid = uuid.uuid1().hex
graph2binary(uid, graph)
communities, Q = runlouvain(uid, tol=q_tol, time_limit=louvain_time_limit)
print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

print("Sorting communities by size, please wait ...", flush=True)
communities = sort_by_size(communities, min_cluster_size)

print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

# clean up
for f in os.listdir():
if re.search(uid, f):
@@ -196,7 +221,7 @@
edgelist = np.vstack(graph.nonzero()).T.tolist()
g = ig.Graph(max(graph.shape), edgelist, directed=directed)
# set vertices as weights
g.es["weights"] = graph.toarray()[graph.nonzero()]
g.es["weights"] = graph.data

kargs = dict()
if not partition_type:
@@ -218,9 +243,14 @@
"Leiden completed in {} seconds".format(time.time() - tic_), flush=True,
)
communities = np.asarray(communities.membership)
print("PhenoGraph complete in {} seconds".format(time.time() - tic), flush=True)

print("Sorting communities by size, please wait ...", flush=True)
communities = sort_by_size(communities, min_cluster_size)

print(
"PhenoGraph completed in {} seconds".format(time.time() - tic), flush=True
)

else:
# return only graph object
pass
2 changes: 1 addition & 1 deletion phenograph/version.py
@@ -1,3 +1,3 @@
__version__ = "1.5.3"
__version__ = "1.5.4"
__author__ = "Jacob Levine"
__email__ = "[email protected]"
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,6 +1,6 @@
leidenalg >= 0.7.0
setuptools >= 18.0.1
numpy >= 1.9.2
numpy >= 1.12
scipy >= 0.16.0
scikit_learn >= 0.17
psutil > 4
