
Merge pull request #139 from JonathanShor/cluster_opts
Dev 4.0
JonathanShor authored Mar 12, 2022
2 parents d52af48 + 516d90d commit d25393d
Showing 5 changed files with 126 additions and 52 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -54,7 +54,9 @@ The classifier works best when

In `v2.5` we have added a new experimental clustering method (`scanpy`'s Louvain clustering) that is much faster than phenograph. We are still validating results from this new clustering. Please see the notebook below for an example of using this new feature.

See our [jupyter notebook](https://nbviewer.jupyter.org/github/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb) for an example on 8k PBMCs from 10x.
## Tutorial

See our [jupyter notebook](https://nbviewer.jupyter.org/github/JonathanShor/DoubletDetection/blob/master/tests/notebooks/PBMC_10k_vignette.ipynb) for an example on 10k PBMCs from 10x Genomics.

## Obtaining data

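For orientation, here is a minimal end-to-end sketch of the updated API, mirroring the calls exercised in the updated test suite at the bottom of this diff; the Poisson counts matrix is a toy stand-in for real 10x data:

```python
import numpy as np
import doubletdetection

# Toy cells-by-genes count matrix; real data would come from a 10x run.
counts = np.random.poisson(size=(500, 100))

# "louvain" selects scanpy's Louvain implementation; standard scaling is
# recommended when not using Phenograph (see the docstring changes below).
clf = doubletdetection.BoostClassifier(
    n_iters=2, clustering_algorithm="louvain", standard_scaling=True
)
labels = clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores = clf.doublet_score()
```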
112 changes: 74 additions & 38 deletions doubletdetection/doubletdetection.py
@@ -29,12 +29,14 @@ class BoostClassifier:
use; other genes discarded. Will use all genes when zero.
replace (bool, optional): If False, a cell will be selected as a
synthetic doublet's parent no more than once.
use_phenograph (bool, optional): Set to False to disable PhenoGraph clustering
in exchange for louvain clustering implemented in scanpy. Defaults to True.
phenograph_parameters (dict, optional): Parameter dict to pass directly
to PhenoGraph. Note that we change the PhenoGraph 'prune' default to
True; you must specifically include 'prune': False here to change
this. Only used when use_phenograph is True.
clustering_algorithm (str, optional): One of `["louvain", "leiden",
"phenograph"]`. `"louvain"` and `"leiden"` refer to the scanpy implementations.
clustering_kwargs (dict, optional): Keyword args to pass directly
to the clustering algorithm. Note that we change the PhenoGraph 'prune' default to
True. We also set `directed=False` and `resolution=4` for Louvain
and Leiden clustering. You must specifically include these params here
to change them. `random_state` and `key_added` should not be overridden
when the clustering algorithm is Louvain or Leiden.
n_iters (int, optional): Number of fit operations from which to collect
p-values. Default value is 10.
normalizer ((sp_sparse) -> ndarray): Method to normalize raw_counts.
@@ -52,14 +52,12 @@ class BoostClassifier:
standard_scaling (bool, optional): Set to True to enable standard scaling
of normalized count matrix prior to PCA. Recommended when not using
Phenograph. Defaults to False.
n_jobs (int, optional): Number of jobs to use. Speeds up neighbor computation.
Attributes:
all_log_p_values_ (ndarray): Hypergeometric test natural log p-value per
cell for cluster enrichment of synthetic doublets. Shape (n_iters,
num_cells).
all_p_values_ (ndarray): DEPRECATED. Exponentiated all_log_p_values.
Due to floating point rounding errors, use of all_log_p_values recommended.
Will be removed in v3.0.
cell for cluster enrichment of synthetic doublets. Use for thresholding.
Shape (n_iters, num_cells).
all_scores_ (ndarray): The fraction of a cell's cluster that is
synthetic doublets. Shape (n_iters, num_cells).
communities_ (ndarray): Cluster ID for corresponding cell. Shape
@@ -83,22 +83,31 @@ def __init__(
n_components=30,
n_top_var_genes=10000,
replace=False,
use_phenograph=True,
phenograph_parameters={"prune": True},
n_iters=25,
clustering_algorithm="phenograph",
clustering_kwargs=None,
n_iters=10,
normalizer=None,
random_state=0,
verbose=False,
standard_scaling=False,
n_jobs=1,
):
self.boost_rate = boost_rate
self.replace = replace
self.use_phenograph = use_phenograph
self.clustering_algorithm = clustering_algorithm
self.n_iters = n_iters
self.normalizer = normalizer
self.random_state = random_state
self.verbose = verbose
self.standard_scaling = standard_scaling
self.n_jobs = n_jobs

if self.clustering_algorithm not in ["louvain", "phenograph", "leiden"]:
raise ValueError(
"Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']"
)
if self.clustering_algorithm == "leiden":
warnings.warn("Leiden clustering is experimental and results have not been validated.")

if self.random_state:
np.random.seed(self.random_state)
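The new constructor validation above can be exercised directly; a small sketch, assuming only the behavior shown in this hunk:

```python
import warnings
import doubletdetection

# An unrecognized algorithm fails fast with a ValueError.
try:
    doubletdetection.BoostClassifier(clustering_algorithm="kmeans")
except ValueError as err:
    print(err)  # Clustering algorithm needs to be one of ['louvain', 'phenograph', 'leiden']

# Leiden is accepted but flagged as experimental via a UserWarning.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    doubletdetection.BoostClassifier(clustering_algorithm="leiden")
    print(caught[-1].message)
```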
@@ -111,16 +120,10 @@ def __init__(
# Floor negative n_top_var_genes at 0
self.n_top_var_genes = max(0, n_top_var_genes)

if use_phenograph is True:
if "prune" not in phenograph_parameters:
phenograph_parameters["prune"] = True
self.phenograph_parameters = phenograph_parameters
if (self.n_iters == 1) and (phenograph_parameters.get("prune") is True):
warn_msg = (
"Using phenograph parameter prune=False is strongly recommended when "
+ "running only one iteration. Otherwise, expect many NaN labels."
)
warnings.warn(warn_msg)
self.clustering_kwargs = (
{} if not isinstance(clustering_kwargs, dict) else clustering_kwargs
)
self._set_clustering_kwargs()

if not self.replace and self.boost_rate > 0.5:
warn_msg = (
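The `clustering_kwargs` plumbing above replaces the old `phenograph_parameters` dict; a hedged usage sketch (parameter values are illustrative, not recommendations):

```python
import doubletdetection

# Phenograph: 'prune' defaults to True unless explicitly overridden.
clf = doubletdetection.BoostClassifier(
    clustering_algorithm="phenograph",
    clustering_kwargs={"prune": False},  # advisable when n_iters == 1
)

# Louvain/Leiden: 'directed' and 'resolution' are filled in with defaults;
# passing 'key_added' or 'random_state' here raises a ValueError instead.
clf = doubletdetection.BoostClassifier(
    clustering_algorithm="leiden",
    clustering_kwargs={"resolution": 2},
)
```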
@@ -143,7 +146,7 @@ def fit(self, raw_counts):
raw_counts (array-like): Count matrix, oriented cells by genes.
Sets:
all_scores_, all_p_values_, all_log_p_values_, communities_,
all_scores_, all_log_p_values_, communities_,
top_var_genes, parents, synth_communities
Returns:
@@ -163,6 +166,9 @@ def fit(self, raw_counts):
print("Sparsifying matrix.")
raw_counts = csr_matrix(raw_counts)

old_n_jobs = sc.settings.n_jobs
sc.settings.n_jobs = self.n_jobs

if self.n_top_var_genes > 0:
if self.n_top_var_genes < raw_counts.shape[1]:
gene_variances = (
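Earlier in this hunk, `fit()` threads `n_jobs` through by temporarily overriding scanpy's global setting, restoring it before returning (see the reset further down). The same pattern can be captured in a context manager; this helper is a sketch, not part of the package:

```python
from contextlib import contextmanager

import scanpy as sc

@contextmanager
def scanpy_n_jobs(n_jobs):
    """Temporarily override sc.settings.n_jobs, restoring the old value on exit."""
    old_n_jobs = sc.settings.n_jobs
    sc.settings.n_jobs = n_jobs
    try:
        yield
    finally:
        sc.settings.n_jobs = old_n_jobs
```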
@@ -200,17 +206,17 @@ def fit(self, raw_counts):

# Release unneeded large data vars
del self._raw_counts
del self._norm_counts
del self._raw_synthetics
del self._synthetics
if self.normalizer is None:
del self._normed_raw_counts
del self._lib_size

# reset scanpy n_jobs
sc.settings.n_jobs = old_n_jobs

self.communities_ = all_communities
self.parents_ = all_parents
self.synth_communities_ = all_synth_communities
self.all_p_values_ = np.exp(self.all_log_p_values_)

return self
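With the `all_p_values_` attribute removed above, downstream code that needs plain p-values can exponentiate the retained log values itself; a minimal sketch assuming a fitted classifier `clf`:

```python
import numpy as np

# Shape (n_iters, num_cells); strongly enriched clusters can underflow
# to 0.0 after exponentiation, which is why the log form is retained.
p_values = np.exp(clf.all_log_p_values_)
```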

@@ -291,10 +297,7 @@ def _one_fit(self):
normed_synths = self._raw_synthetics.copy()
inplace_csr_row_normalize_l1(normed_synths)
aug_counts = sp_sparse.vstack((self._normed_raw_counts, normed_synths))
aug_counts = np.log(aug_counts.A * np.median(aug_lib_size) + 0.1)

self._norm_counts = aug_counts[: self._num_cells]
self._synthetics = aug_counts[self._num_cells :]
aug_counts = np.log((aug_counts * np.median(aug_lib_size)).A + 0.1)

aug_counts = anndata.AnnData(aug_counts)
aug_counts.obs["n_counts"] = aug_lib_size
@@ -306,24 +309,33 @@ def _one_fit(self):
sc.tl.pca(aug_counts, n_comps=self.n_components, random_state=self.random_state)
if self.verbose:
print("Clustering augmented data set...\n")
if self.use_phenograph:
if self.clustering_algorithm == "phenograph":
f = io.StringIO()
with redirect_stdout(f):
fullcommunities, _, _ = phenograph.cluster(
aug_counts.obsm["X_pca"], **self.phenograph_parameters
aug_counts.obsm["X_pca"], n_jobs=self.n_jobs, **self.clustering_kwargs
)
out = f.getvalue()
if self.verbose:
print(out)
else:
if self.clustering_algorithm == "louvain":
clus = sc.tl.louvain
else:
clus = sc.tl.leiden
sc.pp.neighbors(
aug_counts,
random_state=self.random_state,
method="umap",
n_neighbors=10,
)
sc.tl.louvain(aug_counts, random_state=self.random_state, resolution=4, directed=False)
fullcommunities = np.array(aug_counts.obs["louvain"], dtype=int)
clus(
aug_counts,
key_added="clusters",
random_state=self.random_state,
**self.clustering_kwargs,
)
fullcommunities = np.array(aug_counts.obs["clusters"], dtype=int)
min_ID = min(fullcommunities)
self.communities_ = fullcommunities[: self._num_cells]
self.synth_communities_ = fullcommunities[self._num_cells :]
@@ -352,7 +364,7 @@ def _one_fit(self):
i: hypergeom.logsf(
synth_cells_per_comm[i],
aug_counts.shape[0],
self._synthetics.shape[0],
normed_synths.shape[0],
synth_cells_per_comm[i] + orig_cells_per_comm[i],
)
for i in community_IDs
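The hunk above scores each cluster with `scipy.stats.hypergeom.logsf(k, M, n, N)`: population `M` is all cells plus synthetics, `n` the total synthetic count, `N` the cluster size, and `k` the synthetics observed in that cluster. A worked example with made-up sizes:

```python
from scipy.stats import hypergeom

M = 1500  # augmented population: 1000 real cells + 500 synthetics
n = 500   # synthetic doublets in the population (normed_synths.shape[0])
N = 60    # members of one cluster (synthetic + original)
k = 30    # synthetics observed in that cluster

# Natural-log survival function: log P(X > k) under the hypergeometric
# null, i.e. how surprising this degree of synthetic enrichment is.
log_p = hypergeom.logsf(k, M, n, N)
```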
@@ -383,3 +395,27 @@ def _createDoublets(self):

self._raw_synthetics = synthetic
self.parents_ = parents

def _set_clustering_kwargs(self):
"""Sets .clustering_kwargs"""
if self.clustering_algorithm == "phenograph":
if "prune" not in self.clustering_kwargs:
self.clustering_kwargs["prune"] = True
if (self.n_iters == 1) and (self.clustering_kwargs.get("prune") is True):
warn_msg = (
"Using phenograph parameter prune=False is strongly recommended when "
+ "running only one iteration. Otherwise, expect many NaN labels."
)
warnings.warn(warn_msg)
else:
if "directed" not in self.clustering_kwargs:
self.clustering_kwargs["directed"] = True
if "resolution" not in self.clustering_kwargs:
self.clustering_kwargs["resolution"] = 4
if "key_added" in self.clustering_kwargs:
raise ValueError("'key_added' param cannot be overriden")
if "random_state" in self.clustering_kwargs:
raise ValueError(
"'random_state' param cannot be overriden. Please use classifier 'random_state'."
)
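Given the defaults filled in by `_set_clustering_kwargs` above, the resulting kwargs can be inspected on a fresh classifier; a sketch (the printed dict assumes the defaults in this diff):

```python
import doubletdetection

clf = doubletdetection.BoostClassifier(clustering_algorithm="louvain")
print(clf.clustering_kwargs)  # {'directed': False, 'resolution': 4}

# Reserved keys are rejected up front rather than silently overridden.
try:
    doubletdetection.BoostClassifier(
        clustering_algorithm="louvain",
        clustering_kwargs={"key_added": "my_key"},
    )
except ValueError as err:
    print(err)  # 'key_added' param cannot be overridden
```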
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -20,7 +20,7 @@ packages = [
{include = "doubletdetection"},
]
readme = "README.md"
version = "3.0"
version = "4.0"

[tool.poetry.dependencies]
anndata = ">=0.6"
27 changes: 18 additions & 9 deletions tests/notebooks/PBMC_10k_vignette.ipynb

Large diffs are not rendered by default.

33 changes: 30 additions & 3 deletions tests/test_package.py
@@ -1,5 +1,5 @@
import numpy as np

import pytest
import doubletdetection


@@ -8,14 +8,41 @@ def test_classifier():
counts = np.random.poisson(size=(500, 100))

# no phenograph
clf = doubletdetection.BoostClassifier(n_iters=2, use_phenograph=False, standard_scaling=True)
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="louvain", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()

# with phenograph
clf = doubletdetection.BoostClassifier(n_iters=2, use_phenograph=True, standard_scaling=True)
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="phenograph", standard_scaling=True
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
clf.doublet_score()

# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores1 = clf.doublet_score()

# test random state
# with leiden
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="leiden", standard_scaling=True, random_state=123
)
clf.fit(counts).predict(p_thresh=1e-16, voter_thresh=0.5)
scores2 = clf.doublet_score()
np.testing.assert_equal(scores1, scores2)

# plotting
doubletdetection.plot.convergence(clf, show=False, p_thresh=1e-16, voter_thresh=0.5)
doubletdetection.plot.threshold(clf, show=False, p_step=6)

# test that an unrecognized clustering algorithm raises a ValueError
with pytest.raises(ValueError):
clf = doubletdetection.BoostClassifier(
n_iters=2, clustering_algorithm="my_clusters", standard_scaling=True
)
