diff --git a/docs/source/conf.py b/docs/source/conf.py index 5e27fdb..842508e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -68,10 +68,10 @@ # html_theme = "scanpydoc" html_theme = "sphinx_rtd_theme" html_static_path = ["_static"] -html_theme_options = { - "repository_url": repository_url, - "use_repository_button": True, -} +# html_theme_options = { # Meeds sphinx-book-theme or scanpydoc +# "repository_url": repository_url, +# "use_repository_button": True, +# } html_show_sphinx = False # html_logo = "_static/img/Scanpy_Logo_BrightFG.svg" html_title = "scdemon" diff --git a/notebooks/example.py b/notebooks/example.py index 3818fb6..a64b6cf 100644 --- a/notebooks/example.py +++ b/notebooks/example.py @@ -1,9 +1,6 @@ #!/usr/bin/env python3 """Example code for using scdemon part of the library.""" -# --------------------------------------------------------------------- # Example - compute co-expression modules using a scanpy anndata object -# Updated: 04/08/24 -# --------------------------------------------------------------------- import os import logging import numpy as np diff --git a/pyproject.toml b/pyproject.toml index 6f61628..cf9a6a1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,9 +24,7 @@ dependencies = [ # Added from original modules, can prune: "scanpy", "anndata", "seaborn", "matplotlib", - "gprofiler-official", - "igraph", "adjustText", - "numba" + "igraph", "numba" ] [project.urls] @@ -35,7 +33,7 @@ Source = "https://github.com/kellislab/scdemon" Home-page = "https://github.com/kellislab/scdemon" [project.optional-dependencies] -test = ["pytest", "pytest-cov", "statsmodels", "anndata"] +test = ["adjustText", "gprofiler-official"] [tool.scikit-build] wheel.expand-macos-universal-tags = true @@ -45,9 +43,6 @@ packages = ["scdemon", "scdemon.utils", "scdemon.plotting", "scdemon.graph", "scdemon.data"] -[tool.pytest.ini_options] -python_files = "test_*.py" - [tool.coverage.run] branch = true diff --git a/scdemon/graph/adjacency_matrix.py b/scdemon/graph/adjacency_matrix.py index 5f90bda..b5521ec 100644 --- a/scdemon/graph/adjacency_matrix.py +++ b/scdemon/graph/adjacency_matrix.py @@ -17,12 +17,9 @@ class adjacency_matrix(object): + """\ + Process given correlation matrix into the adjacency matrix. """ - Turn a correlation matrix into an adjacency matrix. - - Process given correlation matrix into the adjacency matrix. - """ - def __init__( self, corr, @@ -40,7 +37,46 @@ def __init__( scale=None, degree_cutoff=0 ): - """Initialize adjacency matrix class.""" + """\ + Initialize adjacency matrix class. + + + Parameters + ---------- + corr : np.array | sparse.csr_matrix + Gene-gene correlation matrix + adjacency : np.array + Adjacency matrix + method : str + Thresholding method for graph: + + ``'bivariate'`` + default, threshold based on bivariate spline fit to gene-gene sparsity + ``'cutoff'`` + single cutoff across full matrix + ``'sd'`` + based on estimated sd, expensive + corr_sd : np.array + Estimated standard deviation of correlations. Only used for ``sd`` method + labels : np.array + Labels for nodes (genes) + margin : np.array + Fraction non-zero values for each of the variables in the original dataset (gene sparsity) + cutoff : float + Raw correlation threshold + z : float + Z-score threshold + zero_outliers : bool + Whether to set outliers to 0 when calculating splines + keep_all_z : bool + Whether to keep z-score matrix dense by removing all values below threshold + knn_k : int + Pruning: keep only the top k edges per node + scale : float + Pruning: remove edges below a certain fraction (in (0.0, 1.0]) of the top edge for the node + degree_cutoff : int + Pruning: remove graph nodes with degree below this cutoff + """ self.corr = corr self.adjacency = adjacency self.method = method diff --git a/scdemon/graph/gene_graph.py b/scdemon/graph/gene_graph.py index da1604d..6aeb914 100644 --- a/scdemon/graph/gene_graph.py +++ b/scdemon/graph/gene_graph.py @@ -1,9 +1,5 @@ #!/usr/bin/env python3 -"""Gene graph handling.""" -# -------------------------------- -# Class for handling a gene graph: -# Updated: 06/28/21 -# -------------------------------- +"""Class for handling a gene-gene graph.""" from ..utils import vcorrcoef from ..data import snap_colors from .utils_community_detection import ( @@ -11,19 +7,17 @@ get_modules_from_partition, set_module_colors ) - from .utils_pruning import prune_graph_components, prune_graph_degree import logging import numpy as np import pandas as pd -from scipy import sparse - -# For graphs -import igraph class gene_graph(object): + """\ + Gene-gene graph + """ def __init__(self, corr, genes, @@ -32,6 +26,26 @@ def __init__(self, edge_weight=None, layout_method="fr", min_size=4): + """\ + Initialize gene-gene graph given adjacency matrix object or pre-computed graph + + Parameters + ---------- + corr : np.array | sparse.csr_matrix + Gene-gene correlation matrix + genes : np.array + List of genes + adj : adjacency_matrix + Adjacency matrix object + graph : igraph.Graph + Pre-computed graph (used for multigraph clustering) + edge_weight : float + Edge weight for graphs, if want a fixed weight + layout_method : str + Layout method for ``.layout()`` on ``igraph`` object + min_size : int + Minimum size of a graph component, for pruning + """ self.corr = corr self.genes = genes self.adj = adj # Adjacency matrix object @@ -59,10 +73,24 @@ def __init__(self, self.scores = {} self.module_match = {} - # TODO: Add options for different methods of module detection here: def construct_graph(self, resolution=2, method='leiden', full_graph=False, modules=True, layout=True): - """Construct the graph object and find gene modules.""" + """\ + Construct the graph object and find gene modules. + + Parameters + ---------- + resolution : float + Resolution for clustering graph into modules + method : str + Method for clustering modules. Only ``'leiden'`` currently implemented + full_graph : bool + Create full graph or threshold low values + modules : bool + Whether to calculate gene modules + layout : bool + Whether to lay out the graph + """ # Building adjacency > graph > modules, handled by graph object: if self.graph is None: # 5. Build adjacency matrix @@ -71,7 +99,7 @@ def construct_graph(self, resolution=2, method='leiden', # 6. Create k-NN (thresholding) from adjacency # Note: full adjacency graph is for multi-graph purposes. self._make_graph(full=full_graph) - # TODO: check if modules is None, also remake if made new graph + # NOTE: check if modules is None, also remake if made new graph if modules: # 7. Perform community detection (NMF, BigClam, Leiden) self.calculate_gene_modules( @@ -80,9 +108,16 @@ def construct_graph(self, resolution=2, method='leiden', # 7a. Layout graph self.layout_graph(layout_method=self.layout_method) - # TODO: Decide if arguments feed in or keep everything in self. def _make_graph(self, full=False): - """Make graph object from adjacency.""" + """\ + Make graph object from adjacency. + + Parameters + ---------- + full : bool + Whether to create full graph or to threshold low values and prune graph. Full graph is used for multigraph + """ + import igraph logging.info("Making graph object from adjacency.") if full: # Use the full adjacency before adjacency pruning: @@ -113,7 +148,14 @@ def _make_graph(self, full=False): " nodes, out of " + str(self.corr.shape[0])) def layout_graph(self, layout_method='fr'): - """Compute graph layout.""" + """\ + Compute graph layout. + + Parameters + ---------- + layout_method + Layout method for ``.layout()`` on ``igraph`` object + """ logging.info("Laying out graph, method=" + str(layout_method)) if layout_method == "fr": self.layout = self.graph.layout_fruchterman_reingold( @@ -122,21 +164,27 @@ def layout_graph(self, layout_method='fr'): self.layout = self.graph.layout(layout_method) def calculate_gene_modules(self, method="leiden", **kwargs): - """Calculate modules from gene-gene graph using graph clustering.""" + """\ + Calculate modules from gene-gene graph using graph clustering. + + Parameters + ---------- + method : str + Method for clustering modules. Only ``'leiden'`` currently implemented + **kwargs + Arguments for calculating modules using given method + """ logging.info("Running module detection using method=" + str(method)) - # TODO: fix assigning method for factors if method == 'leiden': self.assign[method] = calculate_gene_modules_leiden( self.graph, **kwargs) else: - # TODO: implement louvain, resolution seems more tunable? + # NOTE: Could also implement louvain, HDBSCAN, etc raise ValueError("Method '%s' not found" % method) - # TODO: fix color for factors, maybe just store one color per module self.colors[method] = set_module_colors( self.assign[method], self.snapcols) - # TODO: put full handling of multigraph in gene_graph, including graphlist def _partition_multigraph(self, graph, graphlist, membership, method = 'leiden'): # Turn multiplex partition into modules @@ -151,7 +199,6 @@ def _partition_multigraph(self, graph, graphlist, membership, self.colors[method] = set_module_colors( self.assign[method], self.snapcols) - # TODO: Allow compute on adjusted adjacency def compute_umap(self): """Calculate UMAP for correlation estimate underlying graph.""" import umap @@ -167,7 +214,22 @@ def compute_umap(self): self.umat = uw.fit_transform(corr_gene_subset) def get_modules(self, attr="leiden", adata=None, print_modules=False): - """Get list of modules from graph and clustering.""" + """\ + Get list of modules from graph and clustering. + + Parameters + ---------- + attr : str + Modules name within the graph ('leiden' is only current supported method) + adata : AnnData + AnnData object (needed if modules haven't been populated) + print_modules : bool + Whether to print modules as well + + Returns + ------- + Dictionary of modules with lists of assigned genes + """ if attr not in self.modules.keys(): # Construct object if necessary: self.populate_modules(adata, attr) @@ -187,7 +249,22 @@ def get_module_assignment(self, attr="leiden", adata=None): return mdf def find_gene(self, gene, attr='leiden', print_genes=False): - """Find module corresponding to a gene.""" + """\ + Find the module containing a specific gene. + + Parameters + ---------- + gene : str + Gene to look up in the modules + attr : str + Modules name within the graph ('leiden' is only current supported method) + print_genes: bool + Whether to print the list of genes in the module + + Returns + ------- + List of genes in the module that contains the query gene + """ mkey = None for key in self.modules[attr].keys(): if gene in self.modules[attr][key]: @@ -202,19 +279,25 @@ def find_gene(self, gene, attr='leiden', print_genes=False): print(mkey, " ".join(out)) return out - # TODO: Speed this up (possibly avg. by tform multiplication) - # Modules lists functions (requires external adata or X): - # TODO: Update to use X only, not adata.X (fix subsetting) def populate_modules(self, adata, attr='leiden'): - """Populate modules data.""" + """\ + Populate modules data, including average expression across cells and the top genes. + + Parameters + ---------- + adata : AnnData + Single-cell dataset, if need to populate modules + attr : str + Modules name within the graph ('leiden' is only current supported method) + """ + from scipy.sparse import issparse + logging.info("Populating modules data") if adata is None: raise TypeError("adata is None, need adata to populate modules") - logging.info("Populating modules data") - # TODO: make orderedDict for the module names? partition = self.assign[attr] nam = np.array(self.graph.vs["name"]) modules = np.unique(partition) - issp = sparse.issparse(adata.X) # TODO: get this into graph class + issp = issparse(adata.X) # Initialize: self.modules[attr] = {} self.mnames[attr] = [""] * (np.max(modules) + 1) @@ -237,7 +320,6 @@ def populate_modules(self, adata, attr='leiden'): self.mnames[attr][i] = "M" + str(i) + \ " (" + ngstr + "): " + topgenes - # TODO: Speed this up - sort step can be slow def match_genes_to_modules(self, attr='leiden'): """Match genes to the closest to module by its correlation.""" logging.info("Matching genes to modules") @@ -260,9 +342,21 @@ def match_genes_to_modules(self, attr='leiden'): self.module_match[attr] = np.argmax(self.module_match[attr], 1) # Add save functions here (graph and full object as well): - def save_modules(self, attr="leiden", as_df=True, - filename=None, adata=None): - """Save module list as txt or tsv.""" + def save_modules(self, filename, attr="leiden", as_df=True, adata=None): + """\ + Save module list for this specific graph as txt or tsv. + + Parameters + ---------- + filename : str + Name for output file, required + attr : str + Modules name within the graph ('leiden' is only current supported method) + as_df : bool + Write out dataframe instead of a raw list of genes per module + adata : AnnData + Single-cell dataset, if need to populate modules + """ if as_df: mdf = pd.DataFrame({"gene": np.array(self.graph.vs["name"]), "leiden": self.assign[attr], diff --git a/scdemon/modules.py b/scdemon/modules.py index 2c92254..dca4c7e 100644 --- a/scdemon/modules.py +++ b/scdemon/modules.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -"""Class for computing modules on adata object.""" - +"""Class for handling modules and graph objects.""" # Internal imports: from .graph import gene_graph, adjacency_matrix from .utils import ( @@ -67,10 +66,6 @@ def __init__(self, adata, Compare the metadata covariates to the PCs. covariate_cutoff : float covariate_cutoff - - Returns - ------- - Object with which to compute gene modules """ # Dataset: self.adata = adata diff --git a/scdemon/plotting/utils_graph_plotting.py b/scdemon/plotting/utils_graph_plotting.py index c7c430d..02db29e 100644 --- a/scdemon/plotting/utils_graph_plotting.py +++ b/scdemon/plotting/utils_graph_plotting.py @@ -5,7 +5,6 @@ import numpy as np import igraph -from adjustText import adjust_text from matplotlib import pyplot as plt @@ -58,6 +57,7 @@ def _add_labels(labels, layout, ax, adjust_labels=True, fontsize=12): ha="center", va="center", fontsize=fontsize) for i in keptind] if adjust_labels: + from adjustText import adjust_text adjust_text(text, lim=25) diff --git a/scdemon/utils/correlation.py b/scdemon/utils/correlation.py index 5cbf4c7..7d333e9 100644 --- a/scdemon/utils/correlation.py +++ b/scdemon/utils/correlation.py @@ -5,8 +5,9 @@ def calculate_margin_genes(X): + from scipy.sparse import issparse margin = np.mean(X > 0, axis=0).copy() - if sparse.issparse(X): + if issparse(X): margin = np.array(margin)[0] return(margin) diff --git a/scdemon/utils/go_enrichments.py b/scdemon/utils/go_enrichments.py index fde8427..8cfdb8a 100644 --- a/scdemon/utils/go_enrichments.py +++ b/scdemon/utils/go_enrichments.py @@ -10,7 +10,7 @@ def get_goterms(obj, graph_id, attr="leiden", organism="hsapiens", # NOTE: Needs to be able to connect to internet to run # TODO: Add exception/failure mode if not able to connect obj.gp = GProfiler(return_dataframe=True) - mlist = obj.get_modules(graph_id) + mlist = obj.get_modules(graph_id, attr=attr, print_modules=False) obj.gpres = {} for ll in mlist.keys(): testlist = mlist[ll].tolist()