update documentation

morris-lab · Aug 25, 2021 · 65809f2 · 65809f2
1 parent 3db5d4a
commit 65809f2
Show file tree

Hide file tree

Showing 4 changed files with 145 additions and 30 deletions.
diff --git a/celloracle/applications/development_module.py b/celloracle/applications/development_module.py
@@ -14,6 +14,7 @@
 import numpy as np
 from tqdm.notebook import tqdm
 import matplotlib.pyplot as plt
+import seaborn as sns
 #import h5py
 
 from scipy.stats import wilcoxon
@@ -171,21 +172,73 @@ def calculate_inner_product(self):
 
         # Calculate inner product between the pseudotime-gradient and the perturb-gradient
         self.inner_product = np.array([np.dot(i, j) for i, j in zip(self.flow, self.ref_flow)])
+        self.inner_product_random = np.array([np.dot(i, j) for i, j in zip(self.flow_rndm, self.ref_flow)])
 
 
     def calculate_digitized_ip(self, n_bins=10):
 
         inner_product_df = pd.DataFrame({"score": self.inner_product[~self.mass_filter_simulation],
-                                                "pseudotime": self.pseudotime_on_grid[~self.mass_filter_simulation]})
+                                         "score_randomized_GRN": self.inner_product_random[~self.mass_filter_simulation],
+                                         "pseudotime": self.pseudotime_on_grid[~self.mass_filter_simulation]})
 
 
         bins = _get_bins(inner_product_df.pseudotime, n_bins)
         inner_product_df["pseudotime_id"] = np.digitize(inner_product_df.pseudotime, bins)
 
         self.inner_product_df = inner_product_df
 
-    def get_p_inner_product(self, method="wilcoxon"):
-        return get_p_inner_product(inner_product_df=self.inner_product_df, method=method)
+    def get_negative_PS_p_value(self, pseudotime=None, return_ps_sum=False, plot=False):
+
+        df = self.inner_product_df.copy()
+
+        if pseudotime is not None:
+            pseudotime = [i for i in pseudotime if i in list("0123456789")]
+            df = df[df.pseudotime_id.isin(pseudotime)]
+
+        x = df["score"]
+        y = df["score_randomized_GRN"]
+
+        # Clipping positive value to focus on negative IP
+        x = np.clip(x, -np.inf, 0)
+        y = np.clip(y, -np.inf, 0)
+
+        if plot:
+            sns.distplot(x)
+            sns.distplot(y)
+
+        # Paired non-parametric test with Wilcoxon's runk sum test
+        s, p = wilcoxon(x, y, alternative="less")
+
+        if return_ps_sum:
+            return p, -x.sum(), -y.sum()
+
+        return p
+
+    def get_positive_PS_p_value(self, pseudotime=None, return_ps_sum=False, plot=False):
+        df = self.inner_product_df.copy()
+
+        if pseudotime is not None:
+            pseudotime = [i for i in pseudotime if i in list("0123456789")]
+            df = df[df.pseudotime_id.isin(pseudotime)]
+
+        x = df["score"]
+        y = df["score_randomized_GRN"]
+
+        # Clipping negative value to focus on positive IP
+        x = np.clip(x, 0, np.inf)
+        y = np.clip(y, 0, np.inf)
+
+        if plot:
+            sns.distplot(x)
+            sns.distplot(y)
+
+        # Paired non-parametric test with Wilcoxon's runk sum test
+        s, p = wilcoxon(x, y, alternative="greater")
+
+        if return_ps_sum:
+            return p, -x.sum(), -y.sum()
+
+        return p
 
     def get_sum_of_negative_ips(self):
         return get_sum_of_negative_ips(inner_product_df=self.inner_product_df)
@@ -302,32 +355,6 @@ def _get_bins(array, n_bins):
 
 from scipy.stats import wilcoxon
 
-def get_p_inner_product(inner_product_df, method="wilcoxon"):
-    """
-
-
-    """
-    dizitized_pseudotimes = np.sort(inner_product_df["pseudotime_id"].unique())
-
-    li = []
-    for i in dizitized_pseudotimes:
-        pseudotime_ = inner_product_df[inner_product_df["pseudotime_id"]==i].score.values
-
-        if method == "wilcoxon":
-            _, p_ts = wilcoxon(x=pseudotime_, alternative="two-sided")
-            _, p_greater = wilcoxon(x=pseudotime_, alternative="greater")
-            _, p_less = wilcoxon(x=pseudotime_, alternative="less")
-            mean_ = pseudotime_.mean()
-            median_ = np.median(pseudotime_)
-        #print(i, p)
-        li.append([mean_, median_, p_ts, p_greater, p_less])
-
-    inner_product_summary = \
-        pd.DataFrame(np.array(li),
-                     columns=["ip_mean", "ip_median", "p_twosided", "p_less", "p_greater"],
-                     index=dizitized_pseudotimes)
-
-    return inner_product_summary
 
 
 def get_sum_of_positive_ips(inner_product_df):

diff --git a/celloracle/applications/systematic_analysis_helper.py b/celloracle/applications/systematic_analysis_helper.py
@@ -140,6 +140,94 @@ def wrapper(misc, pseudotime, n_TFs=20):
 
         return interactive_table
 
+    def _interactive_calculate_negative_ps_p_value(self):
+        """
+        8/21/2021. This function is under development
+        Get p-values for negative PS sum score by comparing nPS to randomized GRN nPS.
+        """
+
+        if self.hdf5_info is None:
+            self.get_hdf5_info()
+
+        def wrapper(misc, pseudotime, n_TFs=20):
+            df = self._calculate_negative_ps_p_value(misc=misc, pseudotime=pseudotime)
+
+
+            print(f"Top {n_TFs} in {misc}")
+            display(HTML(df.iloc[:min(n_TFs, df.shape[0])].to_html()))
+
+        interactive_table = interactive(wrapper,
+                               #{'manual': True},
+                               misc=self.hdf5_info["misc_list"],
+                               pseudotime="0,1,2,3,4,5,6,7,8,9",
+                               n_TFs=(5, 50, 1))
+
+        return interactive_table
+
+    def _update_inner_product_df(self, n_bins=10, verbose=True):
+        self.del_attrs()
+
+        gene_misc_lists = self.hdf5_info["gene_misc_lists"]
+
+        li = []
+        if verbose:
+            loop = tqdm(gene_misc_lists)
+        else:
+            loop = gene_misc_lists
+
+        self.del_attrs()
+
+        for gene, misc in loop:
+            self.load_hdf5(gene=gene, misc=misc)
+            self.calculate_inner_product()
+            self.calculate_digitized_ip(n_bins=n_bins)
+            self.dump_hdf5(gene=gene, misc=misc)
+            # Clear memory
+            self.del_attrs()
+
+
+    def _calculate_negative_ps_p_value(self, misc, pseudotime="0,1,2,3,4,5,6,7,8,9", verbose=True):
+
+        """
+        8/21/2021. This function is under development
+        Get p-values for negative PS sum score by comparing nPS to randomized GRN nPS.
+        """
+
+        self.del_attrs()
+
+        gene_lists = self.hdf5_info["gene_list"]
+
+        p_list = []
+        ps_sums = []
+        ps_sum_randoms = []
+        if verbose:
+            loop = tqdm(gene_lists)
+        else:
+            loop = gene_lists
+
+        for gene in loop:
+
+            self.load_hdf5(gene=gene, misc=misc, specify_attributes=["inner_product_df"])
+            if "score_randomized_GRN" not in self.inner_product_df.columns:
+                raise ValueError("please update inner_product_df first")
+            p, ps_sum, ps_sum_random = self.get_negative_PS_p_value(pseudotime=pseudotime, return_ps_sum=True, plot=False)
+            p_list.append(p)
+            ps_sum_randoms.append(ps_sum_random)
+            ps_sums.append(ps_sum)
+
+        # Clear memory
+        self.del_attrs()
+
+        # p-value correction
+        p_corrected = np.clip(np.array(p_list)*len(gene_lists), 0, 1)
+
+        result = pd.DataFrame({"gene": gene_lists, "ps_sum":ps_sums, "ps_sum_random": ps_sum_randoms,
+                               "p": p_list, "p_adj": p_corrected})
+
+        result = result.sort_values("ps_sum", ascending=False).reset_index(drop=True)
+
+        return result
+
 
     def estimate_scale_for_visualization(self, return_result=True):
 

diff --git a/docs/celloracle.pdf b/docs/celloracle.pdf
diff --git a/docs/tutorials/index.rst b/docs/tutorials/index.rst
@@ -39,7 +39,7 @@ Prerequisites
 
 - This tutorial assumes that you have some Python programming experience. In particular, we assume you are familiar with Python data science libraries: jupyter, pandas, and matplotlib.
 
-- Also, this tutorial assume that you are familiar with basic scRNA-seq data analysis. In particular, we assume you have some experience of scRNA-seq analysis using `Scanpy and Anndata <https://scanpy.readthedocs.io/en/stable/>`_ , which is a python toolkit for single-cell analysis. You can use scRNA-seq data processed with `Seurat <https://satijalab.org/seurat/>`_ . But the Seurat data need to be converted into Anndata format in advance to CellOracle analysis. See this  `tutorial  <https://xxx.com>`_ for detail.
+- Also, this tutorial assume that you are familiar with basic scRNA-seq data analysis. In particular, we assume you have some experience of scRNA-seq analysis using `Scanpy and Anndata <https://scanpy.readthedocs.io/en/stable/>`_ , which is a python toolkit for single-cell analysis. You can use scRNA-seq data processed with `Seurat <https://satijalab.org/seurat/>`_ . But the Seurat data need to be converted into Anndata format in advance to CellOracle analysis. See this  `page  <https://morris-lab.github.io/CellOracle.documentation/modules/index.html#command-line-api>`_ for detail.
 
 - CellOracle provides pre-build base-GRN, and it is not necessary to construct custom base-GRN. But if you want to construct custom base-GRN from your scATAC-seq data, we recommend using `Cicero <https://cole-trapnell-lab.github.io/cicero-release/>`_ . In this case, please get used to Cicero, basic scATAC-seq data analysis, and TF motif analysis in advance to start constructing base-GRN.