Merge pull request #31 from sanjaynagi/add-kegg-enrichmnent-19-04-23

implement kegg enrichment function
sanjaynagi · Apr 19, 2023 · dc8d21d · dc8d21d
2 parents 6ce53e0 + dded835
commit dc8d21d
Show file tree

Hide file tree

Showing 2 changed files with 65 additions and 1 deletion.
diff --git a/anoexpress/anoexpress.py b/anoexpress/anoexpress.py
@@ -477,6 +477,52 @@ def pfam_hypergeometric(analysis, name, func, percentile=0.05):
 
     return(hyper_geo)
 
+def kegg_hypergeometric(analysis, name, func, percentile=0.05):
+    """
+    Perform a hypergeometric test on KEGG pathways of the top % percentile genes ranked by user input function.
+
+    Parameters
+    ----------
+    analysis: {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
+      which analysis to load gene expression data for. analyses with more species will have fewer genes
+      present, due to the process of finding orthologs.
+    name: str
+      name of the function to rank genes by
+    func: function
+      function to rank genes by (such as np.nanmedian, np.nanmean)
+    percentile: float, optional
+      percentile of genes to use for the enriched set in hypergeometric test. Defaults to 0.05
+
+    Returns
+    -------
+    kegg_hypergeo_results: pd.DataFrame
+    """
+
+    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/ano-express/main/results/fcs.{analysis}.tsv", sep="\t")
+    fc_genes = fc_data.reset_index()['GeneID'].to_list()
+
+    # get top % percentile genes ranked by func
+    fc_ranked = load_candidates(analysis=analysis, name=name, func=func)
+    percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
+    top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)] 
+
+    # load gene annotation file 
+    kegg_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/ano-express/main/resources/AgamP4.kegg", sep="\t")
+    kegg_annotations = kegg_df[['kegg_pathway', 'description']].rename(columns={'kegg_pathway':'annotation'}).drop_duplicates()
+    kegg_df = kegg_df[['GeneID', 'kegg_pathway']].drop_duplicates()
+    kegg_df = kegg_df.query("GeneID in @fc_genes")
+    N = kegg_df.GeneID.unique().shape[0] #Total number of genes with some annotation 
+    k = np.isin(kegg_df.loc[:, 'GeneID'].unique(), top_geneIDs).sum() 
+
+    hyper_geo = _hypergeometric(
+        annotation_df=kegg_df, 
+        column_name='kegg_pathway', 
+        target_gene_list=top_geneIDs,
+        N=N,
+        k=k)    
+    hyper_geo = hyper_geo.merge(kegg_annotations, how='left')
+    return(hyper_geo)
+
 def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
     """
     This function performs a hypergeometric test on a given annotation column

diff --git a/tests/test_anoexpress.py b/tests/test_anoexpress.py
@@ -104,4 +104,22 @@ def test_plot_gene_expression_type(plot_type):
         analysis="gamb_colu", 
         microarray=False, 
         plot_type=plot_type,
-        )
+        )
+
+def test_go_hypergeometric():
+    go = xpress.go_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )
+    assert isinstance(go, pd.DataFrame)
+    assert go.iloc[0,0] == 'GO:0042302' # check first value is correct
+
+def test_pfam_hypergeometric():
+    pfam = xpress.pfam_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )
+    assert isinstance(pfam, pd.DataFrame)
+    assert pfam.iloc[0,0] == 'C_tripleX' # check first value is correct
+
+def test_kegg_hypergeometric():
+    kegg = xpress.kegg_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )
+    assert isinstance(kegg, pd.DataFrame)
+    assert kegg.iloc[0,0] == 'aga00982' # check first value is correct