From d2a7719641944e3b76df961aaed43224b39c9c04 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Wed, 19 Apr 2023 14:49:00 +0100
Subject: [PATCH 1/2] add kegg enrichment function

---
 anoexpress/anoexpress.py | 46 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/anoexpress/anoexpress.py b/anoexpress/anoexpress.py
index c1e5f1a..bf4470d 100644
--- a/anoexpress/anoexpress.py
+++ b/anoexpress/anoexpress.py
@@ -477,6 +477,52 @@ def pfam_hypergeometric(analysis, name, func, percentile=0.05):
         
     return(hyper_geo)
 
+def kegg_hypergeometric(analysis, name, func, percentile=0.05):
+    """
+    Perform a hypergeometric test on KEGG pathways of the top % percentile genes ranked by user input function.
+
+    Parameters
+    ----------
+    analysis: {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
+      which analysis to load gene expression data for. analyses with more species will have fewer genes
+      present, due to the process of finding orthologs.
+    name: str
+      name of the function to rank genes by
+    func: function
+      function to rank genes by (such as np.nanmedian, np.nanmean)
+    percentile: float, optional
+      fraction of ranked genes (0.05 = top 5%) to use as the enriched set in the hypergeometric test. Defaults to 0.05
+
+    Returns
+    -------
+    kegg_hypergeo_results: pd.DataFrame, hypergeometric results left-merged with the KEGG pathway descriptions
+    """
+
+    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/ano-express/main/results/fcs.{analysis}.tsv", sep="\t")
+    fc_genes = fc_data.reset_index()['GeneID'].to_list()
+
+    # take the top `percentile` fraction of the genes ranked by func as the target set
+    fc_ranked = load_candidates(analysis=analysis, name=name, func=func)
+    percentile_idx = fc_ranked.reset_index()['GeneID'].unique().shape[0] * percentile
+    top_geneIDs = fc_ranked.reset_index().loc[:, 'GeneID'][:int(percentile_idx)]  # NOTE(review): cutoff counts unique GeneIDs but the slice is over rows - assumes one row per gene, confirm
+
+    # load the KEGG annotation table (columns used: GeneID, kegg_pathway, description)
+    kegg_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/ano-express/main/resources/AgamP4.kegg", sep="\t")
+    kegg_annotations = kegg_df[['kegg_pathway', 'description']].rename(columns={'kegg_pathway':'annotation'}).drop_duplicates()
+    kegg_df = kegg_df[['GeneID', 'kegg_pathway']].drop_duplicates()
+    kegg_df = kegg_df.query("GeneID in @fc_genes")  # keep only genes present in the fold-change data
+    N = kegg_df.GeneID.unique().shape[0]  # N: number of distinct genes with at least one KEGG annotation
+    k = np.isin(kegg_df.loc[:, 'GeneID'].unique(), top_geneIDs).sum()  # k: how many of those annotated genes are in the top set
+
+    hyper_geo = _hypergeometric(
+        annotation_df=kegg_df, 
+        column_name='kegg_pathway', 
+        target_gene_list=top_geneIDs,
+        N=N,
+        k=k)    
+    hyper_geo = hyper_geo.merge(kegg_annotations, how='left')  # no `on=` given, so the join uses the shared column names (presumably 'annotation' - confirm against _hypergeometric)
+    return(hyper_geo)
+
 def _hypergeometric(annotation_df, column_name, target_gene_list, N, k):
     """
     This function performs a hypergeometric test on a given annotation column

From dded83546c021e03d7a947455e271d3488c63278 Mon Sep 17 00:00:00 2001
From: Sanjay C Nagi <sanjay.c.nagi@gmail.com>
Date: Wed, 19 Apr 2023 15:04:29 +0100
Subject: [PATCH 2/2] add kegg enrichment function tests

---
 tests/test_anoexpress.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/tests/test_anoexpress.py b/tests/test_anoexpress.py
index 59e439e..7274f31 100644
--- a/tests/test_anoexpress.py
+++ b/tests/test_anoexpress.py
@@ -104,4 +104,22 @@ def test_plot_gene_expression_type(plot_type):
         analysis="gamb_colu", 
         microarray=False, 
         plot_type=plot_type,
-        )
\ No newline at end of file
+        )
+    
+def test_go_hypergeometric():  # GO enrichment on the four-species analysis, genes ranked by median
+    go = xpress.go_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )  # NOTE(review): presumably fetches its inputs over the network like kegg_hypergeometric - confirm
+    assert isinstance(go, pd.DataFrame)
+    assert go.iloc[0,0] == 'GO:0042302' # pin the value in the first row, first column of the result
+
+def test_pfam_hypergeometric():  # Pfam enrichment on the four-species analysis, genes ranked by median
+    pfam = xpress.pfam_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )  # NOTE(review): presumably fetches its inputs over the network like kegg_hypergeometric - confirm
+    assert isinstance(pfam, pd.DataFrame)
+    assert pfam.iloc[0,0] == 'C_tripleX' # pin the value in the first row, first column of the result
+
+def test_kegg_hypergeometric():  # KEGG enrichment on the four-species analysis, genes ranked by median
+    kegg = xpress.kegg_hypergeometric(analysis='gamb_colu_arab_fun', name="median", func=np.nanmedian
+        )  # NOTE: kegg_hypergeometric reads its tables from raw.githubusercontent.com, so this needs network access
+    assert isinstance(kegg, pd.DataFrame)
+    assert kegg.iloc[0,0] == 'aga00982' # pin the value in the first row, first column; depends on the hosted data
\ No newline at end of file