def kegg_hypergeometric(analysis, name, func, percentile=0.05):
    """
    Perform a hypergeometric test on KEGG pathways of the top % percentile genes ranked by user input function.

    Parameters
    ----------
    analysis: {"gamb_colu", "gamb_colu_arab", "gamb_colu_arab_fun", "fun"}
      which analysis to load gene expression data for. analyses with more species will have less genes
      present, due to the process of finding orthologs.
    name: str
      name of the function to rank genes by
    func: function
      function to rank genes by (such as np.nanmedian, np.nanmean)
    percentile: float, optional
      fraction of the distinct ranked genes to use for the enriched set in the hypergeometric test
      (the gene count is multiplied by this value and truncated to an integer). Defaults to 0.05

    Returns
    -------
    kegg_hypergeo_results: pd.DataFrame
      result of `_hypergeometric` on the 'kegg_pathway' column, left-merged with the
      pathway descriptions from the KEGG annotation file.
    """

    # genes present in the fold-change table for this analysis (the background universe)
    fc_data = pd.read_csv(f"https://raw.githubusercontent.com/sanjaynagi/ano-express/main/results/fcs.{analysis}.tsv", sep="\t")
    fc_genes = fc_data.reset_index()['GeneID'].to_list()

    # get top % percentile genes ranked by func
    # NOTE(review): assumes load_candidates returns genes ordered best-first - confirm
    fc_ranked = load_candidates(analysis=analysis, name=name, func=func)
    ranked_gene_ids = fc_ranked.reset_index()['GeneID']
    n_top = int(ranked_gene_ids.unique().shape[0] * percentile)
    # explicit positional slice: take the first n_top rows regardless of index labels
    top_geneIDs = ranked_gene_ids.iloc[:n_top]

    # load gene annotation file
    kegg_df = pd.read_csv("https://raw.githubusercontent.com/sanjaynagi/ano-express/main/resources/AgamP4.kegg", sep="\t")
    # pathway -> description lookup; the pathway column is renamed to 'annotation' so that
    # the final merge can join on it (presumably the column name _hypergeometric emits - confirm)
    kegg_annotations = (
        kegg_df[['kegg_pathway', 'description']]
        .rename(columns={'kegg_pathway': 'annotation'})
        .drop_duplicates()
    )
    kegg_df = kegg_df[['GeneID', 'kegg_pathway']].drop_duplicates()
    # restrict annotations to genes that are present in the fold-change data
    kegg_df = kegg_df[kegg_df['GeneID'].isin(fc_genes)]

    annotated_genes = kegg_df['GeneID'].unique()
    N = annotated_genes.shape[0]  # Total number of genes with some annotation
    k = np.isin(annotated_genes, top_geneIDs).sum()  # annotated genes that are in the top set

    hyper_geo = _hypergeometric(
        annotation_df=kegg_df,
        column_name='kegg_pathway',
        target_gene_list=top_geneIDs,
        N=N,
        k=k)
    # attach the human-readable pathway description to each tested pathway
    hyper_geo = hyper_geo.merge(kegg_annotations, how='left')
    return hyper_geo
def _assert_first_annotation(result, expected_first):
    """Check that `result` is a DataFrame whose top-left cell equals `expected_first`."""
    assert isinstance(result, pd.DataFrame)
    assert result.iloc[0, 0] == expected_first  # check first value is correct


def test_go_hypergeometric():
    """GO enrichment on the four-species analysis returns a DataFrame led by the expected term."""
    enrichment = xpress.go_hypergeometric(
        analysis='gamb_colu_arab_fun',
        name="median",
        func=np.nanmedian,
    )
    _assert_first_annotation(enrichment, 'GO:0042302')


def test_pfam_hypergeometric():
    """Pfam enrichment on the four-species analysis returns a DataFrame led by the expected domain."""
    enrichment = xpress.pfam_hypergeometric(
        analysis='gamb_colu_arab_fun',
        name="median",
        func=np.nanmedian,
    )
    _assert_first_annotation(enrichment, 'C_tripleX')


def test_kegg_hypergeometric():
    """KEGG enrichment on the four-species analysis returns a DataFrame led by the expected pathway."""
    enrichment = xpress.kegg_hypergeometric(
        analysis='gamb_colu_arab_fun',
        name="median",
        func=np.nanmedian,
    )
    _assert_first_annotation(enrichment, 'aga00982')