From c6f35ab3b5706303d129cacc78857e3d11ff4009 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Wed, 10 Jan 2024 16:06:50 +1000 Subject: [PATCH] BUG Add missing imports --- .../07_c_AMP_features_comparison.py | 5 ++- Manuscript_Analysis/coprediction-activy.py | 43 +++++++++++++++++++ 2 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 Manuscript_Analysis/coprediction-activy.py diff --git a/Manuscript_Analysis/07_c_AMP_features_comparison.py b/Manuscript_Analysis/07_c_AMP_features_comparison.py index 4caf264..0036308 100644 --- a/Manuscript_Analysis/07_c_AMP_features_comparison.py +++ b/Manuscript_Analysis/07_c_AMP_features_comparison.py @@ -5,10 +5,13 @@ import pandas as pd import numpy as np import seaborn as sns +from scipy import stats from matplotlib import pyplot as plt from matplotlib import cm from modlamp.descriptors import GlobalDescriptor from modlamp.descriptors import PeptideDescriptor +from statsmodels.stats.multitest import multipletests + from os import makedirs makedirs('figures', exist_ok=True) plt.rcParams['svg.fonttype'] = 'none' @@ -130,7 +133,7 @@ def getlims(feat): for i in range(len(data_groups)): for j in range(i+1, len(data_groups)): for feat,label in panels: - u,p = mannwhitneyu(data_groups[i][feat], data_groups[j][feat]) + u,p = stats.mannwhitneyu(data_groups[i][feat], data_groups[j][feat]) comparisons.append((data_names[i], data_names[j], feat, p)) print(f'{data_names[i]} vs {data_names[j]}: {label}: p={p:.2e}') comparisons = pd.DataFrame(comparisons, columns=['Group 1', 'Group 2', 'Feature', 'P-value']) diff --git a/Manuscript_Analysis/coprediction-activy.py b/Manuscript_Analysis/coprediction-activy.py new file mode 100644 index 0000000..acbba98 --- /dev/null +++ b/Manuscript_Analysis/coprediction-activy.py @@ -0,0 +1,43 @@ +import seaborn as sns +import pandas as pd +from matplotlib import pyplot as plt +from matplotlib import cm +from scipy import stats +from statsmodels.stats import multitest as mt + +peptides = pd.read_excel('data/Peptide_nickname_AMPSphere.xlsx', index_col=0) +peptides = peptides[peptides.index.str.startswith('AMP10')].rename(columns={'Active?':'Active'}) +peptides['Active'] = peptides['Active'] == 'Yes' +copred = pd.read_table('new_data/AMP_coprediction_AMPSphere.tsv.xz', index_col=0) +quality = pd.read_table('../data_folder/quality_assessment.tsv.xz', index_col=0) + +peptides = peptides.join(copred) +peptides = peptides.join(quality) +peptides.eval('hq = Coordinates == "Passed" & Antifam == "Passed" & RNAcode == "Passed"', inplace=True) +peptides.eval('has_meta = metaproteomes == "Passed" | metatranscriptomes == "Passed"', inplace=True) +preds = ['APIN', 'AMPScanner2', 'AMPlify', 'ampir', 'amPEPpy', 'AI4AMP', 'Macrel'] + +ps = [] +_,p = stats.fisher_exact(peptides.groupby(['Active','APIN']).size().values.reshape((2,2))) +ps.append(p) +for p in preds[1:]: + m_u = (stats.mannwhitneyu(peptides.query('Active')[p], peptides.query('~Active')[p])) + ps.append(m_u.pvalue) + +_, pvals_adjust, _ , _ = mt.multipletests(ps, method='holm-sidak') + + +fig,ax = plt.subplots() +# convert to long format +# [ Active(bool) | Predictor(str) | value(float) ] +peptides_long = pd.melt(peptides[['Active']+preds], id_vars=['Active'], value_vars=preds) +peptides_long.columns = ['Active', 'Predictor', 'Probability'] +peptides_long.Active.replace({True:'Active', False:'Inactive'}, inplace=True) +sns.boxplot(data=peptides_long, x='Predictor', y='Probability', hue='Active', ax=ax, boxprops={'facecolor':'None'}, showfliers=False, legend=False) +s = sns.stripplot(data=peptides_long, x='Predictor', y='Probability', hue='Active', dodge=True, ax=ax, alpha=0.6, palette=cm.Dark2.colors, legend='full') +ax.hlines(0.5, -0.5, len(preds)-.5, linestyles='--', linewidth=1, color='k') +ax.get_legend().set_title(None) +for ix,p in enumerate(pvals_adjust): + ax.text(ix, 1.05, f'p={p:.02f}', ha='center', va='bottom') +sns.despine(fig, trim=True) +fig.savefig('figures/copredictions.svg')