diff --git a/Manuscript_Analysis/08_multi_habitat_cAMPs.py b/Manuscript_Analysis/08_multi_habitat_cAMPs.py
index e359d8a..6ecb0be 100644
--- a/Manuscript_Analysis/08_multi_habitat_cAMPs.py
+++ b/Manuscript_Analysis/08_multi_habitat_cAMPs.py
@@ -1,133 +1,36 @@
 #!/usr/bin/env python
 # coding: utf-8
-
-# # AMPSphere v.2022-03
-#
-# This is a notebook meant to form the set of notebooks used to analyze the data in AMPSphere and write the manuscript:
-#
-# __AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__
-
-# ### Analysis of abundance and diversity of c_AMPs
-#
-# c_AMPs are distributed as gene variants through many species. Here, we will test:
-#
-# I. Is there multi-habitat c_AMPs?
-# II. Are multi-habitat c_AMPs clonal?
-
-# In[1]:
-
-
-import lzma
 import random
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
-from Bio import SeqIO
 from tqdm import tqdm
 from scipy.stats import norm
 from scipy.stats import shapiro
 from scipy.stats import pearsonr, spearmanr
+import environments
-
-# In[2]:
-
-
-higher_level = {'sediment' : 'other',
-                'bird gut' : 'other animal',
-                'cat gut' : 'non-human mammal gut',
-                'insect associated' : 'other animal',
-                'human urogenital tract' : 'other human',
-                'dog gut' : 'non-human mammal gut',
-                'fermented food' : 'anthropogenic',
-                'groundwater' : 'aquatic',
-                'coral associated' : 'other animal',
-                'rat gut' : 'non-human mammal gut',
-                'human associated' : 'other human',
-                'cattle gut' : 'non-human mammal gut',
-                'deer gut' : 'non-human mammal gut',
-                'mouse gut' : 'non-human mammal gut',
-                'river associated' : 'aquatic',
-                'primate gut' : 'non-human mammal gut',
-                'human respiratory tract' : 'other human',
-                'cattle rumen' : 'other animal',
-                'human saliva' : 'other human',
-                'activated sludge' : 'anthropogenic',
-                'lake associated' : 'aquatic',
-                'wastewater' : 'anthropogenic',
-                'chicken gut' : 'other animal',
-                'air' : 'other',
-                'human mouth' : 'other human',
-                'plant associated' : 'soil/plant',
-                'water associated' : 'aquatic',
-                'pig gut' : 'non-human mammal gut',
-                'human skin' : 'other human',
-                'marine' : 'aquatic',
-                'soil' : 'soil/plant',
-                'built environment' : 'anthropogenic',
-                'human gut' : 'human gut',
-                'anthropogenic': 'anthropogenic',
-                'bear gut' : 'non-human mammal gut',
-                'bee gut': 'other animal',
-                'bat gut': 'non-human mammal gut',
-                'dog associated': 'other animal',
-                'cattle associated': 'other animal',
-                'crustacean associated': 'other animal',
-                'insect gut': 'other animal',
-                'goat gut': 'non-human mammal gut',
-                'rodent gut': 'non-human mammal gut',
-                'fisher gut': 'non-human mammal gut',
-                'human digestive tract': 'other human',
-                'coyote gut': 'non-human mammal gut',
-                'planarian associated': 'other animal',
-                'sponge associated': 'other animal',
-                'goat rumen': 'other animal',
-                'crustacean gut': 'other animal',
-                'annelidae associated': 'other animal',
-                'bird skin': 'other animal',
-                'beatle gut': 'other animal',
-                'termite gut': 'other animal',
-                'fish gut': 'other animal',
-                'mollusc associated': 'other animal',
-                'ship worm associated': 'other animal',
-                'rabbit gut': 'non-human mammal gut',
-                'tunicate associated': 'other animal',
-                'mussel associated': 'other animal',
-                'horse gut': 'non-human mammal gut',
-                'wasp gut': 'other animal',
-                'guinea pig gut': 'non-human mammal gut'}
-
-
-# In[3]:
-
-
-# load data
 data = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
-
-# In[4]:
-
-
 # eliminate genes/amps without environment (from ProGenomes)
 data = data[~data.general_envo_name.isna()]
-# attribut high-level habitat to genes/amps
-data['high'] = data.general_envo_name.map(lambda x: higher_level.get(x, 'other'))
-
-
-# In[5]:
+# attribute high-level habitat to genes/amps
+data['high'] = data.general_envo_name.map(lambda x: environments.higher_level.get(x, 'other'))
 
 
 # testing multihabitat AMPs
 def get_multihabitat(df, level=None):
     '''
     counts the number of multi-habitat AMPs (present in at least 3 environments)
-    
+
     :inputs:
     data frame containing at least AMP, general_envo_name, high-level habitat
     level which stats for the habitat or high-level environment
-    
+
     :outputs:
     length of the list of multi-habitat c_AMPs
     '''
@@ -135,65 +38,52 @@ def get_multihabitat(df, level=None):
         level = 'high'
     if level == 'low':
         level = 'general_envo_name'
-    
+
     h = df[['amp', level]].drop_duplicates()
     h = h.amp.value_counts()
     h = h[h > 1].index
-    
     return h
 
-# In[6]:
-
-
 l = get_multihabitat(data, 'high')
 l0 = get_multihabitat(data, 'low')
 
-print(f'Multi-habitat AMPs: {len(l0)}\nMulti-high-level-habitat: {len(l)}')
-
-
-# In[7]:
+print(f'Multi-habitat AMPs: {len(l0):,}\nMulti-high-level-habitat: {len(l):,}')
 
 
 k = data[['amp', 'high']].drop_duplicates()['amp'].value_counts()
 k = k[k>1].index
-with open('multihabitat_highlevel.txt', 'wt') as out:
+with open('outputs/multihabitat_highlevel.txt', 'wt') as out:
     for i in k: out.write(f'{i}\n')
 
 k = data[['amp', 'general_envo_name']].drop_duplicates()['amp'].value_counts()
 k = k[k>1].index
-with open('multihabitat_generalenvo.txt', 'wt') as out:
+with open('outputs/multihabitat_generalenvo.txt', 'wt') as out:
     for i in k: out.write(f'{i}\n')
 
 
 # ### Permutation test
-# 
+#
 # We shuffle the high-level habitat annotation for the samples, and then, calculate the number of multi-habitat c_AMPs. This operation is repeated 100 times, and then we calculate the average and standard deviation of the distribution of random results. Using Shapiro-Wilk test, we check if the random distribution is normal, and if it is, we calculate the Z-score for the result obtained for AMPSphere. The Z-score is then converted into a p-value to support our conclusions.
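For reference, the Z-score step described above reduces to a few lines of NumPy/SciPy. The sketch below is illustrative only: `null_counts` and `observed` are invented placeholders standing in for the 100 shuffled counts and the observed AMPSphere count, not values computed from the data.

    # illustrative only: null_counts and observed are made-up placeholders
    import numpy as np
    from scipy.stats import norm, shapiro

    rng = np.random.default_rng(0)
    null_counts = rng.normal(loc=500, scale=20, size=100)  # stands in for 100 shuffled runs
    observed = 5_000                                        # stands in for the real count

    _, p_normal = shapiro(null_counts)                 # normality check of the null distribution
    Z = (observed - null_counts.mean()) / null_counts.std()
    pz = norm.sf(abs(Z))                               # upper-tail probability for the Z-score
    print(p_normal, Z, pz)

The same conversion (mean, standard deviation, then `norm.sf(abs(Z))`) is what the script applies to both the habitat-level and the high-level tests below.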
 
-# In[8]:
-
-
 # testing significance
 def shuffle_test(df, level=None):
     if level == None:
         level = 'high'
     elif level == 'low':
         level = 'general_envo_name'
-    
+
     habitat = df.set_index('sample')[level].to_dict()
-    
+
     values = [v for _, v in habitat.items()]
     random.shuffle(values)
-    
+
     for k, v in zip(habitat, values):
         habitat[k] = v
-    
+
     altdf = df.copy()
     altdf[level] = altdf['sample'].map(lambda x: habitat.get(x))
-    
-    return altdf
-
-# In[9]:
+    return altdf
 
 
 #test high level habitats
@@ -205,17 +95,8 @@ def shuffle_test(df, level=None):
 print(test)
 
-
-# In[10]:
-
-
 _, p = shapiro(test)
 
-
-if p < 0.05: res = 'not-normal'
-else: res = 'normal'
-
-print(f'The Shapiro-Wilks test returned a p = {p}')
-print(f'This means that the distribution is {res}')
+print(f'The Shapiro-Wilks test returned p = {p}')
 
 avg, std = np.mean(test), np.std(test)
 
@@ -230,9 +111,6 @@ def shuffle_test(df, level=None):
 print(f'This gives us a p-value of {pz}')
 
 
-# In[17]:
-
-
 #test habitats
 test_low = []
 for _ in tqdm(range(100)):
@@ -242,17 +120,9 @@ def shuffle_test(df, level=None):
 print(test_low)
 
-
-# In[18]:
-
-
 _, p = shapiro(test_low)
 
-if p < 0.05: res = 'not-normal'
-else: res = 'normal'
-
 print(f'The Shapiro-Wilks test returned a p = {p}')
-print(f'This means that the distribution is {res}')
 
 avg, std = np.mean(test_low), np.std(test_low)
 
@@ -262,7 +132,7 @@ def shuffle_test(df, level=None):
 pz = norm.sf(abs(Z))
 
-print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0)}')
+print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0):,}')
 print(f'It was {Z} * std of the random distribution')
 print(f'This gives us a p-value of {pz}')
diff --git a/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py b/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
index aefd720..b572455 100644
--- a/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
+++ b/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
@@ -1,92 +1,14 @@
 #!/usr/bin/env python
-# coding: utf-8
-
-# # AMPSphere v.2022-03
-#
-# This is a notebook meant to form the set of notebooks used to analyze the data in AMPSphere and write the manuscript:
-#
-# __AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__
-#
-# ### Summarizing AMPSphere origins and results in supplementary tables
-#
-# Here, we summarize the metagenomes, genomes and other info needed to rebuild AMPSphere along with other supplementary data reporting results.
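Both scripts replace the inline habitat dictionary with `import environments`. The module itself is not part of this diff; a minimal sketch of the layout it presumably has (entries copied from the mapping deleted above) could look like:

    # environments.py -- assumed layout, not the actual module from the repository
    higher_level = {
        'human gut': 'human gut',
        'pig gut': 'non-human mammal gut',
        'chicken gut': 'other animal',
        'marine': 'aquatic',
        'soil': 'soil/plant',
        'wastewater': 'anthropogenic',
        'air': 'other',
        # ... remaining habitat -> high-level habitat pairs from the deleted dictionary
    }

Keeping the mapping in one importable place prevents the two script-local copies from drifting apart, which is presumably why both scripts now reference `environments.higher_level`.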
-# In[1]:
-
-
-# loading libraries
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 from scipy.stats import spearmanr
+import environments
 
-
-# In[2]:
-
-
-higher_level = {'sediment' : 'other',
-                'bird gut' : 'other animal',
-                'cat gut' : 'non-human mammal gut',
-                'insect associated' : 'other animal',
-                'human urogenital tract' : 'other human',
-                'dog gut' : 'non-human mammal gut',
-                'fermented food' : 'anthropogenic',
-                'groundwater' : 'aquatic',
-                'coral associated' : 'other animal',
-                'rat gut' : 'non-human mammal gut',
-                'human associated' : 'other human',
-                'cattle gut' : 'non-human mammal gut',
-                'deer gut' : 'non-human mammal gut',
-                'mouse gut' : 'non-human mammal gut',
-                'river associated' : 'aquatic',
-                'primate gut' : 'non-human mammal gut',
-                'human respiratory tract' : 'other human',
-                'cattle rumen' : 'other animal',
-                'human saliva' : 'other human',
-                'activated sludge' : 'anthropogenic',
-                'lake associated' : 'aquatic',
-                'wastewater' : 'anthropogenic',
-                'chicken gut' : 'other animal',
-                'air' : 'other',
-                'human mouth' : 'other human',
-                'plant associated' : 'soil/plant',
-                'water associated' : 'aquatic',
-                'pig gut' : 'non-human mammal gut',
-                'human skin' : 'other human',
-                'marine' : 'aquatic',
-                'soil' : 'soil/plant',
-                'built environment' : 'anthropogenic',
-                'human gut' : 'human gut',
-                'anthropogenic': 'anthropogenic',
-                'bear gut' : 'non-human mammal gut',
-                'bee gut': 'other animal',
-                'bat gut': 'non-human mammal gut',
-                'dog associated': 'other animal',
-                'cattle associated': 'other animal',
-                'crustacean associated': 'other animal',
-                'insect gut': 'other animal',
-                'goat gut': 'non-human mammal gut',
-                'rodent gut': 'non-human mammal gut',
-                'fisher gut': 'non-human mammal gut',
-                'human digestive tract': 'other human',
-                'coyote gut': 'non-human mammal gut',
-                'planarian associated': 'other animal',
-                'sponge associated': 'other animal',
-                'goat rumen': 'other animal',
-                'crustacean gut': 'other animal',
-                'annelidae associated': 'other animal',
-                'bird skin': 'other animal',
-                'beatle gut': 'other animal',
-                'termite gut': 'other animal',
-                'fish gut': 'other animal',
-                'mollusc associated': 'other animal',
-                'ship worm associated': 'other animal',
-                'rabbit gut': 'non-human mammal gut',
-                'tunicate associated': 'other animal',
-                'mussel associated': 'other animal',
-                'horse gut': 'non-human mammal gut',
-                'wasp gut': 'other animal',
-                'guinea pig gut': 'non-human mammal gut'}
+for k in set(environments.higher_level) ^ set(higher_level.keys()):
+    print(f'{k:50}\t{environments.higher_level[k]}')
 
 
 is_host_associated = {'human gut' : True,
@@ -111,29 +33,16 @@
                       'red fir'}
 
 
-# In[3]:
-
-
 def listing(x):
     return ', '.join(x.tolist())
 
 
-# In[4]:
-
-
-# load data
 data = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
-
-data = data[data.is_metagenomic == True]
-
-data['higher'] = data['general_envo_name'].map(lambda g: higher_level.get(g, 'other'))
-
+data = data.query('is_metagenomic')
+data['higher'] = data['general_envo_name'].map(environments.higher_level)
 data['host_associated'] = data['higher'].map(lambda g: is_host_associated.get(g, 'NA'))
 
 
-# In[5]:
-
-
 nf = data[['amp','host_associated']]
 nf = nf.drop_duplicates()
 nf = nf.groupby('host_associated')
@@ -141,9 +50,6 @@ def listing(x):
 print(nf)
 
 
-# In[6]:
-
-
 nf = data[data.higher.isin(['soil/plant', 'aquatic'])]
 nf = nf[['amp', 'higher']]
 nf = nf.drop_duplicates()
@@ -152,10 +58,7 @@ def listing(x):
 print(nf)
 
 
-# In[7]:
-
-
-nf = data[data.higher == 'anthropogenic']
+nf = data.query('higher == "anthropogenic"')
 nf = nf[['amp','higher']]
 nf = nf.drop_duplicates()
 nf = nf.groupby('higher')
@@ -163,15 +66,8 @@ def listing(x):
 print(nf)
 
 
-# In[8]:
-
-
-nf = len(set(data[(data.higher == 'other')]['amp']))
-print(f'Other environments: {nf}')
-
-
-# In[9]:
-
+nf = len(set(data.query('higher == "other"')['amp']))
+print(f'Other environments: {nf:,}')
 
 metadata = pd.read_table('../data_folder/metadata.tsv.gz')
 metadata.rename({'sample_accession': 'sample'}, axis=1, inplace=True)
@@ -180,14 +76,8 @@ def listing(x):
 nf[['host_common_name', 'amp']].drop_duplicates().groupby('host_common_name').agg('size').sort_values()
 
 
-# In[10]:
-
-
 nf['nenvo'] = [x if x not in plants else 'plant' for x in nf.host_common_name]
 
-nf[['nenvo', 'amp']].drop_duplicates().groupby('nenvo').agg('size').sort_values()
-
-
-# In[11]:
+nf[['nenvo', 'amp']].drop_duplicates().groupby('nenvo').agg('size').sort_values()
 
 
 samples = data[['sample', 'higher']]
@@ -196,45 +86,31 @@ def listing(x):
 samples = samples.agg('size')
 
 
-# In[12]:
-
-
 habitats = data[['higher', 'general_envo_name']].drop_duplicates()
-habitats = habitats.groupby('higher')['general_envo_name'].apply(lambda x: listing(x))
-
-
-# In[13]:
+habitats = habitats.groupby('higher')['general_envo_name'].apply(listing)
 
 
 redamps = data.groupby('higher').agg('size')
 
 
-# In[14]:
-
-
 nramps = data[['higher', 'amp']]
 nramps = nramps.drop_duplicates()
 nramps = nramps.groupby('higher')
 nramps = nramps.agg('size')
 
 
-# In[15]:
-
-
 fams = pd.read_table('../data_folder/SPHERE_v.2022-03.levels_assessment.tsv.gz')
-fams = fams.rename({'AMP accession': 'amp',
+fams
+fams.rename(columns={'AMP accession': 'amp',
                      'SPHERE_fam level III': 'family'},
-            axis=1)
-
+            inplace=True)
+
 fams = fams[['amp', 'family']]
 data = data.merge(on='amp', right=fams)
 fams = fams.groupby('family').agg('size')
 fams = fams[fams >= 8].index
 
 
-# In[16]:
-
-
 data = data[['higher', 'family']].drop_duplicates()
 famps = data.groupby('higher').agg('size')
 famp_l = data[data.family.isin(fams)].groupby('higher').agg('size')
@@ -242,8 +118,6 @@ def listing(x):
 # Here, it follows the supplementary table with info about the samples, number of redundant and non-redundant AMPs, as well as the number of clusters and families each high-level habitat affiliates.
 
-# In[17]:
-
 df = pd.concat([habitats,
                 samples,
@@ -254,58 +128,43 @@ def listing(x):
                 axis=1)
 
 df = df.reset_index()
-df = df.rename({'higher': 'high level environment',
+df.rename(columns={'higher': 'high level environment',
                 'general_envo_name': 'habitats',
                 0: 'samples',
                 1: 'redundant AMPs',
                 2: 'non-redundant AMPs',
                 3: 'AMP clusters',
                 4: 'AMP families'},
-               axis=1)
-
-df
+          inplace=True)
 
 
 # ### Information about the samples used in AMPSphere
 
-# In[18]:
-
-
 # load data again
 data = pd.read_table('../data_folder/samples-min500k-assembly-prodigal-stats.tsv.gz')
 amps = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
 
 
-# In[19]:
-
-
 # filter columns
 namps = amps[['amp', 'sample', 'general_envo_name']]
 
 
-# In[20]:
-
-
 # eliminate redundancy
 namps = namps.drop_duplicates()
 namps = namps.groupby('sample')
 namps = namps.agg('size')
 namps = namps.reset_index()
-
-namps = namps.rename({0: 'amps',
-                      'sample': 'sample_accession'},
-                     axis=1)
-
-
-# In[21]:
+namps.rename(columns={0: 'amps',
+                      'sample': 'sample_accession'},
+             inplace=True)
 
 
 # merge splitted data
 a = data.merge(on='sample_accession', right=namps)
-    
+
 b = data[~data.sample_accession.isin(namps.sample_accession)]
 data = pd.concat([a, b])
 data.amps = data.amps.fillna(0)
@@ -313,13 +172,10 @@ def listing(x):
 data['amps_per_assembled_Mbp'] = data.amps * 1_000_000 / data.assembly_total_length
 
 
-# In[22]:
-
 # more data...
 envo = pd.read_table('../data_folder/metadata.tsv.xz')
 gen = pd.read_table('../data_folder/general_envo_names.tsv.xz')
-
 envo = envo.merge(on=['microontology',
                       'host_scientific_name',
                       'host_tax_id'],
@@ -341,9 +197,6 @@ def listing(x):
                   right=gen)
 
 data = data.merge(on='sample_accession', right=envo)
 
-# In[23]:
-
-
 # supp table S1
 sup1 = data[['sample_accession', 'general_envo_name', 'inserts_raw', 'assembly_total_length',