From 5f5888d4ca038b9b46f45d1a2ff8ed7e66ab316a Mon Sep 17 00:00:00 2001
From: Luis Pedro Coelho <luis@luispedro.org>
Date: Wed, 10 Jan 2024 18:22:30 +1000
Subject: [PATCH] RFCT Simplify code

- Import `higher_level` dict from module
- Use `inplace=True` instead of assigning
---
 Manuscript_Analysis/08_multi_habitat_cAMPs.py | 164 ++-------------
 ...mentary_info_about_habitats_and_samples.py | 187 ++----------------
 2 files changed, 37 insertions(+), 314 deletions(-)

diff --git a/Manuscript_Analysis/08_multi_habitat_cAMPs.py b/Manuscript_Analysis/08_multi_habitat_cAMPs.py
index e359d8a..6ecb0be 100644
--- a/Manuscript_Analysis/08_multi_habitat_cAMPs.py
+++ b/Manuscript_Analysis/08_multi_habitat_cAMPs.py
@@ -1,133 +1,36 @@
 #!/usr/bin/env python
 # coding: utf-8
-
-# # AMPSphere v.2022-03
-# 
-# This is a notebook meant to form the set of notebooks used to analyze the data in AMPSphere and write the manuscript:
-# 
-# __AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__
-
-# ### Analysis of abundance and diversity of c_AMPs
-# 
-# c_AMPs are distributed as gene variants through many species. Here, we will test:
-#     
-#     I. Is there multi-habitat c_AMPs?
-#     II. Are multi-habitat c_AMPs clonal?
-
-# In[1]:
-
-
-import lzma
 import random
 import numpy as np
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 
-from Bio import SeqIO
 from tqdm import tqdm
 from scipy.stats import norm
 from scipy.stats import shapiro
 from scipy.stats import pearsonr, spearmanr
 
+import environments
 
-# In[2]:
-
-
-higher_level = {'sediment' : 'other',
-        'bird gut' : 'other animal',
-        'cat gut' : 'non-human mammal gut',
-        'insect associated' : 'other animal',
-        'human urogenital tract' : 'other human',
-        'dog gut' : 'non-human mammal gut',
-        'fermented food' : 'anthropogenic',
-        'groundwater' : 'aquatic',
-        'coral associated' : 'other animal',
-        'rat gut' : 'non-human mammal gut',
-        'human associated' : 'other human',
-        'cattle gut' : 'non-human mammal gut',
-        'deer gut' : 'non-human mammal gut',
-        'mouse gut' : 'non-human mammal gut',
-        'river associated' : 'aquatic',
-        'primate gut' : 'non-human mammal gut',
-        'human respiratory tract' : 'other human',
-        'cattle rumen' : 'other animal',
-        'human saliva' : 'other human',
-        'activated sludge' : 'anthropogenic',
-        'lake associated' : 'aquatic',
-        'wastewater' : 'anthropogenic',
-        'chicken gut' : 'other animal',
-        'air' : 'other',
-        'human mouth' : 'other human',
-        'plant associated' : 'soil/plant',
-        'water associated' : 'aquatic',
-        'pig gut' : 'non-human mammal gut',
-        'human skin' : 'other human',
-        'marine' : 'aquatic',
-        'soil' : 'soil/plant',
-        'built environment' : 'anthropogenic',
-        'human gut' : 'human gut',
-        'anthropogenic': 'anthropogenic',
-        'bear gut' : 'non-human mammal gut',
-        'bee gut': 'other animal',
-        'bat gut': 'non-human mammal gut',
-        'dog associated': 'other animal',
-        'cattle associated': 'other animal',
-        'crustacean associated': 'other animal',
-        'insect gut': 'other animal',
-        'goat gut': 'non-human mammal gut', 
-        'rodent gut': 'non-human mammal gut',
-        'fisher gut': 'non-human mammal gut',
-        'human digestive tract': 'other human',
-        'coyote gut': 'non-human mammal gut',
-        'planarian associated': 'other animal',
-        'sponge associated': 'other animal',
-        'goat rumen': 'other animal',
-        'crustacean gut': 'other animal',
-        'annelidae associated': 'other animal',
-        'bird skin': 'other animal',
-        'beatle gut': 'other animal',
-        'termite gut': 'other animal', 
-        'fish gut': 'other animal',
-        'mollusc associated': 'other animal',
-        'ship worm associated': 'other animal',
-        'rabbit gut': 'non-human mammal gut',
-        'tunicate associated': 'other animal',
-        'mussel associated': 'other animal',
-        'horse gut': 'non-human mammal gut',
-        'wasp gut': 'other animal',
-        'guinea pig gut': 'non-human mammal gut'}
-
-
-# In[3]:
-
-
-# load data
 data = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
 
-
-# In[4]:
-
-
 # eliminate genes/amps without environment (from ProGenomes)
 data = data[~data.general_envo_name.isna()]
 
-# attribut high-level habitat to genes/amps
-data['high'] = data.general_envo_name.map(lambda x: higher_level.get(x, 'other'))
-
-
-# In[5]:
+# attribute high-level habitat to genes/amps
+data['high'] = data.general_envo_name.map(lambda x: environments.higher_level.get(x, 'other'))
 
 
 # testing multihabitat AMPs
 def get_multihabitat(df, level=None):
     '''
     counts the number of multi-habitat AMPs (present in at least 3 environments)
-    
+
     :inputs:
     data frame containing at least AMP, general_envo_name, high-level habitat
     level which stats for the habitat or high-level environment
-    
+
     :outputs:
     length of the list of multi-habitat c_AMPs
     '''
@@ -135,65 +38,52 @@ def get_multihabitat(df, level=None):
         level = 'high'
     if level == 'low':
         level = 'general_envo_name'
-        
+
     h = df[['amp', level]].drop_duplicates()
     h = h.amp.value_counts()
     h = h[h > 1].index
-    
     return h
 
 
-# In[6]:
-
-
 l = get_multihabitat(data, 'high')
 l0 = get_multihabitat(data, 'low')
-print(f'Multi-habitat AMPs: {len(l0)}\nMulti-high-level-habitat: {len(l)}')
-
-
-# In[7]:
+print(f'Multi-habitat AMPs: {len(l0):,}\nMulti-high-level-habitat: {len(l):,}')
 
 
 k = data[['amp', 'high']].drop_duplicates()['amp'].value_counts()
 k = k[k>1].index
 
-with open('multihabitat_highlevel.txt', 'wt') as out:
+with open('outputs/multihabitat_highlevel.txt', 'wt') as out:
     for i in k: out.write(f'{i}\n')
 
 k = data[['amp', 'general_envo_name']].drop_duplicates()['amp'].value_counts()
 k = k[k>1].index
 
-with open('multihabitat_generalenvo.txt', 'wt') as out:
+with open('outputs/multihabitat_generalenvo.txt', 'wt') as out:
     for i in k: out.write(f'{i}\n')
 
 
 # ### Permutation test
-# 
+#
 # We shuffle the high-level habitat annotation for the samples, and then, calculate the number of multi-habitat c_AMPs. This operation is repeated 100 times, and then we calculate the average and standard deviation of the distribution of random results. Using Shapiro-Wilk test, we check if the random distribution is normal, and if it is, we calculate the Z-score for the result obtained for AMPSphere. The Z-score is then converted into a p-value to support our conclusions.
 
-# In[8]:
-
-
 # testing significance
 def shuffle_test(df, level=None):
     if level == None: level = 'high'
     elif level == 'low': level = 'general_envo_name'
-        
+
     habitat = df.set_index('sample')[level].to_dict()
-    
+
     values = [v for _, v in habitat.items()]
     random.shuffle(values)
-    
+
     for k, v in zip(habitat, values):
         habitat[k] = v
-        
+
     altdf = df.copy()
     altdf[level] = altdf['sample'].map(lambda x: habitat.get(x))
-    
-    return altdf
 
-
-# In[9]:
+    return altdf
 
 
 #test high level habitats
@@ -205,17 +95,8 @@ def shuffle_test(df, level=None):
 
 print(test)
 
-
-# In[10]:
-
-
 _, p = shapiro(test)
-
-if p < 0.05: res = 'not-normal'
-else: res = 'normal'
-
-print(f'The Shapiro-Wilks test returned a p = {p}')
-print(f'This means that the distribution is {res}')
+print(f'The Shapiro-Wilks test returned p = {p}')
 
 avg, std = np.mean(test), np.std(test)
 
@@ -230,9 +111,6 @@ def shuffle_test(df, level=None):
 print(f'This gives us a p-value of {pz}')
 
 
-# In[17]:
-
-
 #test habitats
 test_low = []
 for _ in tqdm(range(100)):
@@ -242,17 +120,9 @@ def shuffle_test(df, level=None):
 
 print(test_low)
 
-
-# In[18]:
-
-
 _, p = shapiro(test_low)
 
-if p < 0.05: res = 'not-normal'
-else: res = 'normal'
-
 print(f'The Shapiro-Wilks test returned a p = {p}')
-print(f'This means that the distribution is {res}')
 
 avg, std = np.mean(test_low), np.std(test_low)
 
@@ -262,7 +132,7 @@ def shuffle_test(df, level=None):
 
 pz = norm.sf(abs(Z))
 
-print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0)}')
+print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0):,}')
 print(f'It was {Z} * std of the random distribution')
 print(f'This gives us a p-value of {pz}')
 
diff --git a/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py b/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
index aefd720..b572455 100644
--- a/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
+++ b/Manuscript_Analysis/09_supplementary_info_about_habitats_and_samples.py
@@ -1,92 +1,14 @@
 #!/usr/bin/env python
-# coding: utf-8
 
-# # AMPSphere v.2022-03
-# 
-# This is a notebook meant to form the set of notebooks used to analyze the data in AMPSphere and write the manuscript:
-# 
-# __AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__
-# 
-# ### Summarizing AMPSphere origins and results in supplementary tables
-# 
-# Here, we summarize the metagenomes, genomes and other info needed to rebuild AMPSphere along with other supplementary data reporting results.
 
-# In[1]:
-
-
-# loading libraries
 import pandas as pd
 import seaborn as sns
 import matplotlib.pyplot as plt
 from scipy.stats import spearmanr
+import environments
 
-
-# In[2]:
-
-
-higher_level = {'sediment' : 'other',
-        'bird gut' : 'other animal',
-        'cat gut' : 'non-human mammal gut',
-        'insect associated' : 'other animal',
-        'human urogenital tract' : 'other human',
-        'dog gut' : 'non-human mammal gut',
-        'fermented food' : 'anthropogenic',
-        'groundwater' : 'aquatic',
-        'coral associated' : 'other animal',
-        'rat gut' : 'non-human mammal gut',
-        'human associated' : 'other human',
-        'cattle gut' : 'non-human mammal gut',
-        'deer gut' : 'non-human mammal gut',
-        'mouse gut' : 'non-human mammal gut',
-        'river associated' : 'aquatic',
-        'primate gut' : 'non-human mammal gut',
-        'human respiratory tract' : 'other human',
-        'cattle rumen' : 'other animal',
-        'human saliva' : 'other human',
-        'activated sludge' : 'anthropogenic',
-        'lake associated' : 'aquatic',
-        'wastewater' : 'anthropogenic',
-        'chicken gut' : 'other animal',
-        'air' : 'other',
-        'human mouth' : 'other human',
-        'plant associated' : 'soil/plant',
-        'water associated' : 'aquatic',
-        'pig gut' : 'non-human mammal gut',
-        'human skin' : 'other human',
-        'marine' : 'aquatic',
-        'soil' : 'soil/plant',
-        'built environment' : 'anthropogenic',
-        'human gut' : 'human gut',
-        'anthropogenic': 'anthropogenic',
-        'bear gut' : 'non-human mammal gut',
-        'bee gut': 'other animal',
-        'bat gut': 'non-human mammal gut',
-        'dog associated': 'other animal',
-        'cattle associated': 'other animal',
-        'crustacean associated': 'other animal',
-        'insect gut': 'other animal',
-        'goat gut': 'non-human mammal gut', 
-        'rodent gut': 'non-human mammal gut',
-        'fisher gut': 'non-human mammal gut',
-        'human digestive tract': 'other human',
-        'coyote gut': 'non-human mammal gut',
-        'planarian associated': 'other animal',
-        'sponge associated': 'other animal',
-        'goat rumen': 'other animal',
-        'crustacean gut': 'other animal',
-        'annelidae associated': 'other animal',
-        'bird skin': 'other animal',
-        'beatle gut': 'other animal',
-        'termite gut': 'other animal', 
-        'fish gut': 'other animal',
-        'mollusc associated': 'other animal',
-        'ship worm associated': 'other animal',
-        'rabbit gut': 'non-human mammal gut',
-        'tunicate associated': 'other animal',
-        'mussel associated': 'other animal',
-        'horse gut': 'non-human mammal gut',
-        'wasp gut': 'other animal',
-        'guinea pig gut': 'non-human mammal gut'}
+for k in set(environments.higher_level) ^ set(higher_level.keys()):
+    print(f'{k:50}\t{environments.higher_level[k]}')
 
 
 is_host_associated = {'human gut' : True,
@@ -111,29 +33,16 @@
         'red fir'}
 
 
-# In[3]:
-
-
 def listing(x):
     return ', '.join(x.tolist())
 
 
-# In[4]:
-
-
-# load data
 data = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
-
-data = data[data.is_metagenomic == True]
-
-data['higher'] = data['general_envo_name'].map(lambda g: higher_level.get(g, 'other'))
-
+data = data.query('is_metagenomic')
+data['higher'] = data['general_envo_name'].map(environments.higher_level)
 data['host_associated'] = data['higher'].map(lambda g: is_host_associated.get(g, 'NA'))
 
 
-# In[5]:
-
-
 nf = data[['amp','host_associated']]
 nf = nf.drop_duplicates()
 nf = nf.groupby('host_associated')
@@ -141,9 +50,6 @@ def listing(x):
 print(nf)
 
 
-# In[6]:
-
-
 nf = data[data.higher.isin(['soil/plant', 'aquatic'])]
 nf = nf[['amp', 'higher']]
 nf = nf.drop_duplicates()
@@ -152,10 +58,7 @@ def listing(x):
 print(nf)
 
 
-# In[7]:
-
-
-nf = data[data.higher == 'anthropogenic']
+nf = data.query('higher == "anthropogenic"')
 nf = nf[['amp','higher']]
 nf = nf.drop_duplicates()
 nf = nf.groupby('higher')
@@ -163,15 +66,8 @@ def listing(x):
 print(nf)
 
 
-# In[8]:
-
-
-nf = len(set(data[(data.higher == 'other')]['amp']))
-print(f'Other environments: {nf}')
-
-
-# In[9]:
-
+nf = len(set(data.query('higher == "other"')['amp']))
+print(f'Other environments: {nf:,}')
 
 metadata = pd.read_table('../data_folder/metadata.tsv.gz')
 metadata.rename({'sample_accession': 'sample'}, axis=1, inplace=True)
@@ -180,14 +76,8 @@ def listing(x):
 nf[['host_common_name', 'amp']].drop_duplicates().groupby('host_common_name').agg('size').sort_values()
 
 
-# In[10]:
-
-
 nf['nenvo'] = [x if x not in plants else 'plant' for x in nf.host_common_name]
-nf[['nenvo', 'amp']].drop_duplicates().groupby('nenvo').agg('size').sort_values()  
-
-
-# In[11]:
+nf[['nenvo', 'amp']].drop_duplicates().groupby('nenvo').agg('size').sort_values()
 
 
 samples = data[['sample', 'higher']]
@@ -196,45 +86,31 @@ def listing(x):
 samples = samples.agg('size')
 
 
-# In[12]:
-
-
 habitats = data[['higher', 'general_envo_name']].drop_duplicates()
-habitats = habitats.groupby('higher')['general_envo_name'].apply(lambda x: listing(x))
-
-
-# In[13]:
+habitats = habitats.groupby('higher')['general_envo_name'].apply(listing)
 
 
 redamps = data.groupby('higher').agg('size')
 
 
-# In[14]:
-
-
 nramps = data[['higher', 'amp']]
 nramps = nramps.drop_duplicates()
 nramps = nramps.groupby('higher')
 nramps = nramps.agg('size')
 
 
-# In[15]:
-
-
 fams = pd.read_table('../data_folder/SPHERE_v.2022-03.levels_assessment.tsv.gz')
-fams = fams.rename({'AMP accession': 'amp',
+fams
+fams.rename(columns={'AMP accession': 'amp',
                     'SPHERE_fam level III': 'family'},
-                   axis=1)
-                   
+            inplace=True)
+
 fams = fams[['amp', 'family']]
 data = data.merge(on='amp', right=fams)
 fams = fams.groupby('family').agg('size')
 fams = fams[fams >= 8].index
 
 
-# In[16]:
-
-
 data = data[['higher', 'family']].drop_duplicates()
 famps = data.groupby('higher').agg('size')
 famp_l = data[data.family.isin(fams)].groupby('higher').agg('size')
@@ -242,8 +118,6 @@ def listing(x):
 
 # Here, it follows the supplementary table with info about the samples, number of redundant and non-redundant AMPs, as well as the number of clusters and families each high-level habitat affiliates.
 
-# In[17]:
-
 
 df = pd.concat([habitats,
                 samples,
@@ -254,58 +128,43 @@ def listing(x):
                axis=1)
 
 df = df.reset_index()
-df = df.rename({'higher': 'high level environment',
+df.rename(columns={'higher': 'high level environment',
                 'general_envo_name': 'habitats',
                 0: 'samples',
                 1: 'redundant AMPs',
                 2: 'non-redundant AMPs',
                 3: 'AMP clusters',
                 4: 'AMP families'},
-               axis=1)
-
-df
+          inplace=True)
 
 
 # ### Information about the samples used in AMPSphere
 
-# In[18]:
-
-
 # load data again
 data = pd.read_table('../data_folder/samples-min500k-assembly-prodigal-stats.tsv.gz')
 amps = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')
 
 
-# In[19]:
-
-
 # filter columns
 namps = amps[['amp',
               'sample',
               'general_envo_name']]
 
 
-# In[20]:
-
-
 # eliminate redundancy
 namps = namps.drop_duplicates()
 namps = namps.groupby('sample')
 namps = namps.agg('size')
 namps = namps.reset_index()
-
-namps = namps.rename({0: 'amps',
-                      'sample': 'sample_accession'},
-                     axis=1)
-
-
-# In[21]:
+namps.rename(columns={0: 'amps',
+                  'sample': 'sample_accession'},
+                 inplace=True)
 
 
 # merge splitted data
 a = data.merge(on='sample_accession',
                right=namps)
-               
+
 b = data[~data.sample_accession.isin(namps.sample_accession)]
 data = pd.concat([a, b])
 data.amps = data.amps.fillna(0)
@@ -313,13 +172,10 @@ def listing(x):
 data['amps_per_assembled_Mbp'] = data.amps * 1_000_000 / data.assembly_total_length
 
 
-# In[22]:
-
 
 # more data...
 envo = pd.read_table('../data_folder/metadata.tsv.xz')
 gen = pd.read_table('../data_folder/general_envo_names.tsv.xz')
-
 envo = envo.merge(on=['microontology',
                       'host_scientific_name',
                       'host_tax_id'],
@@ -341,9 +197,6 @@ def listing(x):
 data = data.merge(on='sample_accession', right=envo)
 
 
-# In[23]:
-
-
 # supp table S1
 sup1 = data[['sample_accession', 'general_envo_name',
              'inserts_raw', 'assembly_total_length',