Skip to content

Commit

Permalink
RFCT Simplify code
Browse files Browse the repository at this point in the history
- Import `higher_level` dict from module
- Use `inplace=True` instead of assigning
  • Loading branch information
luispedro committed Jan 10, 2024
1 parent c6f35ab commit 5f5888d
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 314 deletions.
164 changes: 17 additions & 147 deletions Manuscript_Analysis/08_multi_habitat_cAMPs.py
Original file line number Diff line number Diff line change
@@ -1,199 +1,89 @@
#!/usr/bin/env python
# coding: utf-8

# # AMPSphere v.2022-03
#
# This is a notebook meant to form the set of notebooks used to analyze the data in AMPSphere and write the manuscript:
#
# __AMPSphere: Global survey of prokaryotic antimicrobial peptides shaping microbiomes__

# ### Analysis of abundance and diversity of c_AMPs
#
# c_AMPs are distributed as gene variants through many species. Here, we will test:
#
# I. Is there multi-habitat c_AMPs?
# II. Are multi-habitat c_AMPs clonal?

# In[1]:


import lzma
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from Bio import SeqIO
from tqdm import tqdm
from scipy.stats import norm
from scipy.stats import shapiro
from scipy.stats import pearsonr, spearmanr

import environments

# In[2]:


higher_level = {'sediment' : 'other',
'bird gut' : 'other animal',
'cat gut' : 'non-human mammal gut',
'insect associated' : 'other animal',
'human urogenital tract' : 'other human',
'dog gut' : 'non-human mammal gut',
'fermented food' : 'anthropogenic',
'groundwater' : 'aquatic',
'coral associated' : 'other animal',
'rat gut' : 'non-human mammal gut',
'human associated' : 'other human',
'cattle gut' : 'non-human mammal gut',
'deer gut' : 'non-human mammal gut',
'mouse gut' : 'non-human mammal gut',
'river associated' : 'aquatic',
'primate gut' : 'non-human mammal gut',
'human respiratory tract' : 'other human',
'cattle rumen' : 'other animal',
'human saliva' : 'other human',
'activated sludge' : 'anthropogenic',
'lake associated' : 'aquatic',
'wastewater' : 'anthropogenic',
'chicken gut' : 'other animal',
'air' : 'other',
'human mouth' : 'other human',
'plant associated' : 'soil/plant',
'water associated' : 'aquatic',
'pig gut' : 'non-human mammal gut',
'human skin' : 'other human',
'marine' : 'aquatic',
'soil' : 'soil/plant',
'built environment' : 'anthropogenic',
'human gut' : 'human gut',
'anthropogenic': 'anthropogenic',
'bear gut' : 'non-human mammal gut',
'bee gut': 'other animal',
'bat gut': 'non-human mammal gut',
'dog associated': 'other animal',
'cattle associated': 'other animal',
'crustacean associated': 'other animal',
'insect gut': 'other animal',
'goat gut': 'non-human mammal gut',
'rodent gut': 'non-human mammal gut',
'fisher gut': 'non-human mammal gut',
'human digestive tract': 'other human',
'coyote gut': 'non-human mammal gut',
'planarian associated': 'other animal',
'sponge associated': 'other animal',
'goat rumen': 'other animal',
'crustacean gut': 'other animal',
'annelidae associated': 'other animal',
'bird skin': 'other animal',
'beatle gut': 'other animal',
'termite gut': 'other animal',
'fish gut': 'other animal',
'mollusc associated': 'other animal',
'ship worm associated': 'other animal',
'rabbit gut': 'non-human mammal gut',
'tunicate associated': 'other animal',
'mussel associated': 'other animal',
'horse gut': 'non-human mammal gut',
'wasp gut': 'other animal',
'guinea pig gut': 'non-human mammal gut'}


# In[3]:


# load data
data = pd.read_table('../data_folder/gmsc_amp_genes_envohr_source.tsv.gz')


# In[4]:


# eliminate genes/amps without environment (from ProGenomes)
data = data[~data.general_envo_name.isna()]

# attribut high-level habitat to genes/amps
data['high'] = data.general_envo_name.map(lambda x: higher_level.get(x, 'other'))


# In[5]:
# attribute high-level habitat to genes/amps
data['high'] = data.general_envo_name.map(lambda x: environments.higher_level.get(x, 'other'))


# testing multihabitat AMPs
def get_multihabitat(df, level=None):
'''
counts the number of multi-habitat AMPs (present in at least 3 environments)
:inputs:
data frame containing at least AMP, general_envo_name, high-level habitat
level which stats for the habitat or high-level environment
:outputs:
length of the list of multi-habitat c_AMPs
'''
if level == None:
level = 'high'
if level == 'low':
level = 'general_envo_name'

h = df[['amp', level]].drop_duplicates()
h = h.amp.value_counts()
h = h[h > 1].index

return h


# In[6]:


l = get_multihabitat(data, 'high')
l0 = get_multihabitat(data, 'low')
print(f'Multi-habitat AMPs: {len(l0)}\nMulti-high-level-habitat: {len(l)}')


# In[7]:
print(f'Multi-habitat AMPs: {len(l0):,}\nMulti-high-level-habitat: {len(l):,}')


k = data[['amp', 'high']].drop_duplicates()['amp'].value_counts()
k = k[k>1].index

with open('multihabitat_highlevel.txt', 'wt') as out:
with open('outputs/multihabitat_highlevel.txt', 'wt') as out:
for i in k: out.write(f'{i}\n')

k = data[['amp', 'general_envo_name']].drop_duplicates()['amp'].value_counts()
k = k[k>1].index

with open('multihabitat_generalenvo.txt', 'wt') as out:
with open('outputs/multihabitat_generalenvo.txt', 'wt') as out:
for i in k: out.write(f'{i}\n')


# ### Permutation test
#
#
# We shuffle the high-level habitat annotation for the samples, and then, calculate the number of multi-habitat c_AMPs. This operation is repeated 100 times, and then we calculate the average and standard deviation of the distribution of random results. Using Shapiro-Wilk test, we check if the random distribution is normal, and if it is, we calculate the Z-score for the result obtained for AMPSphere. The Z-score is then converted into a p-value to support our conclusions.

# In[8]:


# testing significance
def shuffle_test(df, level=None):
if level == None: level = 'high'
elif level == 'low': level = 'general_envo_name'

habitat = df.set_index('sample')[level].to_dict()

values = [v for _, v in habitat.items()]
random.shuffle(values)

for k, v in zip(habitat, values):
habitat[k] = v

altdf = df.copy()
altdf[level] = altdf['sample'].map(lambda x: habitat.get(x))

return altdf


# In[9]:
return altdf


#test high level habitats
Expand All @@ -205,17 +95,8 @@ def shuffle_test(df, level=None):

print(test)


# In[10]:


_, p = shapiro(test)

if p < 0.05: res = 'not-normal'
else: res = 'normal'

print(f'The Shapiro-Wilks test returned a p = {p}')
print(f'This means that the distribution is {res}')
print(f'The Shapiro-Wilks test returned p = {p}')

avg, std = np.mean(test), np.std(test)

Expand All @@ -230,9 +111,6 @@ def shuffle_test(df, level=None):
print(f'This gives us a p-value of {pz}')


# In[17]:


#test habitats
test_low = []
for _ in tqdm(range(100)):
Expand All @@ -242,17 +120,9 @@ def shuffle_test(df, level=None):

print(test_low)


# In[18]:


_, p = shapiro(test_low)

if p < 0.05: res = 'not-normal'
else: res = 'normal'

print(f'The Shapiro-Wilks test returned a p = {p}')
print(f'This means that the distribution is {res}')

avg, std = np.mean(test_low), np.std(test_low)

Expand All @@ -262,7 +132,7 @@ def shuffle_test(df, level=None):

pz = norm.sf(abs(Z))

print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0)}')
print(f'The number of multi-habitat AMPs in AMPSphere was {len(l0):,}')
print(f'It was {Z} * std of the random distribution')
print(f'This gives us a p-value of {pz}')

Loading

0 comments on commit 5f5888d

Please sign in to comment.