-
Notifications
You must be signed in to change notification settings - Fork 3
/
phylogeny.py
289 lines (273 loc) · 15 KB
/
phylogeny.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
#! /usr/bin/python
import re,os,random,os.path,time
import pandas as pd
#from gff2longest_CDS import GFF2LONGEST_CDS as seq
from fasta import fa_dic
from orthologs_groups import sonic_group_df,ORTHOMCL,busco_group_df
from multiprocessing import Pool
class PHYLOGENY(object):
def __init__(self,orthocluster_out,species_number,max_gene_number,software_name,raw_database,output_dir,TSCG,PSCG,LCG,OutGroup,DataType,min_PSCG_SPnumber,trim):
self.orthocluster_out = orthocluster_out #同源基因聚类输出
self.species_number = species_number # 总共的物种数目
self.max_gene_number = max_gene_number # 每个family的最大基因个数
self.software_name = software_name # 同源聚类的方法
self.raw_database = raw_database # 原始clean序列的目录,用于提取基因序列
self.output_dir = output_dir # 输出目录
self.TSCG = TSCG # 用于提取全部单拷贝基因的物种列表,eg['Atr','Ath']
self.PSCG = PSCG # 用于提取部分单拷贝基因的物种列表
self.LCG = LCG # 用于提取低拷贝基因的物种列表
self.InterGroup = TSCG + PSCG + LCG
self.OutGroup = OutGroup # 外类群列表
self.DataType = DataType # 建树数据类型
self.trim = trim # 用于pxclsq 中修建的比例和trimal
self.min_PSCG_SPnumber = min_PSCG_SPnumber # 最少内类群物种数目(最大为总共的物种数目-1)
if self.DataType == 'CDS' or self.DataType == 'Codon12': # iqtree 中数据类型参数
self.st = 'DNA'
elif self.DataType == 'pep':
self.st = 'AA'
if self.max_gene_number == 1: # 单拷贝还是多拷贝
self.Type = 'single'
else:
self.Type = 'low'
def core_orthogroups(self): # 从orthologs_groups模块中导入标准化后的基因家族聚类数据框,返回过滤后的数据框
if self.software_name == 'orthomcl':
all_group_df = ORTHOMCL().all_group_df(self.orthocluster_out)
elif self.software_name == 'sonicparanoid': # return a DataFrame of single copy gene.
all_group_df = sonic_group_df(self.orthocluster_out,self.species_number)
elif self.software_name == 'busco':
all_group_df = busco_group_df(self.orthocluster_out)
all_group_df = all_group_df[self.OutGroup + self.InterGroup]
d = {}
for groupID,row in all_group_df.iterrows():
groupID = 'group_' + str(groupID)
outgroup = []
for q in self.OutGroup:
if len(row[q]) != 1: #将超过1个同源基因的外类群定位'*'
row[q] = '*'
elif row[q] != '*': #即长度不等于1又不为'*'的,表示只含有1个同源基因,遂添加到列表,不符合的外类群列表为空
outgroup.append(q)
if len(outgroup) > 1: #如果外类群个数超过1,则只选用第一个物种的基因作为外类群,目前只支持两个外类群物种
row[outgroup[1]] = '*'
TSCG = []
PSCG = []
LCG = []
for i in self.TSCG:
if row[i] != '*' and len(row[i]) == 1:
TSCG.append(i)
else:
row[i] = '*'
for i in self.PSCG:
if row[i] != '*' and len(row[i]) == 1:
PSCG.append(i)
else:
row[i] = '*'
for i in self.LCG:
if row[i] != '*' and len(row[i]) <= self.max_gene_number:
LCG.append(i)
else:
row[i] = '*'
if len(TSCG) == len(self.TSCG) and outgroup: #严格单拷贝:所有TSCG物种都是单拷贝
if len(PSCG) >= self.min_PSCG_SPnumber: #部分单拷贝:该部分物种有一定比例的单拷贝即可,self.min_PSCG_SPnumber等于内类群的总个数,则所有物种都是单拷贝(严格单拷贝基因集)
if len(LCG) == len(self.LCG):
d[groupID] = row
core = pd.DataFrame(d).T #.T为横纵坐标转置
for abb,row in core.iteritems():
print(abb+'\t'+str(len([m for m in row if m != '*'])/len(row)),len(row))
print('output the groups csv')
core.to_csv(self.software_name+'_'+self.Type+'.csv',sep='\t')
return core # 返回筛选后的数据框
def extract_tree(self):
#core_orthogroups = self.core_orthogroups()
df = pd.read_csv('/data/01/user119/project/jsl/04_phylogeny/orthomcl/orthomcl_single.csv',sep='\t',index_col=0)
for groupID in df.index.values:
tree_path = os.path.join(self.output_dir,'align',groupID,groupID+'_'+self.DataType+'.aln-trimal.treefile')
if os.path.exists(tree_path):
print(tree_path)
def extraction_singlecopy_cds(self,input_cds_fa,abb):
fa_seq = fa_dic(input_cds_fa)
singlecopy_geneID = self.singlecopy_geneID(orthomcl_singlecopy)
pass
def build_database_1(self): # 生成每个基因的CDS与pep文件
core_orthogroups = self.core_orthogroups()
if not os.path.exists(self.output_dir+'/align'):
os.mkdir(self.output_dir+'/align')
for groupID,row in core_orthogroups.iterrows():
print(groupID)
path = os.path.join(self.output_dir,'align',groupID)
CDS_path = os.path.join(self.output_dir,'align',groupID,groupID+'_CDS')
pep_path = os.path.join(self.output_dir,'align',groupID,groupID+'_pep')
if not os.path.exists(CDS_path) or not os.path.exists(pep_path): #如果没有生产pep文件,则新建
if not os.path.exists(path):
os.mkdir(path)
else:
continue # 如果有,则跳过
f_cds = open(CDS_path,'w')
f_pep = open(pep_path,'w')
for geneID_list in row:
if geneID_list == '*':continue
for index,geneID in enumerate(geneID_list):
spID = geneID[:3]
if spID in self.OutGroup:
spID = 'Out'
cds_seq = fa_dic(os.path.join(self.raw_database,geneID[:3]+'_cds.fa'))[geneID]
pep_seq = fa_dic(os.path.join(self.raw_database,geneID[:3]+'_pep.fa'))[geneID]
f_cds.write('>'+spID+'_'+str(index+1)+'\n'+cds_seq+'\n')
f_pep.write('>'+spID+'_'+str(index+1)+'\n'+pep_seq+'\n')
f_cds.close()
f_pep.close()
def build_database_multiprocess_1(self,row): # 生成每个基因的CDS与pep文件
# row 为 (index,series)
if not os.path.exists(self.output_dir+'/align'):
os.mkdir(self.output_dir+'/align')
groupID = row[0]
path = os.path.join(self.output_dir,'align',groupID)
CDS_path = os.path.join(self.output_dir,'align',groupID,groupID+'_CDS')
pep_path = os.path.join(self.output_dir,'align',groupID,groupID+'_pep')
if not os.path.exists(CDS_path) or not os.path.exists(pep_path): #如果没有生成pep文件,则新建
if not os.path.exists(path):
os.mkdir(path)
else:
exit(0) # 如果有,则跳过
f_cds = open(CDS_path,'w')
f_pep = open(pep_path,'w')
for geneID_list in row[1]:
if geneID_list == '*':continue
for index,geneID in enumerate(geneID_list):
spID = geneID[:3]
if spID in self.OutGroup:
spID = 'Out'
cds_seq = fa_dic(os.path.join(self.raw_database,geneID[:3]+'_cds.fa'))[geneID]
pep_seq = fa_dic(os.path.join(self.raw_database,geneID[:3]+'_pep.fa'))[geneID]
f_cds.write('>'+spID+'_'+str(index+1)+'\n'+cds_seq+'\n')
f_pep.write('>'+spID+'_'+str(index+1)+'\n'+pep_seq+'\n')
f_cds.close()
f_pep.close()
print(groupID)
def singleCOPY_align_2(self): # 单个基因序列比对
with open(os.path.join(self.output_dir,'align_%s_%s.sh' % (self.DataType,self.trim)),'w') as f:
for groupID in os.listdir(self.output_dir+'/align'):
path = os.path.join(self.output_dir+'/align',groupID)
if not os.path.exists(os.path.join(path,groupID+'_CDS.aln')) or not os.path.exists(os.path.join(path,groupID+'_pep.aln')):
#f.write('cd %s && mafft --auto --quiet %s_pep > %s_pep.aln && pal2nal.pl %s_pep.aln %s_CDS -output fasta > %s_CDS.aln && trimal -in %s_pep.aln -out %s_pep.aln-trimal -automated1 && trimal -in %s_CDS.aln -out %s_CDS.aln-trimal -automated1\n' % (path,groupID,groupID,groupID,groupID,groupID,groupID,groupID,groupID,groupID))
f.write('cd %s && mafft --auto --quiet %s_pep > %s_pep.aln && pal2nal.pl %s_pep.aln %s_CDS -output fasta > %s_CDS.aln && pxclsq -a -p 0.2 -s %s_pep.aln -o %s_pep.aln-0.2cln && pxclsq -a -p 0.2 -s %s_CDS.aln -o %s_CDS.aln-0.2cln\n' % (path,groupID,groupID,groupID,groupID,groupID,groupID,groupID,groupID,groupID))
def aligned_filter_3(self): # 过滤掉非保守位点
for groupID in os.listdir(self.output_dir+'/align'):
print(groupID)
aligned_fa = os.path.join(self.output_dir+'/align',groupID,groupID+'_%s_aligned' % self.DataType)
d = []
fa_seq = fa_dic(aligned_fa)
t = {m:list(n) for m,n in fa_seq.items()}
matrix = pd.DataFrame(t)
for index,row in matrix.iterrows():
if row.tolist().count('-') <= 3:
d.append(row)
if len(d) < 300:continue
f = open(aligned_fa + '_filtered','w')
matrix_filtered = pd.DataFrame(d)
for index,row in matrix_filtered.iteritems():
f.write('>'+index+'\n'+''.join(row.tolist())+'\n')
f.close()
def CDS2Codon12(self): # 根据CDS提取codon12
for groupID in os.listdir(self.output_dir+'/align'):
path = os.path.join(self.output_dir,'align',groupID)
if not os.path.exists(os.path.join(path,groupID+'_CDS.aln-0.2cln')):
continue
f_codon12 = open(os.path.join(path,groupID+'_Codon12.aln-0.2cln'),'w')
Fa_dic = fa_dic(os.path.join(path,groupID+'_CDS.aln-0.2cln'))
for geneID,seq in Fa_dic.items():
d = []
for n in range(0,len(seq),3):
d.append(seq[n:n+2])
f_codon12.write('>'+geneID+'\n'+''.join(d)+'\n')
f_codon12.close()
print(groupID)
def genetree_iqtree_4(self): # 生成构建基因树sh
f = open(os.path.join(self.output_dir,'tree_%s_%s.sh' % (self.DataType,self.trim)),'w')
for groupID in os.listdir(self.output_dir+'/align'):
path = os.path.join(self.output_dir,'align',groupID)
if os.path.exists('%s/%s.aln-%s.treefile' % (path,groupID+'_'+self.DataType,'0.2cln')):continue
if os.path.exists('%s/%s.aln-%s' % (path,groupID+'_'+self.DataType,"0.2cln")):
f.write('cd %s && iqtree -s %s.aln-%s -pre %s.aln-%s -st %s -nt AUTO -bb 1000 -redo -m MFP -quiet\n' % (path,groupID+'_'+self.DataType,self.trim,groupID+'_'+self.DataType,self.trim,self.st))
f.close()
def generate_concatenation_5(self): # 将序列连接成超级矩阵
core_orthogroups = self.core_orthogroups()
print(len(core_orthogroups))
d = {i[:3]:[] for i in self.InterGroup+['Out']}
for groupID,row in core_orthogroups.iterrows():
path = os.path.join(self.output_dir,'align',groupID,groupID+'_%s.aln-0.2cln' % (self.DataType))
if os.path.exists(path):
t = fa_dic(path)
spID_current = set([i[:3] for i in t.keys()])
for x in set(d)-spID_current:
d[x].append('-'*len(t[random.choice(list(t.keys()))]))
for m,n in t.items():
d[m[:3]].append(n)
with open(self.output_dir+'/%s_concatenation.fa.aln-0.2cln' % (self.DataType),'w') as f:
for i in d:
f.write('>'+i+'\n'+''.join(d[i])+'\n')
def generate_concatenation(suffix,aln_dir,out_name):
d = {}
for i in os.listdir(aln_dir):
if i.endswith('CDS.aln-0.2cln'):
fa = fa_dic(os.path.join(aln_dir,i))
d[i] = fa
df = pd.DataFrame(d).fillna('*')
current_len = 0
with open(out_name,'w') as f:
for abb,row in df.iterrows():
for groupID,s in row.items():
gene_length = len(list(fa_dic(os.path.join(aln_dir,groupID)).values())[0])
if s == '*':
gap = '-'*gene_length
row[groupID] = gap
print('DNA, '+groupID+'='+str(current_len+1)+'-'+str(current_len+gene_length))
current_len += gene_length
seq = ''.join(row.tolist())
f.write('>'+abb+'\n'+seq+'\n')
def write_seq_from_tree(self,output_dir,tree_dir,pep_file,CDS_file): # 提取树文件中基因的序列
pep_dic = fa_dic(pep_file)
CDS_dic = fa_dic(CDS_file)
if not os.path.exists(output_dir+'/align'):
os.mkdir(output_dir+'/align')
for i in os.listdir(tree_dir):
cluster_name = i.split('.')[0]
path = os.path.join(output_dir,'align',cluster_name)
if not os.path.exists(path):
os.mkdir(path)
print(cluster_name)
#f_list = open(cluster_name+'_gene_list','w')
treestr = open(os.path.join(tree_dir,i)).read()
all_gene = re.findall('(\w{3}@.*?):',treestr)
with open(os.path.join(path,cluster_name+'_pep'),'w') as f_pep, open(os.path.join(path,cluster_name+'_CDS'),'w') as f_CDS:
for geneID in all_gene:
f_pep.write('>'+geneID[:3]+'\n'+pep_dic[geneID]+'\n')
f_CDS.write('>'+geneID[:3]+'\n'+CDS_dic[geneID]+'\n')
#f_list.write(geneID+'\n')
#f_list.close()
def extract_seq_for_treeshrink(clusterID,outdir): #提取 treeshink 删除基因后的对齐序列
for clusterID,row in pd.read_csv(clusterID,sep='\t',header=None).set_index(0).fillna('*').iterrows():
print(clusterID)
cds_path = os.path.join('/data/01/user119/project/jsl/04_phylogeny/orthomcl/Gbi_Pab_single/align/%s/%s_CDS.aln-0.2cln' % (clusterID,clusterID))
fa = fa_dic(cds_path)
for i in row:
if i != '*':
del fa[i]
with open(os.path.join(outdir,clusterID + '_CDS_shrunk_0.2cln.fasta'), 'w') as f:
for geneID,seq in fa.items():
f.write('>'+geneID+'\n'+seq+'\n')
if __name__ == '__main__':
#raw_database = ''
#output_dir = ''
#orthomcl_out = ''
#output_dir = ''
#a = ORTHOMCL()
#d = PHYLOGENY(orthomcl_out,15,1,'orthomcl',raw_database,output_dir,'Efe',['Gbi','Pab'],'CDS',12)
#a.core_orthogroups(orthomcl_out,14,1)
#d.build_database_1()
#d.singleCOPY_align_2()
#d.aligned_filter_3()
#d.genetree_iqtree_4()
#d.generate_concatenation_5()
#d.CDS2Codon12()
#d.extraction_singlecopy_cds()
#generate_concatenation()