-
Notifications
You must be signed in to change notification settings - Fork 0
/
validationP.py
105 lines (94 loc) · 3.18 KB
/
validationP.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#! /projects/team1/group2_geneprediction/python3/bin/python3.7
"""
Created on Sat Mar 2 21:14:21 2019
@author: yjk12
"""
import os
import multiprocessing
import argparse
import subprocess
import re
parser = argparse.ArgumentParser(description='blast')
parser.add_argument('-s',required=True, help = 'input sequences')
parser.add_argument('-b',required=True, help = 'blast_results')
parser.add_argument('-g',default='still', help = 'gff_files')
parser.add_argument('-o',required=True, help = 'output_folder')
args=parser.parse_args()
###blast result is sorted by e-value
path1=args.s
path2=args.b
#path3=args.g
statistic = multiprocessing.Manager().dict()
gms=sorted(os.listdir (path1))
blast=sorted(os.listdir(path2))
if args.g!='still':
gfffile=sorted(os.listdir(path3))
def modify(aa,bl,gf,ST): # aa--protein, bl--blast_result, gf--gff
###origin predicted protein sequences
pathA=os.path.join(path1,aa)
pathB=os.path.join(path2,bl)
#pathC=os.path.join(path3,gf)
file1=open(pathA,'r')
faa=file1.read()
faa=faa.split('>')[1:]
###blast result
file2=open(pathB,'r')
blastp=file2.read()
blastp=blastp.split('# BLASTP 2.8.1+\n# Query: ')[1:]
if len(faa)!=len(blastp):
print(len(faa)-len(blastp))
exit('no')
newlist=[] # new list that abandaned false positive/novel genes
newgene=[] # new list for abandaned genes
for i in range(len(faa)):
if re.search('# 0 hits found',blastp[i])==None:
newlist.append(i)
else:
newgene.append(i)
'''k=0
for i in range(len(faa)):
a=int(faa[i].split(' ')[0])
for j in range(k,len(blastp)):
b=int(blastp[j].split(' ')[0])
if a==b:
newlist.append(i)
k=j+1
break
elif a<b:
newgene.append(i)
break'''
### writing known genes
newprotein=open(args.o+'knownprotein/'+aa,'w')
for i in newlist:
newprotein.write('>'+faa[i])
newprotein.close()
### writing novel gene/false positive
novelgene=open(args.o+'novelgene/'+aa[0:-4]+'novel.faa','w')
for i in newgene:
novelgene.write('>'+faa[i])
novelgene.close()
#statistic.append(aa[0:-4]+':'+str(len(newlist)/(len(newlist)+len(newgene))))
statistic[aa[0:-4]]=len(newlist)/(len(newlist)+len(newgene))
###gff file
'''file3=open(pathC,'r+')
gff=file3.read()
gff=gff.split('##sequence-region') '''
if __name__=='__main__':
subprocess.call(['mkdir '+args.o+'knownprotein'],shell=True)
subprocess.call(['mkdir '+args.o+'novelgene'],shell=True)
for i in range(0,len(gms),4):
tlist=[]
for j in range(4):
if i+j > len(gms)-1:
break
else:
t=multiprocessing.Process(target=modify,args=(gms[i+j],blast[i+j],0,statistic)) #gfffile[i+j]
t.start()
tlist.append(t)
for k in tlist:
k.join()
sta=open(args.o+'/statistics.csv','w')
sta.write('Sample'+','+'Positive rate'+'\n')
for i,j in statistic.items():
sta.write(i+','+str(j)+'\n')
sta.close()