-
Notifications
You must be signed in to change notification settings - Fork 1
/
get_subgenome.py
123 lines (119 loc) · 4.9 KB
/
get_subgenome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#sys.argv[1] -> the species you want to get subgenome(such as BOL20)
#sys.argv[2] -> 3genomes_full_tandem_AKBr_v1.5.txt.filtered
#sys.argv[3] -> final.txt
#sys.argv[4] -> result file name
import linecache
import sys
import os
import re
def get_subgenome(filename1, filename2, filename3):
outfile = open(sys.argv[4], 'w')
final_list = []
fr3 = open(filename3, 'r')
lines3 = fr3.readlines()
for line3 in lines3:
final_list.append(line3.strip())
if filename1 in final_list:
coords_filename = 'final_coords/' + filename1 + '.gff.coords.final'
else:
coords_filename = 'coords/' + filename1 + '.gff.coords'
fr2 = open(filename2, 'r')
lines2 = fr2.readlines()
LF_str = ''
MF1_str = ''
MF2_str = ''
# Ath10_dict1 => AKBr and block information
Ath10_dict1 = {}
# Ath10_dict2 => the subgenome gene of every Ath10 gene
Ath10_dict2 = {}
for line2 in lines2:
list2 = line2.strip().split('\t')
Ath10_dict1[list2[1]] = list2[1 : 7]
Ath10_dict2[list2[1]] = ['-' , '-' , '-']
if list2[4] != '-':
LF_str += list2[4] + ';'
if list2[5] != '-':
MF1_str += list2[5] + ';'
if list2[6] != '-':
MF2_str += list2[6] + ';'
# print(Ath10_dict1)
Brapa15_fr1 = open('Brapa15.final/' + filename1 + '_to_Brapa15.SynOrths', 'r')
Brapa15_lines1 = Brapa15_fr1.readlines()
Brapa15_dict = {}
for Brapa15_line1 in Brapa15_lines1:
Brapa15_dict[Brapa15_line1.strip().split('\t')[0]] = Brapa15_line1.strip().split('\t')[4]
subgenome_index = 0
Ath10_fr1 = open('Ath10.final/' + filename1 + '_to_Ath10.SynOrths', 'r')
Ath10_lines1 = Ath10_fr1.readlines()
for Ath10_line1 in Ath10_lines1:
Ath10_list1 = Ath10_line1.strip().split('\t')
# start to judge
try:
# have syntenic gene with Brapa15, Brapa15_synorths => Brapa15 synorths gene with aim species
Brapa15_synorths = Brapa15_dict[Ath10_list1[0]]
if re.search(Brapa15_synorths, LF_str):
subgenome_index = 1
elif re.search(Brapa15_synorths, MF1_str):
subgenome_index = 2
elif re.search(Brapa15_synorths, MF2_str):
subgenome_index = 3
else:
row_num = int(os.popen("cat " + coords_filename + " | grep -n " + Ath10_list1[0] + " | awk -F: '{print $1}'").read())
except KeyError:
row_num = int(os.popen("cat " + coords_filename + " | grep -n " + Ath10_list1[0] + " | awk -F: '{print $1}'").read())
if subgenome_index == 0:
if row_num > 10:
for i in range(row_num , row_num - 10, -1):
candidate_gene = linecache.getline(coords_filename, i).strip().split('\t')[0]
try:
Brapa15_synorths = Brapa15_dict[candidate_gene]
if re.search(Brapa15_synorths, LF_str):
subgenome_index = 1
elif re.search(Brapa15_synorths, MF1_str):
subgenome_index = 2
elif re.search(Brapa15_synorths, MF2_str):
subgenome_index = 3
else:
subgenome_index = 0
except KeyError:
continue
else:
for i in range(row_num , row_num + 30):
candidate_gene = linecache.getline(coords_filename, i).strip().split('\t')[0]
try:
Brapa15_synorths = Brapa15_dict[candidate_gene]
if re.search(Brapa15_synorths, LF_str):
subgenome_index = 1
elif re.search(Brapa15_synorths, MF1_str):
subgenome_index = 2
elif re.search(Brapa15_synorths, MF2_str):
subgenome_index = 3
else:
subgenome_index = 0
except KeyError:
continue
if subgenome_index == 1:
try:
Ath10_dict2[Ath10_list1[4]][0] = Ath10_list1[0]
except KeyError:
pass
elif subgenome_index == 2:
try:
Ath10_dict2[Ath10_list1[4]][1] = Ath10_list1[0]
except KeyError:
pass
elif subgenome_index == 3:
try:
Ath10_dict2[Ath10_list1[4]][2] = Ath10_list1[0]
except KeyError:
pass
# print(Ath10_list1[4])
# print(subgenome_index)
for key1, value1 in Ath10_dict1.items():
outfile.write('\t'.join(value1[0 : 3]) + '\t' + '\t'.join(Ath10_dict2[key1]) + '\n')
outfile.close()
fr2.close()
fr3.close()
Brapa15_fr1.close()
Ath10_fr1.close()
get_subgenome(sys.argv[1], sys.argv[2], sys.argv[3])