-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcount_divergent_sites.py
123 lines (108 loc) · 4.89 KB
/
count_divergent_sites.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import os
from optparse import OptionParser
### help and usage ###
usage = "usage: %prog [options] args"
description = '''Function: calculate divergence for sites of each annotation category '''
version = '%prog 07.27.2021'
parser = OptionParser(usage=usage,version=version, description = description)
parser.add_option("--blk",
action="store",
dest = "blk",
help = "block id of the macthed region (interger from 1-)",
metavar = "ARG")
parser.add_option("--in_assembly_rep",
action="store",
dest = "in_assembly_rep",
help = "input path of the NCBI assembly report of the target species",
metavar = "PATH")
parser.add_option("--in_anno_seq",
action = "store",
dest = "in_anno_seq",
help = "input path of reannotation of the whole genome of the target species in fasta format (.fa)",
metavar = "PATH")
parser.add_option("--in_map",
action="store",
dest = "in_map",
help = "input path of the map file that includes 0-based coordinates aligned genomic blocks",
metavar = "PATH")
parser.add_option("--in_mfa_tgt",
action="store",
dest = "in_mfa_tgt",
help = "input path of multiple fasta alignment file of matched blocks in the target assembly (.mfa)",
metavar = "PATH")
parser.add_option("--in_mfa_ans",
action="store",
dest = "in_mfa_ans",
help = "input path of multiple fasta alignment file of matched blocks in the ancestral (outgroup) assembly (.mfa)",
metavar = "PATH")
parser.add_option("--temp",
action = "store",
dest = "temp",
help = "output directory of temporary intermediate files",
metavar = "PATH")
parser.add_option("--bedtools",
action = "store",
dest = "bedtools",
help = "path of bedtools",
metavar = 'PATH')
(options,args) = parser.parse_args()
### introduced variables ###
blk = options.blk
in_assembly_rep = options.in_assembly_rep
in_anno_seq = options.in_anno_seq
in_map = options.in_map
in_mfa_tgt = options.in_mfa_tgt
in_mfa_ans = options.in_mfa_ans
temp = options.temp
bedtools=options.bedtools
### in-script variables ###
ctg_name = {}
map_coord = {}
ans_sites = {}
### 1. Build a dictionary for contig names transferrings
with open(in_assembly_rep, 'r') as f:
for line in f:
if not line.startswith('#'):
line = line.strip().split('\t')
ctg_name[line[0]] = line[6]
### 2. Build a dictionary for coordinates of matched blocks
with open(in_map, 'r') as f:
for line in f:
line = line.strip().split('\t')
map_coord[line[0]] = [line[1], line[2], line[3], line[4]]
### 3. read the multiple fasta alignment file of this alignment,
### and map bases of the outgroup species onto the genome of the target species.
pos = int(map_coord[blk][1])
match_seq = ''
with open(in_mfa_tgt, 'r') as f_tgt, open(in_mfa_ans, 'r') as f_ans, open('{0}/{1}.bed'.format(temp, blk), 'w') as f_bed:
for line1, line2 in zip(f_tgt, f_ans):
if not line1.startswith('>'):
line1 = line1.strip()
line2 = line2.strip()
for base1, base2 in zip(line1, line2):
if base1 != '-':
pos += 1
if base2 != '-':
if base1 != 'N' and base2 != 'N':
f_bed.write('\t'.join([ctg_name[map_coord[blk][0]], str(pos - 1), str(pos), '.', '0', '+']) + '\n')
if base1 != base2:
match_seq += 's'
else:
match_seq += 'c'
### 4. Get the annotation of all matched sites
os.system('{0} getfasta -s -tab -fi {1} -bed {2}/{3}.bed -fo {2}/{3}.txt'.format(bedtools, in_anno_seq, temp, blk))
### 5. count the allele difference for each annotation category
anno_name = {'A': 'non-synonymous', 'W': '2-fold_synonymous', 'H': '3-fold_synonymous', 'S': '4-fold_synonymous', '3': "3'UTR", '5': "5'UTR", 'R': 'RNA-coding_regions', 'I': 'intron', 'G': 'intergenic'}
anno_divergence = {x: [0, 0] for x in anno_name}
with open('{0}/{1}.txt'.format(temp, blk), 'r') as f:
for stat, line in zip(match_seq, f):
anno = line.strip().split('\t')[1]
anno_divergence[anno][1] += 1
anno_divergence[anno][0] += stat.count('s')
ctgr_order = []
count_order = []
for ctgr in anno_divergence:
ctgr_order.append(anno_name[ctgr])
count_order += anno_divergence[ctgr]
print('\t'.join(ctgr_order))
print('\t'.join(list(str(x) for x in count_order)))