prep_gt_data_split.py
from optparse import OptionParser
import math
import sympy
### help and usage ###
usage = "usage: %prog [options] args"
description = '''Function: prepare the split input genotype data for BayeScEnv'''
version = '%prog 2.28.2022'
parser = OptionParser(usage=usage, version=version, description=description)
parser.add_option("--wd",
                  action="store",
                  dest="wd",
                  help="working directory",
                  metavar="PATH")
parser.add_option("--in_snp_dir",
                  action="store",
                  dest="in_snp_dir",
                  help="the directory of filtered SNPs from all contigs in vcf format (.vcf)",
                  metavar="PATH")
parser.add_option("--in_sample_size",
                  action="store",
                  dest="in_sample_size",
                  help="input path of allelic (auto/X) sample sizes of all samples analyzed (.txt)",
                  metavar="PATH")
parser.add_option("--in_chr_assign",
                  action="store",
                  dest="in_chr_assign",
                  help="input path of assignments of contigs to chromosomes (.txt)",
                  metavar="PATH")
parser.add_option("--in_sample_order",
                  action="store",
                  dest="in_sample_order",
                  help="input path of a file that contains the order of samples as environmental variables (.txt)",
                  metavar="PATH")
parser.add_option("--out_log",
                  action="store",
                  dest="out_log",
                  help="output path of the log file (.log)",
                  metavar="PATH")
(options, args) = parser.parse_args()
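# Example invocation (a sketch only; the paths and file names below are hypothetical
# placeholders, not files shipped with this repository):
#   python prep_gt_data_split.py \
#       --wd /path/to/workdir \
#       --in_snp_dir /path/to/snp_vcfs \
#       --in_sample_size sample_sizes.txt \
#       --in_chr_assign chr_assignments.txt \
#       --in_sample_order sample_order.txt \
#       --out_log prep_gt_data_split.log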
### introduced variables ###
wd = options.wd
in_snp_dir = options.in_snp_dir
in_sample_size = options.in_sample_size
in_chr_assign = options.in_chr_assign
in_sample_order = options.in_sample_order
out_log = options.out_log
### local variables ###
subsample_size = 10000
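# Note (inferred from step 4 below): SNPs passing the MAF filter are split into
# ceil(n_SNPs / subsample_size) interleaved subsets, each written to its own
# BayeScEnv genotype file, so no single output file holds more than subsample_size loci.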
### def functions ###
## function to calculate effective sample size as the expectation of unique chromosomal draws
def expectation_sympy(nr, nc):
    return float(sum(j*math.factorial(nc)*sympy.functions.combinatorial.numbers.stirling(nr, j)/(math.factorial(nc - j)*nc**nr) for j in range(1, nc + 1)))
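# The sum above is the expected number of distinct chromosomes hit when nr reads are
# drawn with replacement from a pool of nc chromosomes:
#   E = sum_{j=1..nc} j * S(nr, j) * nc! / ((nc - j)! * nc**nr),
# where S(nr, j) is a Stirling number of the second kind. This is equivalent to the
# closed form nc * (1 - (1 - 1/nc)**nr); e.g. expectation_sympy(2, 2) == 1.5.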
### 1. input chromosomal assignment information of contigs
chrs = ['X', 'auto']
chr_assign = {x: {} for x in chrs}
with open(in_chr_assign, 'r') as f:
    for line in f:
        if not line.startswith('contig'):
            line = line.strip().split('\t')
            contig = line[0]
            mapping = line[1]
            cor_assign = line[2]
            assign = cor_assign.split('-')[0]
            if mapping != 'na':
                if mapping not in chr_assign[assign]:
                    chr_assign[assign][mapping] = [contig]
                else:
                    chr_assign[assign][mapping].append(contig)
            else:
                if cor_assign == 'X-linked':
                    if 'X' not in chr_assign[assign]:
                        chr_assign[assign]['X'] = [contig]
                    else:
                        chr_assign[assign]['X'].append(contig)
                elif cor_assign == 'auto-linked':
                    if cor_assign not in chr_assign[assign]:
                        chr_assign[assign][cor_assign] = [contig]
                    else:
                        chr_assign[assign][cor_assign].append(contig)
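# Expected layout of in_chr_assign (inferred from the parsing above, not from a
# documented spec): a tab-separated table with a header line starting with 'contig'
# and columns contig / mapping (chromosome arm, or 'na') / cor_assign
# (e.g. 'X-linked' or 'auto-linked').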
### 2. extract haplotype size for each sample
sample_size = {}
with open(in_sample_size, 'r') as f:
    i = 0
    for line in f:
        line = line.strip().split('\t')
        if i == 0:
            samples = line
        elif i == 1:
            sample_size['auto'] = {x: int(y) for x, y in zip(samples, line)}
        elif i == 2:
            sample_size['X'] = {x: int(y) for x, y in zip(samples, line)}
        i += 1
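# Expected layout of in_sample_size (inferred from the parsing above): three
# tab-separated rows holding sample names, autosomal chromosome counts per sample,
# and X-linked chromosome counts per sample.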
### 3. extract the order of samples in the input of environmental variables
sample_order = []
with open(in_sample_order, 'r') as f:
    for line in f:
        if not line.startswith('Population'):
            line = line.strip().split('\t')
            sample_order.append(line[0])
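# Expected layout of in_sample_order (inferred from the parsing above): a tab-separated
# table with a header line starting with 'Population'; the first column gives the sample
# names in the same order as the BayeScEnv environmental-variable file.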
### 4. generate genotype file for BayeScEnv
expectation_dict = {}
for chr in chrs:
    dic_gt = {sample: {} for sample in sample_order}
    num_snp, num_snp_filter = 0, 0
    out_gt_pos = '{0}/{1}_pos_snp_gt_maf_5.txt'.format(wd, chr)
    for arm in chr_assign[chr]:
        for contig in chr_assign[chr][arm]:
            in_snp = '{0}/{1}_snp_clean.vcf'.format(in_snp_dir, contig)
            with open(in_snp, 'r') as f:
                for line in f:
                    if line.startswith('#CHROM'):
                        # extract sample names from the #CHROM header line
                        samples = line.strip().split(' ')[-1].split('\t')
                    elif not line.startswith('#'):
                        line = line.strip().split('\t')
                        num_snp += 1
                        pos = int(line[1])
                        ref = line[3].upper()
                        alt = line[4].upper()
                        alleles = [ref] + alt.split(',')
                        allele_freq = {x: 0 for x in alleles}
                        ct_sp = {}
                        # extract allele frequencies of all alleles across samples;
                        # per-sample fields are split on ':' (field 0 appears to be the genotype,
                        # field 1 the reference-allele count, field 2 the comma-separated
                        # alternate-allele counts, and field 3 the total read depth)
                        for i in range(len(line[9:])):
                            info = line[9:][i]
                            gt = list(int(x) for x in info.split(':')[0].split('/'))
                            ct = [int(info.split(':')[1])] + list(int(x) for x in info.split(':')[2].split(','))
                            dp = int(info.split(':')[3])
                            if len(set(gt)) == 1:
                                ct_sp[i] = {alleles[gt[0]]: max(ct)}
                            else:
                                ct_sp[i] = {alleles[x]: ct[gt.index(x)] for x in gt}
                            for ale in alleles:
                                if ale in ct_sp[i]:
                                    allele_freq[ale] += ct_sp[i][ale]/dp
                                else:
                                    ct_sp[i][ale] = 0
                        # sort alleles by the summed within-sample allele frequencies (so that each
                        # sample has the same weight regardless of sequencing depth) and keep the
                        # top two as the major/minor alleles
                        bialleles = sorted(allele_freq, key=lambda x: allele_freq[x], reverse=True)[:2]
                        # extract each sample's counts of the top two alleles, corrected by the
                        # effective sample size
                        cts_maj = []
                        cts_min = []
                        sum_min_ct = 0
                        sum_bi_ct = 0
                        for i in range(len(line[9:])):
                            dp_bi = sum(ct_sp[i][ale] for ale in bialleles if ale in ct_sp[i])
                            sum_bi_ct += dp_bi
                            if dp_bi != 0:
                                af_major = ct_sp[i][bialleles[0]]/dp_bi
                                af_minor = ct_sp[i][bialleles[1]]/dp_bi
                                sum_min_ct += ct_sp[i][bialleles[1]]
                                # only compute the effective sample size for depth/pool-size
                                # combinations that have not been seen before
                                key_sz = ','.join([str(dp_bi), str(sample_size[chr][samples[i]])])
                                if key_sz in expectation_dict:
                                    efs = expectation_dict[key_sz]
                                else:
                                    efs = expectation_sympy(dp_bi, sample_size[chr][samples[i]])
                                    expectation_dict[key_sz] = efs
                                ct_major = af_major*efs
                                ct_minor = af_minor*efs
                            else:
                                ct_major, ct_minor = 0, 0
                            cts_maj.append(ct_major)
                            cts_min.append(ct_minor)
                        # filter SNPs by across-sample MAF >= 5%
                        if sum_min_ct/sum_bi_ct >= 0.05:
                            num_snp_filter += 1
                            for sample in dic_gt:
                                dic_gt[sample][len(dic_gt[sample]) + 1] = [
                                    ':'.join([contig, str(pos)]),
                                    str(round(cts_maj[samples.index(sample)]) + round(cts_min[samples.index(sample)])),
                                    '2',
                                    str(round(cts_maj[samples.index(sample)])),
                                    str(round(cts_min[samples.index(sample)]))]
    ## subsample SNPs by a fixed interval
    with open(out_gt_pos, 'w') as f_pos, open(out_log, 'a') as f_log:
        # output the filtering statistics
        f_log.write('{0}: Number of SNPs = {1}\n'.format(chr, num_snp))
        f_log.write('{0}: Number of SNPs after MAF filtering = {1}\n'.format(chr, num_snp_filter))
        f_log.write('{0}: Proportion of SNPs filtered out = {1:.2%}\n'.format(chr, (num_snp - num_snp_filter)/num_snp))
        # the number of subsamples (which is also the subsampling stride) is the number of
        # retained SNPs divided by the target subsample size, rounded up
        interval = math.ceil(len(dic_gt[sample_order[0]])/subsample_size)
        subsamplesize_div = math.ceil(len(dic_gt[sample_order[0]])/interval)
        f_log.write('{0}: Number of subsamples = {1}\n'.format(chr, interval))
        # output subsampled SNPs
        for sub in range(1, interval + 1):
            out_gt = '{0}/{1}_{2}_snp_gt_maf_5.txt'.format(wd, chr, sub)
            with open(out_gt, 'w') as f:
                if sub + (subsamplesize_div - 1)*interval <= len(dic_gt[sample_order[0]]):
                    subsamplesize_actual = subsamplesize_div
                else:
                    subsamplesize_actual = subsamplesize_div - 1
                f.write('[loci]={}\n\n'.format(subsamplesize_actual))
                f.write('[populations]={}\n\n'.format(len(sample_order)))
                for pop in sample_order:
                    f.write('[pop]={}\n'.format(sample_order.index(pop) + 1))
                    for i in range(subsamplesize_actual):
                        f.write('\t'.join([str(i + 1)] + dic_gt[pop][sub + i*interval][1:]) + '\n')
                        if pop == sample_order[0]:  # avoid repeated output to the position file
                            f_pos.write('\t'.join([str(sub), str(i + 1), dic_gt[pop][sub + i*interval][0]]) + '\n')
                    f.write('\n')
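# Illustrative shape of each output genotype file (values below are hypothetical;
# the layout follows the write calls above), in the BayeScEnv/BayeScan style:
#   [loci]=<number of loci in this subsample>
#
#   [populations]=<number of samples>
#
#   [pop]=1
#   1   <total count>   2   <major-allele count>   <minor-allele count>
#   ...
# The companion *_pos_snp_gt_maf_5.txt file lists, for every locus, the subsample
# index, the within-subsample locus index, and contig:position.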