# -*- coding: utf-8 -*-
"""
parsing.py - Module of DemInfHelper for parsing and processing VCF files and other data formats in the demography inference workflow.
This module provides functions for parsing and processing VCF files, calculating Site Frequency Spectra (SFS), Genotyping Quality (GQ) distributions,
and performing various data preprocessing tasks. It also includes functions for dynamic distance evaluation, SNP filtering, and more.
Functions:
- distrib_GQ(GQ_pop, line=None, pos_ind=None, bin_size=10):
Parse Genotyping Quality (GQ) information from a VCF file and calculate GQ distributions.
- parse_config(config_file, args=None):
Parse a configuration file and return the configuration parameters as a dictionary.
- update_config(config_dict, config_file):
Update a configuration file with values from a dictionary and preserve in-place comments.
- parse_sfs(sfs_file):
Parse a Site Frequency Spectrum (SFS) file and return a masked spectrum.
- get_contigs_lengths(vcf, length_cutoff=100000, only_names=False, contig_regex=None):
Parse contig lengths from a VCF file header and return a dictionary of contig names and lengths.
- dadi_output_parse(dadi_output_file):
Parse Dadi model output files and return the results as a list of values.
- pca_from_vcf(popid, vcf_file, nb_samples, out_dir, ploidy=2, keep_modified_vcf=False, modified_vcf_ram=False):
Perform Principal Component Analysis (PCA) on genetic data from a VCF file and generate PCA plots.
- vcf_line_parsing(PARAM, SFS=False, GQ=False, SMCPP=False):
Parse VCF lines, calculate SNP distances, and generate Site Frequency Spectra (SFS) and GQ distributions.
Each function in this module is designed to assist in parsing and processing genetic data and configuration files for the demography inference workflow.
"""
import gzip
import os
import numpy as np
if __package__ is None or __package__ == '':
import inferences
import sfs
import plots
else:
from . import inferences
from . import sfs
from . import plots
import re
from tqdm import tqdm # Import tqdm for the progress bar
def distrib_GQ(GQ_pop, line=None, pos_ind=None, bin_size=10):  # GQ_pop maps bin start -> count
"""
    Calculate the distribution of Genotyping Quality (GQ) values from a VCF line.
    This function parses per-sample GQ values from the FORMAT/sample columns of a VCF line,
    groups them into bins, and counts the number of GQ values in each bin.
Parameters:
- GQ_pop (dict): A dictionary to store the GQ distribution.
- line (list): A list representing a VCF line split into fields.
- pos_ind (list): List of positions for each sample in the VCF.
- bin_size (int): The size of bins for grouping GQ values.
Returns:
- GQ_pop (dict): Updated GQ distribution dictionary.
Raises:
- ValueError: If the GQ field is not found in the VCF FORMAT field.
"""
format_field = line[8].split(":") # Split the FORMAT field
if "GQ" not in format_field:
raise ValueError("GQ field not found in FORMAT field")
gq_index = format_field.index("GQ") # Find the position of GQ field in FORMAT
samples = [line[i] for i in pos_ind] # Extract sample-specific information
gq_values = [sample.split(":")[gq_index] for sample in samples] # Extract GQ values
for gq_value in gq_values:
gq_value = int(gq_value) # Convert GQ value to integer
        # Group GQ values into bins of specified size (e.g. bin_size=10: 0-9, 10-19, etc.)
bin_value = gq_value - (gq_value % bin_size)
if bin_value in GQ_pop:
GQ_pop[bin_value] += 1
else:
GQ_pop[bin_value] = 1
return GQ_pop
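# A minimal usage sketch for distrib_GQ (hypothetical, synthetic VCF line).
# With bin_size=10, GQ=37 falls into bin 30 and GQ=12 into bin 10:
#
#   fields = "chr1\t1234\t.\tA\tT\t50\tPASS\t.\tGT:GQ\t0/1:37\t1/1:12".split("\t")
#   gq_hist = distrib_GQ({}, line=fields, pos_ind=[9, 10], bin_size=10)
#   # gq_hist == {30: 1, 10: 1}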
def parse_config(config_file, args=None):
"""
Parse a configuration file and return the configuration parameters as a dictionary.
This function reads a configuration file line by line and extracts parameter names and values.
The resulting dictionary contains configuration parameters with their corresponding values.
Parameters:
- config_file (str): The path to the configuration file.
    - args (dict): The dictionary of command-line arguments (currently unused; see the commented block below).
Returns:
- param (dict): A dictionary containing configuration parameters and values.
"""
param = {}
with open(config_file, "rt") as config:
        line = config.readline()
        while line != "":
            stripped = line.strip()
            # skip comments, blank lines and lines without a "key: value" structure
            if stripped and not stripped.startswith("#") and ": " in stripped:
                key, value = stripped.split(": ", 1)
                param[key] = value.strip()
            line = config.readline()
    # bool("False") is True, so interpret the text value explicitly
    param["folded"] = param["folded"].lower() in ("true", "1", "yes")
#param["transformed"]=bool(param["transformed"])
param["name_pop"] = param["name_pop"].split(",")
param["npop"]=int(param["npop"])
    if "n_clust_kmeans" in param and param["n_clust_kmeans"] is not None:
        # the value may be an int or a Python list literal such as "[2, 3]"
        param["n_clust_kmeans"] = eval(param["n_clust_kmeans"])
if "cpus" in param:
param["cpus"]=int(param["cpus"])
else:
param["cpus"]=None
for p in param["name_pop"]:
if p in list(param.keys()):
param[p] = [item.strip() for item in param[p].split(",")]
else:
            # if the pop is not defined in the config, the full list
            # of samples from the VCF is used for every pop
param[p] = get_sample_names(vcf=param["vcf"])
param["n_"+p] = len(param[p])
# SETTING SOME DEFAULTS
    if "length_cutoff" not in param:
        # default minimum contig size to keep is 100 kb
        param["length_cutoff"] = 100000
else:
param["length_cutoff"] = int(param["length_cutoff"])
if "ref_genome" not in param:
param["ref_genome"] = None
# # if command line arguments are parsed
# if args:
# # Iterate over the arguments, as they override config file
# for arg in vars(args):
# value = getattr(args, arg)
# if value is not None:
# param[arg] = value
return param
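# A minimal usage sketch for parse_config (hypothetical file content; the keys
# shown are the ones this function actually reads). A config file is plain
# "key: value" lines, with "#"-prefixed comment lines ignored:
#
#   # config.txt
#   vcf: /path/to/data.vcf.gz
#   name_pop: pop1
#   npop: 1
#   pop1: sample1,sample2,sample3
#   folded: True
#   length_cutoff: 100000
#
#   params = parse_config("config.txt")
#   # params["n_pop1"] == 3 and params["folded"] is True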
def update_config(config_dict, config_file):
"""
Update a configuration file with values from a dictionary and preserve in-place comments.
This function takes a dictionary of configuration parameters and their values and updates
an existing configuration file. It preserves any comments in the file and adds new entries
at the end if necessary.
Parameters:
- config_dict (dict): A dictionary containing updated configuration values.
- config_file (str): The path to the configuration file to be updated.
Returns:
- None
Note: The function modifies the configuration file in place.
"""
# Read the existing config file
with open(config_file, 'r') as file:
lines = file.readlines()
initial_dict = {key.lower(): key for key in parse_config(config_file)}
# Create a mapping of lowercase keys to original keys
key_mapping = {key.lower(): key for key in config_dict}
# Update values in the lines based on config_dict or add new entries
for i in range(len(lines)):
line = lines[i].strip()
if line.startswith('#') or not line:
# Preserve in-place comment lines
continue
        if ':' not in line:
            # skip malformed lines with no "key: value" structure
            continue
        key, value = map(str.strip, line.split(':', 1))
# Use the original key if it's found in the config_dict
key_original = key_mapping.get(key.lower(), key)
if key_original in config_dict:
# If the key is found in the config_dict, update the value
updated_value = config_dict[key_original]
# Check if the value is a list and format it accordingly
if isinstance(updated_value, list):
updated_value_str = ', '.join(map(str, updated_value))
else:
updated_value_str = updated_value
lines[i] = f"{key_original}: {updated_value_str}\n"
# Add new entries from config_dict at the end of the file if they are not already present
for key, value in config_dict.items():
# Check if the key is not already present in the file
if key.lower() not in initial_dict.keys():
lines.append(f"{key}: {value}\n")
# Write the updated lines back to the config file
with open(config_file, 'w') as output:
for line in lines:
output.write(line)
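# A minimal usage sketch for update_config (hypothetical values): overwrite an
# existing key and append a new one, leaving comments and other lines intact:
#
#   update_config({"length_cutoff": 500000, "ref_genome": "ref.fa"}, "config.txt")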
def parse_sfs(sfs_file):
"""
Parse a Site Frequency Spectrum (SFS) file and return a masked spectrum.
This function reads an SFS file, extracts the spectrum data, and applies a mask to it.
The mask excludes specific bins from the spectrum, resulting in a masked SFS.
Parameters:
- sfs_file (str): The path to the SFS file to be parsed, in dadi's .fs format.
Returns:
- masked_spectrum (list): A masked SFS as a list of integers.
Raises:
- FileNotFoundError: If the specified SFS file is not found.
- ValueError: If there are inconsistencies in the file format or data.
Note: The actual structure of the SFS file is based on dadi's fs format.
"""
    masked_spectrum = None  # stays None if parsing fails
    try:
with open(sfs_file, 'r') as file:
# Read the first line which contains information about the file
num_individuals, mode, species_name = file.readline().strip().split()
num_individuals = int(num_individuals)
# Read the spectrum data
spectrum_data = list(map(int, file.readline().strip().split()))
# Check if the number of bins in the spectrum matches the expected number
if len(spectrum_data) != num_individuals:
print("Len SFS=", len(spectrum_data), "Nb indiv. = ", num_individuals)
raise ValueError("Error: Number of bins in the spectrum doesn't match the expected number of individuals.")
# Read the mask data
mask_data = list(map(int, file.readline().strip().split()))
# Check if the size of the mask matches the number of bins in the spectrum
if len(mask_data) != num_individuals:
raise ValueError("Error: Size of the mask doesn't match the number of bins in the spectrum.")
# Apply the mask to the spectrum
masked_spectrum = [spectrum_data[i] for i in range(num_individuals) if not mask_data[i]]
# Error handling
except FileNotFoundError:
print(f"Error: File not found - {sfs_file}")
except ValueError as ve:
print(f"Error: {ve}")
except Exception as e:
print(f"Error: {e}")
# final return of SFS as a list
return masked_spectrum
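# A minimal usage sketch for parse_sfs. A dadi-style .fs file has three lines:
# a header ("<nbins> <folded/unfolded> <name>"), the spectrum, and a 0/1 mask
# (hypothetical numbers below):
#
#   5 folded "pop1"
#   0 120 45 12 0
#   1 0 0 0 1
#
#   sfs_values = parse_sfs("pop1.fs")
#   # sfs_values == [120, 45, 12]  (bins where the mask is 1 are dropped)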
def get_contigs_lengths(vcf, length_cutoff=100000, only_names=False, contig_regex=None):
"""
Parse contig lengths from a VCF file header and return a dictionary of contig names and lengths.
This function reads the header of a VCF file and extracts contig information, including contig names
and their corresponding lengths. It returns a dictionary with contig names as keys and lengths as values.
Parameters:
- vcf (str): The path to the VCF file.
- length_cutoff (int): The minimum contig length to be included.
    - only_names (bool): If True, return a list of contig names only.
    - contig_regex (str): Optional regular expression; if given, keep only contigs whose name matches.
Returns:
- contigs (dict or list): A dictionary of contig names and lengths or a list of contig names.
Note: The function can return a dictionary of contig names and lengths or a list of contig names.
"""
contigs = {}
print(f"Parsing {vcf} to get contigs sizes.")
# Parsing VCF in gzip format
with gzip.open(vcf, 'rt') as vcf_stream:
line = vcf_stream.readline()
while line != "":
            if line[0:8] == "##contig":
                # header form: ##contig=<ID=name,length=12345>
                contig_length = int(re.split('[=,]', line)[-1][:-2])
                contig_name = re.split('[=,]', line)[2]
# keep only contigs that are longer than the length_cutoff parameter
if contig_length >= length_cutoff:
contigs[contig_name] = contig_length
elif line.startswith("#"):
line = vcf_stream.readline()
continue
else:
break
line = vcf_stream.readline()
    # If there is no ##contig line in the header, estimate contig lengths by
    # parsing the whole file and keeping the last position seen per contig.
    if len(contigs) == 0:
        print("Warning: No ##contig info in VCF header. Need to parse the whole file...")
        pbar = tqdm(total=0, dynamic_ncols=True, unit='line', unit_scale=True)  # Initialize the progress bar
with gzip.open(vcf, 'rt') as vcf_stream:
line = vcf_stream.readline()
while line != "":
if line.startswith("#"):
line = vcf_stream.readline()
continue
else:
                    line = line.split()
                    pos = int(line[1])
                    chrm = line[0]
                    # keep the position of the last variant seen as the contig length
                    contigs[chrm] = pos
# at the end, change line
pbar.update(1)
line = vcf_stream.readline()
        pbar.close()
        # apply the same minimum-length filter as the header-based path
        contigs = {name: size for name, size in contigs.items() if size >= length_cutoff}
    print("Finished getting contig sizes.")
if contig_regex:
contig_regex = re.compile(contig_regex)
print(f"Keeping only contigs with regex: {contig_regex}")
contigs = {contig_name: contig_size for contig_name, contig_size in contigs.items() if re.match(contig_regex, contig_name)}
print(f"Contigs={contigs}")
if only_names:
return list(contigs.keys())
return contigs
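# A minimal usage sketch for get_contigs_lengths (hypothetical path and sizes):
# keep contigs of at least 1 Mb whose names look like chromosomes:
#
#   contigs = get_contigs_lengths("data.vcf.gz", length_cutoff=1_000_000,
#                                 contig_regex=r"chr\d+")
#   # e.g. {"chr1": 24895422, "chr2": 18433204}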
def get_sample_names(vcf):
    """Get the list of sample names from the #CHROM header line of the VCF.
    """
    print(f"Parsing {vcf} to get samples list.")
    samples = []
    # Parsing VCF in gzip format
    with gzip.open(vcf, 'rt') as vcf_stream:
        line = vcf_stream.readline()
        while line != "":
            if line[0:6] == "#CHROM":
                # sample names start at the 10th column of the header line
                samples = line.split()[9:]
                break
            line = vcf_stream.readline()
    return samples
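# A minimal usage sketch for get_sample_names (hypothetical path): sample names
# are taken from the 10th column onward of the #CHROM header line:
#
#   samples = get_sample_names("data.vcf.gz")
#   # e.g. ["sample1", "sample2", "sample3"]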
def dadi_output_parse(dadi_output_file):
"""
Parse Dadi model output files and return the results as a list of values.
This function parses output files generated by Dadi demographic modeling and extracts the results,
including log likelihoods, parameter values, and other relevant information.
Parameters:
- dadi_output_file (str): The path to the Dadi output file.
Returns:
- all_vals (list): A list of parsed values containing log likelihoods, parameter values, etc.
Note: The structure of Dadi output files can vary, and parsing may need to be adapted accordingly.
"""
all_vals = []
ite = 0
converged = False
with open(dadi_output_file) as dadi_output:
for line in dadi_output:
# Log(likelihood) nuB nuF TB TF misid theta
if line.startswith("#"):
# check if converged
if "Converged" in line:
converged = True
if converged and len(all_vals) > 0:
# all converged values have been parsed
# skip the top 100 results that are printed as the second series of values
return all_vals
# then skip if comments
continue
ite += 1
line_parsed = line.strip().split()
logL = float(line_parsed[0])
nuB = float(line_parsed[1])
nuF = float(line_parsed[2])
TB = float(line_parsed[3])
TF = float(line_parsed[4])
theta = float(line_parsed[5])
all_vals.append([ite, logL, [nuB, nuF, TB, TF], theta])
return all_vals
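# A minimal usage sketch for dadi_output_parse (hypothetical path). Each parsed
# entry is [iteration, logL, [nuB, nuF, TB, TF], theta], so the best run is the
# one with the highest log-likelihood:
#
#   runs = dadi_output_parse("pop1_dadi_output.txt")
#   best = max(runs, key=lambda r: r[1])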
def pca_from_vcf(popid, vcf_file, nb_samples, out_dir, ploidy = 2,
keep_modified_vcf = False, modified_vcf_ram = False):
"""
Perform Principal Component Analysis (PCA) on genetic data from a VCF file and generate PCA plots.
This function conducts PCA on genetic data from a VCF file, generates PCA plots, and saves the results
in the specified output directory.
Parameters:
- popid (str): Identifier for the population being analyzed.
- vcf_file (str): The path to the VCF file containing genetic data.
- nb_samples (int): The number of samples in the VCF file.
- out_dir (str): The directory where PCA results and plots will be saved.
- ploidy (int): The ploidy level for the genetic data (default is 2).
- keep_modified_vcf (bool): If True, keep the modified VCF file; otherwise, it will be deleted.
    - modified_vcf_ram (bool): If True, use RAM for the modified VCF file (currently unused).
Returns:
- None
"""
plink_out_dir = out_dir+"/plink/"
if not os.path.exists(plink_out_dir):
os.makedirs(plink_out_dir)
# need to use bcftools to add IDs to replace the "." with unique IDs for each variant
cmd1 = "".join(["bcftools annotate --set-id +'%CHROM:%POS' ", \
vcf_file, " -Oz -o ", \
plink_out_dir+popid+"_IDs.vcf.gz"])
cmd2 = "".join(["plink2 --vcf ", plink_out_dir+popid+"_IDs.vcf.gz", \
" --make-bed --allow-extra-chr --max-alleles ", str(ploidy), \
" --snps-only --out ", plink_out_dir+popid, " --freq"])
print(cmd1)
os.system(cmd1)
print(cmd2)
os.system(cmd2)
    if not keep_modified_vcf:
# remove temporary file
os.remove(plink_out_dir+popid+"_IDs.vcf.gz")
cmd3 = "".join(["plink2 --bfile ", plink_out_dir+popid, \
" --pca ", str(nb_samples-1), \
" --out ", plink_out_dir+popid+".pca --allow-extra-chr --read-freq ", \
plink_out_dir+popid+".afreq"])
print(cmd3)
os.system(cmd3)
# Generate plot
plots.plot_pca(plink_out_dir+popid+".pca.eigenvec", plink_out_dir+popid+".pca.eigenval", popid = popid, out_dir = out_dir)
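# A minimal usage sketch for pca_from_vcf (hypothetical paths). bcftools and
# plink2 must be available on the PATH, since the commands run via os.system:
#
#   pca_from_vcf(popid="pop1", vcf_file="data.vcf.gz", nb_samples=12,
#                out_dir="results")
#   # writes plink files under results/plink/ and a PCA plot via plots.plot_pca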
# Function using dynamic distance evaluation with rolling positions
def vcf_line_parsing(PARAM, SFS = False, GQ = False, SMCPP = False):
"""
Parse VCF lines, calculate SNP distances, and generate Site Frequency Spectra (SFS) and GQ distributions.
This function parses VCF lines, calculates SNP distances between samples, and can generate Site Frequency Spectra (SFS)
and Genotyping Quality (GQ) distributions depending on the specified parameters.
Parameters:
- PARAM (dict): A dictionary containing configuration parameters.
- SFS (bool): If True, generate Site Frequency Spectra (SFS).
- GQ (bool): If True, generate Genotyping Quality (GQ) distributions.
- SMCPP (bool): If True, perform specific tasks for SMC++ input.
    Returns:
    - SFS_dict (dict): The SFS computed for each population.
    - GQ_dict (dict): The GQ distribution for each population (empty if GQ is False).
    - L (int): Nseq rescaled by the fraction of SNPs that were filtered out.
    - snps_distance_by_chr (dict): Per-contig mapping of kept SNP positions to their distances.
Example:
>>> params = {'folded': True, 'name_pop': ['pop1', 'pop2'], ...}
>>> SFS = True
>>> GQ = True
>>> vcf_line_parsing(params, SFS, GQ)
Note: The function performs various tasks based on the specified parameters.
"""
    # cutoff is the minimum size of each contig to be used
    # (required for SMC++, which needs contigs longer than ~100 kb to 1 Mb)
length_cutoff = int(PARAM["length_cutoff"])
contigs = get_contigs_lengths(vcf=PARAM["vcf"], length_cutoff=length_cutoff)
# total size of all contigs used
Nseq = sum(list(contigs.values()))
    tab_size = 4  # size of the rolling window of SNP positions
    percentile = 90  # distance percentile used as the SNP filtering threshold
SFS_dict = {}
GQ_dict = {}
cols_in_vcf = {}
current_chrm = None
All_snp_count = 0
Kept_snp_count = 0
snp_dist_list = []
# we initialize a sfs for each population
for p in PARAM["name_pop"]:
SFS_dict[p] = np.array(sfs.build_sfs(n=PARAM["n_"+str(p)], folded=PARAM["folded"], sfs_ini=True))
if GQ:
for p in PARAM["name_pop"]:
GQ_dict[p] = {}
pbar = tqdm(total=0, dynamic_ncols=True, unit='line', unit_scale=True) # Initialize the progress bar
# First parse
print("First parsing the VCF to determine the distribution of distances between variants.")
with gzip.open(PARAM["vcf"], mode='rt') as vcf:
line = vcf.readline()
# we read all the lines of the vcf
while line != "":
if line[0:6] == "#CHROM":
# parsing the header line
for p in PARAM["name_pop"]:
pos_ind_pop = []
for ind in PARAM[p]:
pos_ind_pop.append(line[:-1].split("\t").index(ind))
cols_in_vcf["pos_"+p] = pos_ind_pop
# skip the rest of the code for this line
line = vcf.readline()
continue
if line.startswith("#"):
# ignore other comments
line = vcf.readline()
continue
chrm = line.split()[0]
pos = int(line.split()[1])
if chrm != current_chrm:
# new chromosome/contig initialization
current_chrm = chrm
# reset the tab, otherwise distances are going to be false
tab = [1] * tab_size
snp_nb = 0
            if line[0] != "#" and ".:" not in line and "/." not in line and "./" not in line and ".|" not in line and "|." not in line and "," not in line.split("\t")[4]: # keep only bi-allelic sites with no missing genotypes
snp_nb += 1
split_line = line.split("\t")
for p in PARAM["name_pop"]:
# do something with this snp
                    tab[snp_nb % tab_size] = pos
                    if snp_nb > tab_size:
                        # distance between the current SNP and the one stored
                        # (tab_size - 1) SNPs earlier, i.e. the oldest slot of the ring buffer
                        snp_dist = tab[snp_nb % tab_size] - tab[(snp_nb+1) % tab_size]
                        snp_dist_list.append(snp_dist)
line = vcf.readline()
pbar.update(1)
pbar.close() # Close the progress bar when done
keeping_threshold = np.percentile(snp_dist_list, percentile)
    print(f"SFS parsing: Done. Filtering variants, keeping only those with a distance lower than {keeping_threshold} ({percentile}th percentile).")
snps_distance_by_chr = {}
pbar = tqdm(total=0, dynamic_ncols=True, unit='line', unit_scale=True) # Initialize the progress bar
with gzip.open(PARAM["vcf"], mode='rt') as vcf:
line = vcf.readline()
# we read all the lines of the vcf
while line != "":
if line[0:6] == "#CHROM":
# parsing the header line
for p in PARAM["name_pop"]:
pos_ind_pop = []
for ind in PARAM[p]:
pos_ind_pop.append(line[:-1].split("\t").index(ind))
cols_in_vcf["pos_"+p] = pos_ind_pop
# skip the rest of the code for this line
line = vcf.readline()
continue
if line.startswith("#"):
# ignore other comments
line = vcf.readline()
continue
# the line is a variant, count it
All_snp_count += 1
chrm = line.split()[0]
pos = int(line.split()[1])
if chrm != current_chrm:
# new chromosome/contig initialization
current_chrm = chrm
snps_distance_by_chr[chrm] = {}
# reset the tab, otherwise distances are going to be false
tab = [1] * tab_size
snp_nb = 0
            if line[0] != "#" and ".:" not in line and "/." not in line and "./" not in line and ".|" not in line and "|." not in line and "," not in line.split("\t")[4]: # keep only bi-allelic sites with no missing genotypes
snp_nb += 1
split_line = line.split("\t")
for p in PARAM["name_pop"]:
# do something with this snp
tab[snp_nb % tab_size] = pos
if snp_nb > tab_size:
snp_dist = tab[snp_nb % tab_size] - tab[(snp_nb+1) % tab_size]
if snp_dist < keeping_threshold:
Kept_snp_count += 1
snps_distance_by_chr[chrm][pos] = snp_dist
SFS_dict[p] = sfs.build_sfs(n=PARAM["n_"+p], folded=PARAM["folded"], sfs_ini = False, \
line = split_line, sfs = SFS_dict[p], pos_ind = cols_in_vcf["pos_"+p])
if GQ:
for p in PARAM["name_pop"]:
GQ_dict[p] = distrib_GQ(GQ_pop = GQ_dict[p], line = split_line, pos_ind = cols_in_vcf["pos_"+p])
line = vcf.readline()
pbar.update(1)
pbar.close() # Close the progress bar when done
    # rescale the total contig length by the fraction of SNPs that were filtered out
    L = (All_snp_count - Kept_snp_count) / All_snp_count * Nseq
print("Finished building SFS.")
return SFS_dict, GQ_dict, round(L), snps_distance_by_chr
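# A minimal usage sketch for vcf_line_parsing (hypothetical config file): build
# the per-population SFS and GQ histograms in one call; L is the sequence
# length estimate used downstream by the inference tools:
#
#   params = parse_config("config.txt")
#   sfs_dict, gq_dict, L, dists = vcf_line_parsing(params, SFS=True, GQ=True)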