chunk_genome.py
"""Set of functions used to generate reads derived from a set of sequences (i.e. full genome FASTA file).
* You can specify a mutation rate, a VCF file in order to add known SNPs, or a mutation matrix obtained from snpAD, :title-reference:`Kay Prüfer, snpAD: an ancient DNA genotype caller, Bioinformatics, Volume 34, Issue 24, 15 December 2018, Pages 4165–4171, https://doi.org/10.1093/bioinformatics/bty507`.
* Default read length distribution is derived from empirical data, but you can provide your own or length will be uniformly sampled.
* Work is processed in parallel in order to decrease computation time.
"""
import argparse
import contextlib
import gzip
import multiprocessing
import random
import re
import sys
# for creating a partial function to feed pool.map
from functools import partial

import numpy as np
import pandas as pd
import pysam
from Bio import SeqIO, SeqRecord
from Bio.Seq import Seq


def get_sequence_with_substitution(sequence):
    """Generate a mutated sequence from the position-specific substitution
    probability matrix (module globals ``tbl`` and ``same_nuc``, set up in
    ``_main`` from the snpAD substitution file).

    Parameters
    ----------
    sequence : Bio.Seq.MutableSeq

    Returns
    -------
    Bio.Seq.Seq
    """
read_length = len(sequence)
    # one uniform draw per base
    choices = np.random.random(read_length)
    # profile index per base: 0-14 for the first 15 bases, 29 down to 15 for
    # the last 15 bases, and profile 30 for every interior base
    positions = np.concatenate([np.arange(15), np.arange(29, 14, -1)])
    positions = np.insert(positions, 15, np.full(read_length - 30, 30))
newSeq = list(sequence)
    # iterate over the sequence together with the draws and profile positions
    for idx, (nc, (choice, pos)) in enumerate(zip(sequence, zip(choices, positions))):
        # mutate only when the draw exceeds the probability of keeping the base
        if choice > same_nuc[pos][nc]:
            t = tbl[nc].loc[pos]
            try:
                # inverse-CDF sampling over the cumulative substitution probabilities
                newSeq[idx] = t[choice < t].idxmin()
            except ValueError:  # rounding: the cumulative sum can fall just short of 1.0
                newSeq[idx] = t.idxmax()
return Seq("".join(newSeq))
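
# Sketch of the profile layout used above, for a hypothetical 35 bp read
# (derived from the construction of the positions array):
#
#   base index : 0  1 ... 14 | 15 ... 19 | 20  21 ... 34
#   profile    : 0  1 ... 14 | 30 ... 30 | 29  28 ... 15
#
# i.e. the 15 bases at each read end get their own substitution profiles
# (damage is concentrated at read ends), while every interior base shares
# the generic profile 30.
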
def get_chrom(refseq):
    """Get the chromosome from a FASTA record.

    Parameters
    ----------
    refseq : Bio.SeqRecord.SeqRecord
        A FASTA record

    Returns
    -------
    str
        The chromosome extracted from the sequence id
    """
chrom = refseq.id # in case of scaffold genome
if "|" in chrom:
chrom = chrom.split("|")[0]
return chrom

# sort the list of records according to chromosome
def sort_recs(recs, chromosome_list=[], split_char="|"):
    """Sort a list of Bio.SeqRecord according to chromosome, position and length.

    Parameters
    ----------
    recs : list of Bio.SeqRecord
    chromosome_list : list of str
        A list specifying the order of the chromosomes (as python doesn't
        natural-sort lists by default)
    split_char : str
        The delimiter used for splitting the record.id
Returns
-------
list of Bio.SeqRecord
Records properly sorted
"""
def sort_func(item):
"""we sort according to chromosomes
our header is >16|kraken:taxid|9906|69694935_57 ...
use split to access the first element
last element is pos_length
"""
chrom = item.id.split(split_char)[0]
pos, l = item.id.split(split_char)[-1].split("_")
if chromosome_list:
return (chromosome_list.index(chrom), int(pos), int(l))
else:
return (int(pos), int(l))
return sorted(recs, key=sort_func)
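
# Example of the sort key, using the header format quoted in sort_func
# (>16|kraken:taxid|9906|69694935_57):
#
#   chrom = "16", pos = 69694935, length = 57
#   key   = (chromosome_list.index("16"), 69694935, 57)
#
# so records come out grouped by chromosome, then sorted by position and length.
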
def simulate_deamination(sequence, nbases=3):
    """Add C -> T substitutions to the 5' and 3' ends (ancient-DNA-like deamination).

    Parameters
    ----------
    sequence : Bio.Seq.MutableSeq
    nbases : int
        Number of bases to mutate at each end

    Returns
    -------
    Bio.Seq.Seq
        The deaminated sequence
    """
    while "C" in sequence[:nbases]:  # 5' C to T
        sequence[sequence.index("C")] = "T"
    while "C" in sequence[-nbases:]:  # 3' C to T
        sequence[-nbases + sequence[-nbases:].index("C")] = "T"
    return sequence.toseq()
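
# Worked example (hypothetical 8 bp read, default nbases=3):
#
#   MutableSeq("ACCGAGTC") -> Seq("ATTGAGTT")
#
# the two Cs within the first 3 bases and the single C within the last 3 bases
# are converted to T; interior bases are untouched.
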
def mutate_unif(sequence, unif):
    """Add mutations uniformly throughout the sequence.

    Parameters
    ----------
    sequence : Bio.Seq.Seq
    unif : float
        The mutation rate

    Returns
    -------
    Bio.Seq.Seq
        A new mutated sequence
    """
same_nuc = set("ACGT")
read_length = len(sequence)
    # a base is mutated only if its uniform draw falls below unif
    (choices,) = np.where(np.random.random(read_length) < unif)
    newSeq = list(sequence)
    for idx in choices:
        # pick uniformly among the three other nucleotides (convert to tuple
        # because random.choice needs an indexable sequence)
        newSeq[idx] = random.choice(tuple(same_nuc.difference(sequence[idx])))
return Seq("".join(newSeq))
# the idea is to generate a set of coordinates/size pairs and only take those substrings:
# sample 0:length_genome (N = #chunks desired)
# sample N sizes (N = chunks desired)
def chunk_fast(
record,
n_samples,
vcf_in=None,
chrom=None,
specie="|",
individual=0,
unif=False,
len_distrib=False,
deaminate=0,
minlength=35,
maxlength=100,
nthreads=4,
):
"""Split the sequence into chunks and add mutations (in parallel)
The function will generate a list of positions to extract reads from as well as a list of read length (following the given distribution), it will then create a list of reads which will be mutated according to the mutation rule given (unif, VCF, or matrix)
Parameters
----------
record: Bio.SeqRecord
n_samples: int
How many reads should be sampled
vcf_in: pysam.VariantFile, optional
File providing mutations informations (VCF or Substitution matrix)
chrom: int, optional
specie: str, optional
individual: int, optional
unif: float, optional
The mutaion rate, will be added on top of VCF/substitution Matrix
len_distrib: str, optional
If False, uniform distribution will be used
deaminate: int, optional
minlength: int, optional
maxlength: int, optional
nthreads: int, optional
Returns
-------
list of Bio.Seq.Seq
"""
    try:
        positions = random.sample(range(0, len(record) - minlength), n_samples)
    except ValueError:  # fewer positions than requested reads
        print(
            "sample too small ({}bp), sampling {} reads with replacement".format(
                len(record) - minlength, n_samples
            ),
            file=sys.stderr,
        )
        positions = [
            random.choice(range(0, len(record) - minlength)) for _ in range(n_samples)
        ]
    if len_distrib:  # a file with read counts per length was given
        p = read_len_distrib(len_distrib, minlength, maxlength)
        length = np.random.choice(np.arange(minlength, maxlength + 1), n_samples, p=p)
    else:
        # random.choices(range(minlength, maxlength + 1), k=n_samples) would
        # also work, but only from python 3.6 onwards
        length = np.random.choice(np.arange(minlength, maxlength + 1), n_samples)
samples = []
for pos, l in zip(positions, length):
rec = record[pos : pos + l]
        while not set(rec).issubset("ACGT"):  # control for unknown bases (N)
            # debug: print("Unknown base (N) in read at {} {}, resampling...".format(pos, l), file=sys.stderr)
            pos = random.choice(range(0, len(record) - minlength))
if len_distrib:
l = np.random.choice(np.arange(minlength, maxlength + 1), 1, p=p)[0]
else:
l = random.choice(range(minlength, maxlength + 1))
rec = record[pos : pos + l]
        # create a new header that includes the species, read position and read length
samples += [
SeqRecord.SeqRecord(
rec.seq,
id="{}{}{}_{}".format(rec.id, specie, pos, l),
description=" ".join(rec.description.split(" ")[1:]),
)
]
    res = []
    if vcf_in:  # insert variation from the VCF / substitution matrix
        if isinstance(vcf_in, pysam.VariantFile):  # we use a VCF
            for sample in samples:
                sequence = sample.seq.tomutable()
                l = len(sequence)
                # recover the read position from the header (rec.id + specie + pos_length)
                pos = int(sample.id.rsplit("|", 1)[-1].split("_")[0])
                for vcf_rec in vcf_in.fetch(chrom[0], start=pos, end=pos + l):
                    # insert the alt allele only if GT != 0/0; the first
                    # individual is used by default
                    if any(vcf_rec.samples[0]["GT"]):
                        if len(vcf_rec.alts[0]) == 1:  # only SNPs
                            sequence[vcf_rec.start - pos] = vcf_rec.alts[0]
                            # debug: print(record[vcf_rec.start - 3:vcf_rec.start + 4].seq,
                            #              vcf_rec.ref, vcf_rec.alts, vcf_rec.start, pos, l)
                if deaminate or unif:  # keep the seqs for deamination/uniform mutation below
                    res.append(sequence)
                else:
                    sample.seq = sequence.toseq()
        else:  # we use a substitution probability matrix
            # parallelize, as this is the bottleneck (every base may be mutated)
            with multiprocessing.Pool(processes=nthreads) as pool:
                res = pool.map(
                    get_sequence_with_substitution, [s.seq.tomutable() for s in samples]
                )
    elif deaminate or unif:
        # no VCF/matrix given: still collect the sequences so deamination and
        # uniform mutations can be applied on their own
        res = [s.seq.tomutable() for s in samples]
    if len(res):  # post-process the collected sequences in parallel
        with multiprocessing.Pool(processes=nthreads) as pool:
            if deaminate:
                deam_func = partial(simulate_deamination, nbases=deaminate)
                res = pool.map(deam_func, res)
            if unif:
                unif_func = partial(mutate_unif, unif=unif)
                res = pool.map(unif_func, res)
        for s, seq in zip(samples, res):
            s.seq = seq
    return samples
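
# Minimal usage sketch for chunk_fast (hypothetical file names; no VCF or
# matrix, 1% uniform mutations):
#
#   from Bio import SeqIO
#   record = next(SeqIO.parse("genome.fa", "fasta"))
#   reads = chunk_fast(record, n_samples=1000, unif=0.01)
#   SeqIO.write(reads, "reads.fa", "fasta")
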
def read_len_distrib(filename, minlen=35, maxlen=100):
    """Read a TSV file with read counts per length.

    Parameters
    ----------
    filename : str
        TSV file containing the read count per length (columns: count, length)
    minlen : int
        minimum length for the distribution
    maxlen : int
        maximum length for the distribution

    Returns
    -------
    list of float
        The computed proportion of each read length
    """
    from csv import reader

    with open(filename, "r", newline="") as csvfile:
        csvreader = reader(csvfile, delimiter="\t")
        # the first row gives the count for the smallest length; pad the
        # distribution with zeros for all lengths below it
        first_row = next(csvreader)
        read_distrib = [0] * int(first_row[1]) + [int(first_row[0])]
        read_distrib.extend([int(row[0]) for row in csvreader])
    if len(read_distrib) < maxlen:
        print(
            "Read length distribution provides values only until",
            len(read_distrib),
            "; expected:",
            maxlen,
            file=sys.stderr,
        )
    s = sum(read_distrib[minlen : maxlen + 1])
    return [r / s for r in read_distrib[minlen : maxlen + 1]]
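
# Worked example with a hypothetical 3-line TSV (columns: count, length):
#
#   120\t35
#   240\t36
#   360\t37
#
# first_row = ["120", "35"] gives read_distrib = [0]*35 + [120]; the remaining
# rows extend it with the counts for lengths 36 and 37. With minlen=35 and
# maxlen=37 the returned proportions are [120/720, 240/720, 360/720].
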
def estimate_read_distribution(file_in, num_seq, n_chromosomes=None):
    """Read a FASTA file and compute the number of sequences to extract per
    chromosome (proportional to chromosome length).

    Parameters
    ----------
    file_in : str
        FASTA file you want to estimate the distribution from.
    num_seq : int
        The total number of sequences you want to sample.
    n_chromosomes : int
        If you want to use only the first *n* chromosomes of your dataset.

    Returns
    -------
    list of int
        The number of sequences to extract per chromosome.
    """
try:
with pysam.FastaFile(file_in) as fa:
print("The file contains", fa.nreferences, "sequences", file=sys.stderr)
if num_seq:
if n_chromosomes:
print("Working only with the first", n_chromosomes, file=sys.stderr)
full_size = sum(fa.lengths[:n_chromosomes])
reads_per_chrom = [
int(s / full_size * num_seq) + 1
for s in fa.lengths[:n_chromosomes]
]
else:
n_chromosomes = len(fa.lengths)
full_size = sum(fa.lengths)
reads_per_chrom = [
int(s / full_size * num_seq) + 1 for s in fa.lengths
]
    except OSError:  # the FASTA is not indexed; fall back to naive sampling
        print("Error, fasta file not indexed, doing naive sampling...", file=sys.stderr)
        if n_chromosomes:
            reads_per_chrom = [int(num_seq / n_chromosomes) + 1] * n_chromosomes
        else:
            print(
                "Naive sampling needs the option --chromosomes... Aborting...",
                file=sys.stderr,
            )
            sys.exit(1)
    # this sampling tends to slightly oversample; readjust by removing reads
    # from random chromosomes
    extra_samples = sum(reads_per_chrom) - num_seq
    for idx in [random.randint(0, n_chromosomes - 1) for _ in range(extra_samples)]:
        # some scaffolds are so small that they don't even get a read to sample
        while reads_per_chrom[idx] == 0:
            idx = random.randint(0, n_chromosomes - 1)
        reads_per_chrom[idx] -= 1
return reads_per_chrom
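
# Worked example (hypothetical genome with two chromosomes of 200 Mb and
# 100 Mb, num_seq=300):
#
#   reads_per_chrom = [int(200/300 * 300) + 1, int(100/300 * 300) + 1] = [201, 101]
#
# the +1 rounding oversamples by two reads, so extra_samples = 302 - 300 = 2
# and two reads are removed from randomly chosen chromosomes.
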
# dummy context manager that simply yields its argument, so that plain
# (non-gzipped) file paths can be opened in the same `with` statement as gzip.open
@contextlib.contextmanager
def _ret_file(f):
    yield f

def read_mutation_matrix(file_in):
    """Create a table from a mutation matrix provided by snpAD.

    Returns
    -------
    tbl : pandas.DataFrame
        A table containing the mutation probability for each base at each
        position profile of a read.
    sn : list of dict
        For each position profile, the probability that each nucleotide stays
        unchanged (the from == to entry). Used to optimize the computation
        when we draw for a mutation.
    """
    t = pd.read_table(
        file_in, sep="\t", header=None, names=["profile", "from", "to", "prob"]
    )
    tbl = pd.pivot_table(
        t, values="prob", index=["profile", "to"], columns=["from"], aggfunc=np.sum
    )
    # sn contains, for each nc, the probability that it stays unchanged;
    # e.g. 'A': ~0.992 means ~0.8% chance of a mutation, and any sampled
    # number below this value results in keeping the nc. This lets us skip
    # the lookup table for the vast majority of bases.
    sn = [
        {nc: prob for nc, prob in zip(tbl.columns, [tbl[x][pos][x] for x in tbl])}
        for pos in range(31)
    ]
    return tbl, sn
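
# Sketch of the expected snpAD matrix layout (tab-separated, no header; the
# probability values here are made up):
#
#   profile  from  to  prob
#   0        A     A   0.992
#   0        A     C   0.001
#   0        A     G   0.005
#   0        A     T   0.002
#   ...
#
# profiles 0-14 describe the first 15 bases of a read, 15-29 the last 15
# (reversed) and 30 every interior base, matching the positions array built
# in get_sequence_with_substitution.
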
def _main():
    random.seed()
    # process the command line
    parser = argparse.ArgumentParser(description="Split a genome into chunks")
parser.add_argument(
"--num_seq", default=1, type=int, help="Number of sequences to output"
)
parser.add_argument("--outfile", help="Fasta file where to extract sequence")
parser.add_argument("--specie", help="Specie Taxa")
parser.add_argument("file_in", metavar="genome.fa[.gz], [bgzip] and indexed")
parser.add_argument(
"--chromosomes", type=int, help="How many chromosomes are expected", default=0
)
parser.add_argument(
"--minlen",
type=int,
help="Minimum read length (lengths are sampled uniformly)",
default=35,
)
parser.add_argument(
"--maxlen",
type=int,
help="Maximum read length (lengths are sampled uniformly)",
default=100,
)
parser.add_argument(
"--threads",
type=int,
default=multiprocessing.cpu_count(),
help="Specify number of threads to use",
)
# TODO, control for option dependency properly
    parser.add_argument(
        "--deaminate",
        type=int,
        help="How many bases should be deaminated at each end",
        default=0,
    )
    parser.add_argument(
        "--unif",
        type=float,
        help="Rate of uniformly distributed mutations to add",
        default=0.0,
    )
parser.add_argument(
"--individual",
type=int,
help="The individual used for variation in your VCF",
default=0,
)
parser.add_argument(
"--length",
help="2 columns (#reads, length) TSV file containing read length distribution",
default=False,
)
# either provide a VCF OR a substitution matrix, you probably don't want
# to deaminate if you choose the latter
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--vcf", help="VCF containing the variation to be included in the samples"
)
group.add_argument(
"--substitution_file",
type=str,
help="File containing the substitution probability matrix",
)
sorting = parser.add_mutually_exclusive_group()
sorting.add_argument(
"--sorted",
action="store_true",
help="Natural sort the reads by chromosome name, position and length",
)
sorting.add_argument(
"--shuffled", action="store_true", help="Output shuffled reads"
)
args = parser.parse_args()
print("Loading dataset...", end="", file=sys.stderr)
    if args.substitution_file:
        global tbl
        global same_nuc
        tbl, same_nuc = read_mutation_matrix(args.substitution_file)
        for nc in "ACGT":
            for pos in range(31):
                # cumulative sum over the values sorted in descending order,
                # assuming the highest probability is the nc itself (C->C, A->A, ...);
                # a single .loc avoids pandas chained-assignment pitfalls
                tbl.loc[pos, nc] = (
                    tbl.loc[pos, nc].sort_values(ascending=False).cumsum().sort_index()
                )
num_reads_to_sample = estimate_read_distribution(
args.file_in, args.num_seq, args.chromosomes
)
specie = "|"
if args.specie:
specie += args.specie.replace(" ", "_") + "|"
with gzip.open(args.file_in, "rt") if args.file_in.endswith("gz") else _ret_file(
args.file_in
) as file_in, open(args.outfile, "w") if args.outfile else sys.stdout as file_out:
all_chunks = []
        vcf_in = args.substitution_file  # None unless a matrix was given; replaced below if a VCF is used
chromosomes = []
if args.vcf:
vcf_in = pysam.VariantFile(args.vcf)
# subset the VCF to 1 individual only
vcf_in.subset_samples([vcf_in.header.samples[args.individual]])
        # chromosome names in FASTA descriptions are short (most likely 2
        # characters max, e.g. "chromosome 21,"); allow up to 3
        p = re.compile(r"chromosome \w{1,3},", re.IGNORECASE)
for num_record, record in enumerate(SeqIO.parse(file_in, "fasta")):
            # we only want reads from assigned chromosomes
            if args.chromosomes:
                if num_record > args.chromosomes - 1:  # num_record is 0-based
                    # assume the genome is sorted, with chromosomes first...
                    break
print("Parsing chromosome", num_record + 1, end="\r", file=sys.stderr)
if num_reads_to_sample[num_record] == 0: # skip this sequence
continue
chrom = p.search(record.description)
if not chrom:
chrom = (
get_chrom(record),
get_chrom(record),
) # in case of scaffold genome
else: # save both the id and chromosome info from fasta description
chrom = (get_chrom(record), chrom.group()[len("chromosome ") : -1])
# we write chromosomes in the same order as the given input.
if not args.shuffled and not args.sorted:
all_chunks = sort_recs(
chunk_fast(
record,
num_reads_to_sample[num_record],
vcf_in,
chrom,
specie,
unif=args.unif,
deaminate=args.deaminate,
len_distrib=args.length,
minlength=args.minlen,
maxlength=args.maxlen,
nthreads=args.threads,
)
)
SeqIO.write(all_chunks, file_out, "fasta")
else:
chromosomes.append(chrom)
all_chunks += chunk_fast(
record,
num_reads_to_sample[num_record],
vcf_in,
chrom,
specie,
unif=args.unif,
deaminate=args.deaminate,
len_distrib=args.length,
minlength=args.minlen,
maxlength=args.maxlen,
nthreads=args.threads,
)
if num_record % 100 == 99: # show progress
print(num_record + 1, "sequences parsed...", end="\r", file=sys.stderr)
if args.vcf:
vcf_in.close()
print("\nDone\nWritting down records...", end="", file=sys.stderr)
if args.shuffled:
random.shuffle(all_chunks)
if args.sorted: # we will do a natural sort on the chromosomes
# if we find a chromosome in the description part of the header
# use it for sorting, otherwise sort with fasta id
chromosomes.sort(
key=lambda key: [
int(text) if text.isdigit() else text
for text in re.split(r"([0-9]+)", key[1])
]
)
all_chunks = sort_recs(
all_chunks, chromosome_list=[c[0] for c in chromosomes]
)
if args.shuffled or args.sorted:
SeqIO.write(all_chunks, file_out, "fasta")
print("Done", file=sys.stderr)
if __name__ == "__main__":
_main()