-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsqanti-sim.py
executable file
·579 lines (482 loc) · 30.5 KB
/
sqanti-sim.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
#!/usr/bin/env python3
"""
sqanti-sim.py
SQANTI-SIM: a simulator of controlled novelty and degradation of transcripts
sequenced by long-reads
Wrapper script to execute all modules of the SQANTI-SIM pipeline, including
existing long-read RNA-seq simulators such as NanoSim, PBSIM3, and IsoSeqSim.
Author: Jorge Mestre Tomas ([email protected])
"""
__version__ = "0.2.0"
import argparse
import numpy
import os
import random
import sys
from collections import defaultdict
from src import classify_gtf
from src import simulate_reads
from src import design_simulation
from src import evaluation_metrics
from time import strftime
def classif(input: list):
"""Classify transcripts in SQANTI3 structural categories
Given a GTF annotation generates an index file with the potential SQANTI3
structural category of each transcript
Args:
input (list): arguments to parse
"""
parser = argparse.ArgumentParser(prog="sqanti-sim.py classif", description="sqanti-sim.py classif parse options", )
parser.add_argument("--gtf", type=str, required=True, help="\t\tReference annotation in GTF format", )
parser.add_argument("-o", "--output", type=str, default="sqanti-sim", help="\t\tPrefix for output file", )
parser.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser.add_argument("-k", "--cores", type=int, default=1, help="\t\tNumber of cores to run in parallel", )
args, unknown = parser.parse_known_args(input)
if unknown:
print("[SQANTI-SIM] classif mode unrecognized arguments: {}\n".format(" ".join(unknown)), file=sys.stderr)
if not os.path.exists(args.gtf):
print("[SQANTI-SIM] ERROR: --gtf file does not exist. Provide a valid path", file=sys.stderr)
sys.exit(1)
if not os.path.isdir(args.dir):
os.makedirs(args.dir)
print("\n[SQANTI-SIM] Running with the following parameters:")
print("[SQANTI-SIM] - Ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Out prefix:", str(args.output))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
print("[SQANTI-SIM] - N threads:", str(args.cores))
print("\n[SQANTI-SIM][%s] Classifying transcripts in structural categories" % (strftime("%d-%m-%Y %H:%M:%S")))
trans_info = classify_gtf.classify_gtf(args)
print("[SQANTI-SIM] Summary table from categorization")
classify_gtf.summary_table_cat(trans_info)
print("[SQANTI-SIM][%s] classif step finished" % (strftime("%d-%m-%Y %H:%M:%S")))
def design(input: list):
"""Modifies reference annotation GTF and builds expression matrix
Given the novel and known transcripts to simulate and its counts, generates
the expression matrix to give as input to the long-read RNA-seq simulators
and generetes the modified GTF to use as reference annotation in your tool
Args:
input (list): arguments to parse
"""
parser = argparse.ArgumentParser(prog="sqanti-sim.py design", description="sqanti-sim.py design parse options", )
subparsers = parser.add_subparsers(dest="mode",
description="\t\tDifferent modes to generate the expression matrix: equal (simulate with equal coverage for all reads), custom (simulate with diferent negative binomial distributions for novel and known transcripts) or sample (simulate using a real sample)")
parser_e = subparsers.add_parser("equal", help="\t\tRun in equal mode")
parser_e.add_argument("-i", "--trans_index", type=str, required=True,
help="\t\tFile with transcript information generated by SQANTI-SIM (*_index.tsv)", )
parser_e.add_argument("--gtf", type=str, required=True, help="\t\ttComplete reference annotation in GTF format", )
parser_e.add_argument("-o", "--output", type=str, default=str(), help="\t\tPrefix for output files")
parser_e.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser_e.add_argument("-nt", "--trans_number", type=int, default=30000,
help="\t\tTotal number of transcripts to simulate", )
parser_e.add_argument("--read_count", default=50000, type=int, help="\t\tNumber of reads to simulate", )
parser_e.add_argument("--ISM", type=int, default=0, help="\t\tNumber of incomplete-splice-matches to simulate", )
parser_e.add_argument("--NIC", type=int, default=0, help="\t\tNumber of novel-in-catalog to simulate", )
parser_e.add_argument("--NNC", type=int, default=0, help="\t\tNumber of novel-not-in-catalog to simulate", )
parser_e.add_argument("--Fusion", type=int, default=0, help="\t\tNumber of Fusion to simulate")
parser_e.add_argument("--Antisense", type=int, default=0, help="\t\tNumber of Antisense to simulate", )
parser_e.add_argument("--GG", type=int, default=0, help="\t\tNumber of Genic-genomic to simulate", )
parser_e.add_argument("--GI", type=int, default=0, help="\t\tNumber of Genic-intron to simulate", )
parser_e.add_argument("--Intergenic", type=int, default=0, help="\t\tNumber of Intergenic to simulate", )
parser_e.add_argument("-k", "--cores", type=int, default=1, help="\t\tNumber of cores to run in parallel", )
parser_e.add_argument("-s", "--seed", type=int, default=None, help="\t\tRandomizer seed", )
parser_c = subparsers.add_parser("custom", help="\t\tRun in custom mode")
parser_c.add_argument("-i", "--trans_index", type=str, required=True,
help="\t\tFile with transcript information generated with SQANTI-SIM (*_index.tsv)", )
parser_c.add_argument("--gtf", type=str, required=True, help="\t\ttComplete reference annotation in GTF format", )
parser_c.add_argument("-o", "--output", type=str, default=str(), help="\t\tPrefix for output files")
parser_c.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser_c.add_argument("-nt", "--trans_number", type=int, default=30000,
help="\t\tTotal number of transcripts to simulate", )
parser_c.add_argument("--nbn_known", type=float, default=15,
help="\t\tAverage read count per known transcript to simulate (the parameter 'n' of the Negative Binomial distribution)", )
parser_c.add_argument("--nbp_known", type=float, default=0.5,
help="\t\tThe parameter 'p' of the Negative Binomial distribution for known transcripts", )
parser_c.add_argument("--nbn_novel", type=float, default=5,
help="\t\tAverage read count per novel transcript to simulate (the parameter 'n' of the Negative Binomial distribution)", )
parser_c.add_argument("--nbp_novel", type=float, default=0.5,
help="\t\tThe parameter 'p' of the Negative Binomial distribution for novel transcripts", )
parser_c.add_argument("--ISM", type=int, default=0, help="\t\tNumber of incomplete-splice-matches to simulate", )
parser_c.add_argument("--NIC", type=int, default=0, help="\t\tNumber of novel-in-catalog to simulate", )
parser_c.add_argument("--NNC", type=int, default=0, help="\t\tNumber of novel-not-in-catalog to simulate", )
parser_c.add_argument("--Fusion", type=int, default=0, help="\t\tNumber of Fusion to simulate")
parser_c.add_argument("--Antisense", type=int, default=0, help="\t\tNumber of Antisense to simulate", )
parser_c.add_argument("--GG", type=int, default=0, help="\t\tNumber of Genic-genomic to simulate", )
parser_c.add_argument("--GI", type=int, default=0, help="\t\tNumber of Genic-intron to simulate", )
parser_c.add_argument("--Intergenic", type=int, default=0, help="\t\tNumber of Intergenic to simulate", )
parser_c.add_argument("-k", "--cores", type=int, default=1, help="\t\tNumber of cores to run in parallel", )
parser_c.add_argument("-s", "--seed", type=int, default=None, help="\t\tRandomizer seed", )
parser_s = subparsers.add_parser("sample", help="\t\tRun in sample mode")
parser_s.add_argument("-i", "--trans_index", type=str, required=True,
help="\t\tFile with transcript information generated with SQANTI-SIM (*_index.tsv)", )
parser_s.add_argument("--gtf", type=str, required=True, help="\t\tComplete reference annotation in GTF format", )
parser_s.add_argument("-o", "--output", type=str, default=str(), help="\t\tPrefix for output files")
parser_s.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser_s.add_argument("-nt", "--trans_number", type=int, default=None,
help="\t\tTotal number of transcripts to simulate", )
parser_s.add_argument("--genome", type=str, required=True, help="\t\tReference genome FASTA", )
group1 = parser_s.add_mutually_exclusive_group()
group1.add_argument("--expr_file", type=str, default=str(),
help="\t\tA 3-column tab-separated file containing (i) gene name, (ii) transcript name and (iii) transcript read count", )
group1.add_argument("--mapped_reads", type=str, default=str(),
help="\t\tInput mapped long reads for characterization in SAM format", )
group1.add_argument("--long_reads", type=str, default=str(),
help="\t\tInput long reads for characterization in FASTA or FASTQ format", )
group2 = parser_s.add_mutually_exclusive_group()
group2.add_argument("--pb", action="store_true", help="\t\tIf used the program will use PacBio settings", )
group2.add_argument("--ont", action="store_true", help="\t\tIf used the program will use ONT settings", )
parser_s.add_argument("--iso_complex", action="store_true",
help="\t\tIf used the program will approximate the expressed isoform complexity (number of isoforms per gene)", )
parser_s.add_argument("--diff_exp", type=float, default=2.0,
help="\t\tFactor for adjusting the odds of novel and known transcripts expression assignments. A value of 0 means no bias between the two types. A higher value increases the bias towards novel transcripts having lower expression. (default: 2.0)", )
parser_s.add_argument("--read_type", type=str, default="cDNA",
help="\t\tRead type for ONT expression level (if --ont)", choices=["cDNA", "dRNA"])
parser_s.add_argument("--ISM", type=int, default=0, help="\t\tNumber of incomplete-splice-matches to simulate", )
parser_s.add_argument("--NIC", type=int, default=0, help="\t\tNumber of novel-in-catalog to simulate", )
parser_s.add_argument("--NNC", type=int, default=0, help="\t\tNumber of novel-not-in-catalog to simulate", )
parser_s.add_argument("--Fusion", type=int, default=0, help="\t\tNumber of Fusion to simulate")
parser_s.add_argument("--Antisense", type=int, default=0, help="\t\tNumber of Antisense to simulate", )
parser_s.add_argument("--GG", type=int, default=0, help="\t\tNumber of Genic-genomic to simulate", )
parser_s.add_argument("--GI", type=int, default=0, help="\t\tNumber of Genic-intron to simulate", )
parser_s.add_argument("--Intergenic", type=int, default=0, help="\t\tNumber of Intergenic to simulate", )
parser_s.add_argument("-k", "--cores", type=int, default=1, help="\t\tNumber of cores to run in parallel", )
parser_s.add_argument("-s", "--seed", type=int, default=None, help="\t\tRandomizer seed", )
args, unknown = parser.parse_known_args(input)
if not os.path.isdir(args.dir):
os.makedirs(args.dir)
if not args.output:
output = os.path.basename(args.trans_index).split("_")
args.output = "_".join(output[:-1])
if unknown:
print("[SQANTI-SIM] design mode unrecognized arguments: {}\n".format(" ".join(unknown)))
total_novel = sum([args.ISM, args.NIC, args.NNC, args.Fusion, args.Antisense, args.GG, args.GI, args.Intergenic])
if args.trans_number is not None and total_novel > args.trans_number:
print(
"[SQANTI-SIM] WARNING: -nt is lower than the novel transcripts to simulate, only novel transcripts will be simulated",
file=sys.stderr)
if total_novel <= 0:
print("[SQANTI-SIM] WARNING: No novel transcript type has been specified. Proceed using the default value.",
file=sys.stderr)
args.ISM = 5000
args.NIC = 2500
args.NNC = 2500
args.Fusion = 1000
args.Antisense = 1000
args.GG = 1000
args.GI = 1000
args.Intergenic = 1000
total_novel = sum(
[args.ISM, args.NIC, args.NNC, args.Fusion, args.Antisense, args.GG, args.GI, args.Intergenic])
if args.trans_number is not None and total_novel > args.trans_number:
args.trans_number = args.trans_number + total_novel
if not args.seed:
args.seed = int.from_bytes(os.urandom(1), 'big')
random.seed(args.seed)
numpy.random.seed(args.seed)
print("\n[SQANTI-SIM] Running with the following parameters:")
if args.mode == "equal":
print("[SQANTI-SIM] - Mode: equal")
print("[SQANTI-SIM] - Ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Out prefix:", str(args.output))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
print("[SQANTI-SIM] - N transcripts:", str(args.trans_number))
print("[SQANTI-SIM] - N reads:", str(args.read_count))
elif args.mode == "custom":
if args.nbn_known < 0 or args.nbn_novel < 0:
print("[SQANTI-SIM] ERROR: --nbn_known and --nbn_novel must be greater than 0", file=sys.stderr)
sys.exit(1)
if args.nbp_known < 0 or args.nbp_known > 1 or args.nbp_novel < 0 or args.nbp_novel > 1:
print("[SQANTI-SIM] ERROR: --nbp_known and --nbp_novel must be in the interval [0,1]", file=sys.stderr)
sys.exit(1)
print("[SQANTI-SIM] - Mode: custom")
print("[SQANTI-SIM] - Ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Out prefix:", str(args.output))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
print("[SQANTI-SIM] - N transcripts:", str(args.trans_number))
print("[SQANTI-SIM] - Known NB mean count:", str(args.nbn_known))
print("[SQANTI-SIM] - Known NB probability:", str(args.nbp_known))
print("[SQANTI-SIM] - Novel NB mean count:", str(args.nbn_novel))
print("[SQANTI-SIM] - Novel NB probability:", str(args.nbp_novel))
elif args.mode == "sample":
if args.long_reads and (not args.pb and not args.ont):
print("[SQANTI-SIM] ERROR: specify --pb or --ont when using --long_reads", file=sys.stderr)
sys.exit(1)
print("[SQANTI-SIM] - Mode: sample")
print("[SQANTI-SIM] - Ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Ref genome:", str(args.genome))
print("[SQANTI-SIM] - Out prefix:", str(args.output))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
print("[SQANTI-SIM] - N transcripts:", str(args.trans_number))
print("[SQANTI-SIM] - Diff expression:", str(args.diff_exp))
if args.long_reads:
print("[SQANTI-SIM] - Long reads:", str(args.long_reads))
else:
print("[SQANTI-SIM] - Expression file:", str(args.expr_file))
if args.pb:
print("[SQANTI-SIM] - Platform: PacBio")
elif args.ont:
print("[SQANTI-SIM] - Platform: ONT")
print("[SQANTI-SIM] - N threads:", str(args.cores))
print("[SQANTI-SIM] - Seed:", str(args.seed))
print("[SQANTI-SIM]\tISM\tNIC\tNNC\tFusion\tAS\tGG\tGI\tIntergenic")
print("[SQANTI-SIM]\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
str(args.ISM), str(args.NIC), str(args.NNC), str(args.Fusion),
str(args.Antisense), str(args.GG), str(args.GI), str(args.Intergenic)
))
# Modify GTF
print("\n[SQANTI-SIM][%s] Generating modified GTF" % (strftime("%d-%m-%Y %H:%M:%S")))
counts_end = design_simulation.simulate_gtf(args)
counts_ini = defaultdict(
lambda: 0,
{
"full-splice_match": 0,
"incomplete-splice_match": args.ISM,
"novel_in_catalog": args.NIC,
"novel_not_in_catalog": args.NNC,
"fusion": args.Fusion,
"antisense": args.Antisense,
"genic_intron": args.GI,
"genic": args.GG,
"intergenic": args.Intergenic,
},
)
design_simulation.summary_table_del(counts_ini, counts_end)
# Generate expression matrix
print("[SQANTI-SIM][%s] Generating expression matrix" % (strftime("%d-%m-%Y %H:%M:%S")))
index_file = os.path.join(args.dir, (args.output + "_index.tsv"))
if args.mode == "equal":
design_simulation.create_expr_file_fixed_count(index_file, args)
elif args.mode == "custom":
design_simulation.create_expr_file_nbinom(index_file, args)
elif args.mode == "sample":
design_simulation.create_expr_file_sample(index_file, args)
print("[SQANTI-SIM][%s] design step finished" % (strftime("%d-%m-%Y %H:%M:%S")))
def sim(input: list):
"""Simulate reads
Simulate PacBio and/or ONT reads using the IsoSeqSim or NanoSim pipelines.
It can also simulate Illumina reads using the polyester pipeline
Args:
input (list): arguments to parse
"""
parser = argparse.ArgumentParser(prog="sqanti-sim.py sim", description="sqanti-sim.py sim parse options")
parser.add_argument("--gtf", type=str, required=True, help="\t\tComplete reference annotation in GTF format", )
parser.add_argument("--genome", type=str, required=True, help="\t\tReference genome FASTA", )
parser.add_argument("-i", "--trans_index", type=str, required=True,
help="\t\tFile with transcript information generated with SQANTI-SIM (*_index.tsv)", )
parser.add_argument("--read_type", type=str, default="cDNA", help="\t\tRead type for NanoSim simulation (if --ont)",
choices=["cDNA", "dRNA"])
parser.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser.add_argument("-k", "--cores", type=int, default=8, help="\t\tNumber of cores to run in parallel", )
parser.add_argument("--pass_num", type=int, default=6,
help="\t\tThe argument is only used when --pbsim is employed to simulate reads. If the number "
"of passes (--pass_num) is two or more, multi-pass sequencing is performed to construct "
"HiFi reads.", )
parser.add_argument("--pb", action="store_true",
help="\t\tIf used the program will simulate PacBio reads (by default with PBSIM3, but can be "
"changed)", )
parser.add_argument("--ont", action="store_true",
help="\t\tIf used the program will simulate ONT reads with NanoSim", )
group = parser.add_mutually_exclusive_group(required=False)
group.add_argument("--isoseqsim", action="store_true",
help="\t\tIf used the program will simulate PacBio reads with IsoSeqSim", )
group.add_argument("--pbsim", action="store_true",
help="\t\tIf used the program will simulate PacBio reads with PBSIM3", )
parser.add_argument("--illumina", action="store_true",
help="\t\tIf used the program will simulate Illumina reads with Polyester", )
parser.add_argument("--CAGE", action="store_true",
help="\t\tIf used the program will simulate a sample-specific CAGE peak BED file and automatically simulate short-reads as well", )
parser.add_argument("--long_count", type=int, default=None,
help="\t\tNumber of long reads to simulate (if not given it will use the requested_counts from the --trans_index file)", )
parser.add_argument("--short_count", type=int, default=None,
help="\t\tNumber of short reads to simulate (if not given it will use the requested_counts from the --trans_index file)", )
parser.add_argument("--nanosim_model", type=str, default=None,
help="\t\tDirectory of the pre-trained NanoSim model")
parser.add_argument("--pbsim_model", type=str, default=None, help="\t\tPBSIM3 quality score pre-trained model.")
parser.add_argument("--isoseqsim_model", type=str, default=None,
help="\t\tOne-line tab-separated file with substitution, deletion and insertion error.")
parser.add_argument("--CAGE_model", type=str, default=None, help="\t\tDirectory of the pre-trained CAGE model")
parser.add_argument("--falseCAGE_prop", type=float, default=0.2,
help="\t\tProportion (0, 1) of simulated CAGE peaks that are not derived from actual TSS locations (default: 0.2)")
parser.add_argument("-s", "--seed", type=int, default=None, help="\t\tRandomizer seed", )
args, unknown = parser.parse_known_args(input)
if unknown:
print("[SQANTI-SIM] sim mode unrecognized arguments: {}\n".format(" ".join(unknown)))
if not args.ont and not args.pb and not args.isoseqsim and not args.pbsim:
print("sqanti-sim.py sim: error: one of the arguments --ont --pb --isoseqsim --pbsim is required",
file=sys.stderr)
sys.exit(1)
if args.ont and (args.pb or args.isoseqsim or args.pbsim):
print("sqanti-sim.py sim: error: argument --ont: not allowed with arguments --pb --pbsim or --isoseqsim",
file=sys.stderr)
sys.exit(1)
if not args.seed:
args.seed = int.from_bytes(os.urandom(1), 'big')
random.seed(args.seed)
numpy.random.seed(args.seed)
print("\n[SQANTI-SIM] Running with the following parameters:")
print("[SQANTI-SIM] - Ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Ref genome:", str(args.genome))
print("[SQANTI-SIM] - Index file:", str(args.trans_index))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
if args.ont:
print("[SQANTI-SIM] - Platform: ONT")
print("[SQANTI-SIM] - Read type:", str(args.read_type))
print("[SQANTI-SIM] - NanoSim model:", str(args.nanosim_model))
else:
print("[SQANTI-SIM] - Platform: PacBio")
if args.long_count:
print("[SQANTI-SIM] - Long reads:", str(args.long_count))
else:
print("[SQANTI-SIM] - Long reads: requested_counts from index file")
if args.illumina or args.CAGE:
print("[SQANTI-SIM] - Platform: Illumina")
if args.short_count:
print("[SQANTI-SIM] - Short reads:", str(args.short_count))
else:
print("[SQANTI-SIM] - Short reads: requested_counts from index file")
if args.CAGE:
print("[SQANTI-SIM] - CAGE: Yes")
print("[SQANTI-SIM] - CAGE model dir:", str(args.CAGE_model))
else:
print("[SQANTI-SIM] - CAGE: No")
print("[SQANTI-SIM] - N threads:", str(args.cores))
print("[SQANTI-SIM] - Seed:", str(args.seed))
# Simulation with IsoSeqSim, NanoSim, Polyester and/or CAGEsim
if args.isoseqsim:
print("\n[SQANTI-SIM][%s] Simulating PacBio reads using IsoSeqSim" % (strftime("%d-%m-%Y %H:%M:%S")))
simulate_reads.isoseqsim_simulation(args)
if args.pb or args.pbsim:
print("\n[SQANTI-SIM][%s] Simulating PacBio reads using PBSIM3" % (strftime("%d-%m-%Y %H:%M:%S")))
simulate_reads.pbsim_simulation(args)
if args.ont:
print("\n[SQANTI-SIM][%s] Simulating ONT reads" % (strftime("%d-%m-%Y %H:%M:%S")))
simulate_reads.ont_simulation(args)
if args.illumina or args.CAGE:
print("\n[SQANTI-SIM][%s] Simulating Illumina reads" % (strftime("%d-%m-%Y %H:%M:%S")))
simulate_reads.illumina_simulation(args)
if args.CAGE:
print("\n[SQANTI-SIM][%s] Simulating CAGE peaks" % (strftime("%d-%m-%Y %H:%M:%S")))
simulate_reads.CAGE_simulation(args)
print("[SQANTI-SIM][%s] sim step finished" % (strftime("%d-%m-%Y %H:%M:%S")))
def eval(input: list):
"""Generates SQANTI-SIM report
Run SQANTI3 with the reconstructed transcripts retrieved by your pipeline
and generates the SQANTI-SIM report with the evaluation metrics
Args:
input (list): arguments to parse
"""
parser = argparse.ArgumentParser(prog="sqanti-sim.py eval", description="sqanti-sim.py eval parse options", )
parser.add_argument("--transcriptome", type=str, required=True,
help="\t\tLong-read-defined trancriptome reconstructed with your pipeline in GTF, FASTA or FASTQ format", )
parser.add_argument("--gtf", type=str, required=True, help="\t\Reduced reference annotation in GTF format", )
parser.add_argument("--genome", type=str, required=True, help="\t\tReference genome FASTA", )
parser.add_argument("-i", "--trans_index", type=str, required=True,
help="\t\tFile with transcript information generated with SQANTI-SIM (*_index.tsv)", )
parser.add_argument("-e", "--expression", type=str, default="none",
help="\t\tExpression of transcript models (file without header with two columns tab-separated: first with id and second with quantified number of reads, no header)", )
parser.add_argument("-o", "--output", type=str, default="sqanti-sim", help="\t\tPrefix for output files", )
parser.add_argument("-d", "--dir", type=str, default=".", help="\t\tDirectory for output files (default: .)", )
parser.add_argument('-c', '--coverage',
help='\t\tJunction coverage files (provide a single file, comma-delmited filenames, or a file pattern, ex: "mydir/*.junctions")',
required=False)
parser.add_argument('--SR_bam',
help='\t\tDirectory or fofn file with the sorted bam files of Short Reads RNA-Seq mapped against the genome',
required=False)
parser.add_argument("--short_reads", type=str, default=None,
help="\t\tFile Of File Names (fofn, space separated) with paths to FASTA or FASTQ from Short-Read RNA-Seq. If expression or coverage files are not provided, Kallisto (just for pair-end data) and STAR, respectively, will be run to calculate them.", )
parser.add_argument("--CAGE_peak", type=str, default=None,
help="\t\tCAGE Peak file in BED format (example FANTOM5)")
parser.add_argument("--fasta", action="store_true",
help="\t\tUse when running SQANTI-SIM by using as input a FASTA/FASTQ with the sequences of isoforms", )
parser.add_argument("--aligner_choice", type=str, default="minimap2",
help="\t\tIf --fasta used, choose the aligner to map your isoforms",
choices=["minimap2", "deSALT", "gmap", "uLTRA"])
parser.add_argument("--min_support", type=int, default=3,
help="\t\tMinimum number of supporting reads for an isoform", )
parser.add_argument("-k", "--cores", type=int, default=1, help="\t\tNumber of cores to run in parallel", )
args, unknown = parser.parse_known_args(input)
if unknown:
print(
"[SQANTI-SIM] sim mode unrecognized arguments: {}\n".format(" ".join(unknown)),
file=sys.stderr,
)
print("\n[SQANTI-SIM] Running with the following parameters:")
print("[SQANTI-SIM] - Reconstructed transcripts:", str(args.transcriptome))
print("[SQANTI-SIM] - Modified ref GTF:", str(args.gtf))
print("[SQANTI-SIM] - Ref genome:", str(args.genome))
print("[SQANTI-SIM] - Index file:", str(args.trans_index))
print("[SQANTI-SIM] - Out prefix:", str(args.output))
print("[SQANTI-SIM] - Out dir:", str(args.dir))
if args.expression:
print("[SQANTI-SIM] - Expression:", str(args.expression))
if args.coverage:
print("[SQANTI-SIM] - Coverage:", str(args.coverage))
if args.SR_bam:
print("[SQANTI-SIM] - Short-read BAM files:", str(args.SR_bam))
if args.short_reads:
print("[SQANTI-SIM] - Short reads:", str(args.short_reads))
if args.CAGE_peak:
print("[SQANTI-SIM] - CAGE Peak:", str(args.CAGE_peak))
print("[SQANTI-SIM] - Min support:", str(args.min_support))
print("[SQANTI-SIM] - N threads:", str(args.cores))
print()
evaluation_metrics.sqanti3_stats(args)
print("[SQANTI-SIM][%s] eval step finished" % (strftime("%d-%m-%Y %H:%M:%S")))
#####################################
# #
# MAIN #
# #
#####################################
print(
"""
_____ ____ _ _ _______ _____ _____ _____ __ __
/ ____|/ __ \ /\ | \ | |__ __|_ _| / ____|_ _| \/ |
| (___ | | | | / \ | \| | | | | |_____| (___ | | | \ / |
\___ \| | | | / /\ \ | . ` | | | | |______\___ \ | | | |\/| |
____) | |__| | / ____ \| |\ | | | _| |_ ____) |_| |_| | | |
|_____/ \___\_\/_/ \_\_| \_| |_| |_____| |_____/|_____|_| |_|
A SIMULATOR OF CONTROLLED NOVELTY AND DEGRADATION
OF TRANSCRIPTS SEQUENCED BY LONG-READS
"""
)
if len(sys.argv) < 2:
print("[SQANTI-SIM] usage: python sqanti-sim.py <mode> --help\n", file=sys.stderr)
print("[SQANTI-SIM] modes: classif, design, sim, full-sim, eval\n", file=sys.stderr)
print("[SQANTI-SIM] full-sim: classif + design + sim\n", file=sys.stderr)
sys.exit(1)
else:
mode = sys.argv[1].lower()
input = sys.argv[2:]
if mode == "classif":
print("[SQANTI-SIM] CLASSIF MODE")
res = classif(input)
elif mode == "design":
print("[SQANTI-SIM] DESIGN MODE")
res = design(input)
elif mode == "sim":
print("[SQANTI-SIM] SIM MODE")
res = sim(input)
elif mode == "eval":
print("[SQANTI-SIM] EVAL MODE")
res = eval(input)
elif mode == "full-sim":
print("[SQANTI-SIM] FULL-SIM MODE (CLASSIF + DESIGN + SIM)\n")
print("[SQANTI-SIM] CLASSIF MODE")
res = classif(input)
print("[SQANTI-SIM] DESIGN MODE")
# Parse generated index file
dir = next(
(input[i + 1] for i, value in enumerate(input) if (value == "-d" or value == "--dir") and i < len(input) - 1),
".")
output = next((input[i + 1] for i, value in enumerate(input) if
(value == "-o" or value == "--output") and i < len(input) - 1), "sqanti-sim")
input.append("--trans_index")
input.append(os.path.join(dir, (output + "_index.tsv")))
res = design(input)
print("[SQANTI-SIM] SIM MODE")
res = sim(input)
elif mode in ["--version", "-v"]:
print("[SQANTI-SIM] SQANTI-SIM %s\n" % (__version__))
else:
print("[SQANTI-SIM] usage: python sqanti-sim.py <mode> --help\n", file=sys.stderr)
print("[SQANTI-SIM] modes: classif, design, sim, eval\n", file=sys.stderr)
sys.exit(1)