-
Notifications
You must be signed in to change notification settings - Fork 1
/
extended_epitope_burden.py
827 lines (781 loc) · 32.2 KB
/
extended_epitope_burden.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
#!/usr/bin/env python
from __future__ import print_function
from Bio.SubsMat import MatrixInfo
from collections import defaultdict
from mhcnuggets.src.predict import predict
from neoepiscope import bowtie_index
from neoepiscope import paths
from neoepiscope.transcript import get_transcripts_from_tree
from pyliftover import LiftOver
import argparse
import ast
import glob
import os
import pickle
import pysam
import re
import subprocess
def liftover_coordinates(neoepiscope_file, hg19_bed_path, hg38_bed_path, liftover, chain,
remove_bed_files=True):
''' Creates a dictionary linking hg19 variants to their hg38 coordinates, if available
neoepiscope_file: path to file containing neoepiscope output (comprehensive mode)
hg19_bed_path: path to write hg19 BED file
hg38_bed_path: path to write hg38 BED file
liftover: path to liftOver executable
chain: path to hg19 to hg38 chain file
Return value: dictionary linking hg19 variant IDs to their hg38 coordinates
'''
# Establish dictionaries
hg19_position_dict = {}
hg38_position_dict = {}
# Process neoepiscope output to ID variants
with open(neoepiscope_file) as f:
f.readline()
f.readline()
# Iterate through neoepitope entry lines
for line in f:
tokens = line.strip().split('\t')
# Generate variant position ID
name = ','.join([tokens[1], tokens[2], tokens[3], tokens[4], tokens[5]])
if name not in hg19_position_dict:
# Find end position and add info to dictionary
start_pos = int(tokens[2]) - 1
# Find end position
if tokens[5] == 'V' or tokens[5] == 'D':
end_pos = start_pos + len(tokens[3])
elif tokens[5] == 'I':
end_pos = start_pos + 1
hg19_position_dict[name] = [tokens[1], str(start_pos), str(end_pos)]
# Write variants to BED file
with open(hg19_bed_path, 'w') as f:
for variant in hg19_position_dict:
out_line = hg19_position_dict[variant] + [variant]
print('\t'.join(out_line), file=f)
# Run liftOver
subprocess.call([liftover, hg19_bed_path, chain, hg38_bed_path, '.'.join([hg38_bed_path, 'unmapped'])])
# Process new BED file and add variants to dictionary
with open(hg38_bed_path) as f:
for line in f:
tokens = line.strip().split('\t')
hg38_position_dict[tokens[3]] = [tokens[0], int(tokens[1]), int(tokens[2])]
# Remove BED files
if remove_bed_files:
for file_to_remove in [hg19_bed_path, hg38_bed_path, '.'.join([hg38_bed_path, 'unmapped'])]:
os.remove(file_to_remove)
# Return dictionary
return hg38_position_dict
def process_epitope_file(neoepiscope_file, interval_dict, hg38_coordinates, binding_dict_search):
''' Process neoepiscope output to generate allele list and dictionary
neoepiscope_file: path to file containing neoepiscope output (comprehensive mode)
interval_dict: dictionary linking contigs to IntervalTree objects
connecting genomic intervals to overlapping transcripts
hg38_coordinates: dictionary linking hg19 variant IDs to their hg38 coordinates
binding_dict_search: wildcard path for binding score dictionaries
Return values: epitope_dict is a nested dictionary linking epitopes to mutations
they derive from, and a list of [paired nomal allele, list of hg19 transcripts
the epitope could derive from, list of hg38 transcripts the epitope could
derive from, and binary binding score for each relevant allele (0 = doesn't
bind allele, 1 = does bind allele)]; allele list is a list of HLA alleles for
the patient; tx_set is a set of transcripts that neoepitopes might derive from
'''
epitope_dict = defaultdict(lambda: defaultdict(list))
transcript_set = set()
# Get binding scores and alleles
binding_scores = {}
binding_dicts = glob.glob(binding_dict_search)
print(binding_dicts)
allele_list = []
for binding_dict in binding_dicts:
allele = binding_dict.split('.')[-2]
allele_list.append(allele)
binding_scores[allele] = {}
with open(binding_dict, 'rb') as p:
d = pickle.load(p)
for peptide in d:
binding_scores[allele][peptide] = d[peptide]
allele_list.sort()
# Process peptides
with open(neoepiscope_file) as f:
f.readline()
# Process header line to get relevant alleles
header = f.readline().strip().split('\t')
# Iterate through neoepitope entry lines
for line in f:
tokens = line.strip().split('\t')
tx_list = tokens[9].split(';')
# Extract paired normal and transcript list
info_list = [tokens[7], tx_list]
# Search for hg38 transcripts
variant_id = ','.join([tokens[1], tokens[2], tokens[3], tokens[4], tokens[5]])
if variant_id in hg38_coordinates:
coords = hg38_coordinates[variant_id]
overlapping_transcripts = get_transcripts_from_tree(coords[0], coords[1]+1, coords[2]+1, interval_dict)
else:
overlapping_transcripts = []
info_list.append(overlapping_transcripts)
# Iterate through binding scores
for allele in allele_list:
try:
if float(binding_scores[allele][tokens[0]]) <= 500:
# Epitope binds to this allele - tally as binder
info_list.append(1)
else:
# Epitope doesn't bind to this allele - tally as non-binder
info_list.append(0)
except ValueError:
info_list.append('NA')
# Add info to mutation for epitope in dictionary
epitope_dict[tokens[0]][(tokens[1], tokens[2], tokens[3], tokens[4], tokens[5])].append(info_list)
for tx in tx_list:
transcript_set.add(tx)
# Return dictionary and allele list
return epitope_dict, allele_list, transcript_set
def make_epitope_fasta(epitope_list, fasta):
''' Produces fasta file containing all neoepitope sequences for a sample
epitope_list: list of neoepitopes to perform a blast search on
fasta: path to output fasta file
No return value
'''
# Write epitopes from list to fasta file
with open(fasta, 'w') as fh:
for epitope in epitope_list:
print(''.join(['> seq=', epitope]), file=fh)
print(epitope, file=fh)
def run_blast(fasta, db, blastp, output, remove_fasta=True):
''' Runs blastp
fasta: path fasta containing sequences to blast against database
db: path to database to blast against
output: path to write blast output
No return value
'''
# Run blast
subprocess.call([blastp, '-outfmt', '6 qseqid sseqid length qstart qend sseq evalue', '-db', db,
'-query', fasta, '-matrix', 'BLOSUM62', '-evalue', '200000', '-ungapped',
'-comp_based_stats', 'F', '-out', output])
if remove_fasta:
# Remove fasta file
os.remove(fasta)
def score_match(pair, matrix):
''' Gives a score from a matrix for a given pair of sequences
pair: pair to score (tuple)
matrix: scoring matrix (matrix)
Return value: score
'''
# If the pair is not in the matrix, reverse it and get score
if pair not in matrix:
return matrix[(tuple(reversed(pair)))]
else:
return matrix[pair]
def score_pairwise(seq1, seq2, matrix):
''' For two sequences of equal length, score them given a matrix
Score returned is relative to score of first sequence against itself
seq1: first sequence (string)
seq2: second sequence, of same length as first sequence (string)
matrix: scoring matrix (matrix)
Return value: score
'''
self_score = 0
score = 0
# Calculate score of first sequence against itself for baseline
for i in range(len(seq1)):
pair = (seq1[i], seq1[i])
self_score += score_match(pair, matrix)
# Calulate score of first and second sequence
for i in range(len(seq1)):
pair = (seq1[i], seq2[i])
score += score_match(pair, matrix)
adj_score = score/self_score
return adj_score
def process_blast(blast_results, remove_blast_results=True):
''' Processes blast output to return a dicionary of results
blast_results: path to file contain blast results
data_dictionary: dictionary linking peptide_IDs to genes/transcripts
Return value: blast_dict - dictionary that links epitopes to a list of
[match E value, set of transcripts it comes from, set of
genes it comes from, match pepetide sequence]
'''
blast_dict = {}
# Process through all blast results
with open(blast_results, 'r') as fh:
for line in fh:
# Grab info from blast line
tokens = line.strip().split('\t')
epitope = tokens[0].split('=')[1]
evalue = float(tokens[6])
match_seq = tokens[5]
match_transcript = tokens[1].split('|')[0]
match_gene = tokens[1].split('|')[1]
# Check for presence of invalid characters in match seq
invalids = ['B', 'J', 'O', 'U', 'X', 'Z', '*']
invalid_matches = []
for char in invalids:
if char in match_seq:
invalid_matches.append(char)
# Check for match criteria
if len(match_seq) == len(epitope) and len(invalid_matches) == 0:
# Match is the right length and doesn't contain invalid characters
if epitope not in blast_dict:
# Epitope has no valid matches yet, add this one
blast_dict[epitope] = [evalue, set(), set(), match_seq]
blast_dict[epitope][1].add(match_transcript)
blast_dict[epitope][2].add(match_gene)
elif evalue < blast_dict[epitope][0]:
# This is a better match for the epitope, replace old one
blast_dict[epitope] = [evalue, set(), set(), match_seq]
blast_dict[epitope][1].add(match_transcript)
blast_dict[epitope][2].add(match_gene)
elif evalue == blast_dict[epitope][0]:
# Equally good match - check more criteria
if match_seq == blast_dict[epitope][3]:
# This sequence matches previous match(es), store tx/gene
blast_dict[epitope][1].add(match_transcript)
blast_dict[epitope][2].add(match_gene)
else:
# This is a new sequence - check more criteria
if match_seq == epitope:
# This match is identical to the epitope, replace old one
blast_dict[epitope] = [evalue, set(), set(), match_seq]
blast_dict[epitope][1].add(match_transcript)
blast_dict[epitope][2].add(match_gene)
else:
# Check whether old or new match is more similar by adjusted BLOSUM62
orig_ps = score_pairwise(epitope, blast_dict[epitope][3], MatrixInfo.blosum62)
match_ps = score_pairwise(epitope, match_seq, MatrixInfo.blosum62)
if match_ps > orig_ps:
# This match is more similar, replace the old one
blast_dict[epitope] = [evalue, set(), set(), match_seq]
blast_dict[epitope][1].add(match_transcript)
blast_dict[epitope][2].add(match_gene)
# Remove blast results
if remove_blast_results:
os.remove(blast_results)
# Return dictionary of matches
return blast_dict
def get_normal_binding_scores(blast_dict, alleles, available_alleles, output_dir, pat_id, remove_files=True):
''' Creates dictionary linking matched normal epitopes to binding scores for
different HLA alleles
blast_dict: blast_dict - dictionary that links epitopes to a list of
[match E value, set of transcripts it comes from, set of
genes it comes from, match pepetide sequence] (from process_blast())
alleles: list of HLA alleles to use for binding predictions
available_alleles: path to pickled dictionary describing available HLA alleles
for different binding affinity predictors
output_dir: path to output directory for writing temporary files
pat_id: patient identifier
Return value: nested dictionary, where keys are matched normal epitopes and
values are dictionaries, where keys are HLA alleles and values
are binding scores for that epitope/allele combo
'''
# Create list of temporary files to remove
files_to_remove = []
# Extract matched normal peptide sequences
normal_epitopes = set()
for epitope in blast_dict:
normal_epitopes.add(blast_dict[epitope][3])
# Load available alleles
with open(available_alleles, 'rb') as allele_stream:
avail_alleles = pickle.load(allele_stream)
# Initialize dictionary
normal_dict = defaultdict(dict)
for hla in alleles:
# Determine if allele is valid for mhcnuggets
if hla in avail_alleles["mhcnuggets_mhcI"]:
# Class I allele
allele_class = "I"
max_length = 15
elif hla in avail_alleles["mhcnuggets_mhcII"]:
# Class II allele
allele_class = "II"
max_length = 30
else:
# Not a valid allele
continue
# Write relevant peptides to file
peptide_file = os.path.join(output_dir, ''.join([pat_id, '.mhc.', hla, '.csv']))
files_to_remove.append(peptide_file)
with open(peptide_file, 'w') as f:
for sequence in normal_epitopes:
if len(sequence) <= max_length:
print(sequence, file=f)
# Run binding predictions
mhc_out = os.path.join(output_dir, ''.join([pat_id, '.mhc.', hla, '.out']))
files_to_remove.append(mhc_out)
predict(class_=allele_class, peptides_path=peptide_file, mhc=hla, output=mhc_out)
# Process mhcnuggets results
score_dict = {}
with open(mhc_out) as f:
f.readline()
for line in f:
tokens = line.strip().split(',')
score_dict[tokens[0]] = tokens[1]
# Store score for each epitope if available
for sequence in normal_epitopes:
if sequence in score_dict:
normal_dict[sequence][hla] = float(score_dict[sequence])
# Remove temporary files
if remove_files:
for file_to_remove in files_to_remove:
os.remove(file_to_remove)
# Return dictionary
return normal_dict
### Taken with modifications from https://www.biostars.org/p/306041/
def read_pair_generator(bam, contig):
"""
Generate read pairs in a BAM file or within a region string.
Reads are added to read_dict until a pair is found.
bam: path to BAM file
contig: name of contig to process
Return value: iterator of read1, read2 for each paired read
"""
read_dict = defaultdict(lambda: [None, None])
for read in bam.fetch(contig=contig):
if not read.is_proper_pair or read.is_secondary or read.is_supplementary:
# Ignore reads that aren't mapped as pairs or from primary alignments
continue
if read.query[-2:] in ['.1', '.2']:
# Strip '.1' or '.2' from query name
qname = read.query_name[:-2]
else:
qname = read.query_name
if qname not in read_dict:
# Establishing new read pair
if read.is_read1:
read_dict[qname][0] = read
else:
read_dict[qname][1] = read
else:
# Finish established read pair
if read.is_read1:
yield read, read_dict[qname][1]
else:
yield read_dict[qname][0], read
del read_dict[qname]
def create_bed_file(bed_path, transcripts, cds_dict):
""" Creates BED file of genome coordinates for relevant transcripts
bed_path: path to write BED file
transcripts: set of relevant transcript IDs
cds_dict: dictionary linking transcript IDs to GTF file entry info
Return value: None
"""
with open(bed_path, 'w') as f:
for tx in transcripts:
for cds in cds_dict[tx]:
out_line = [cds[0], str(cds[2]), str(cds[3]+1)]
print('\t'.join(out_line), file=f)
# Taken from Rail-RNA: https://github.com/nellore/rail
def parsed_md(md):
""" Divides an MD string up by boundaries between ^, letters, and numbers
md: an MD string (example: 33A^CC).
Return value: MD string split by boundaries described above.
"""
md_to_parse = []
md_group = [md[0]]
for i, char in enumerate(md):
if i == 0: continue
if (re.match('[A-Za-z]', char) is not None) \
!= (re.match('[A-Za-z]', md[i-1]) is not None) or \
(re.match('[0-9]', char) is not None) \
!= (re.match('[0-9]', md[i-1]) is not None):
if md_group:
md_to_parse.append(''.join(md_group))
md_group = [char]
else:
md_group.append(char)
if md_group:
md_to_parse.append(''.join(md_group))
return [char for char in md_to_parse if char != '0']
# Taken from Rail-RNA: https://github.com/nellore/rail
def indels_junctions_exons_mismatches(cigar, md, pos, seq, drop_deletions=False, junctions_only=False):
""" Finds indels, junctions, exons, mismatches from CIGAR, MD string, POS
cigar: CIGAR string
md: MD:Z string
pos: position of first aligned base
seq: read sequence
drop_deletions: drops deletions from coverage vectors iff True
junctions_only: does not populate mismatch list
Return value: tuple (insertions, deletions, junctions, exons, mismatches).
Insertions is a list of tuples (last genomic position before insertion,
string of inserted bases). Deletions is a list of tuples (first genomic
position of deletion, string of deleted bases). Junctions is a list
of tuples (intron start position (inclusive), intron end position (exclusive),
left_diplacement, right_displacement). Exons is a list of tuples (exon start
position (inclusive), exon end position (exclusive)). Mismatches is a list
of tuples (genomic position of mismatch, read base)
"""
insertions, deletions, junctions, exons, mismatches = [], [], [], [], []
cigar = re.split(r'([MINDS])', cigar)[:-1]
md = parsed_md(md)
seq_size = len(seq)
cigar_chars, cigar_sizes = [], []
cigar_index, md_index, seq_index = 0, 0, 0
max_cigar_index = len(cigar)
while cigar_index != max_cigar_index:
if cigar[cigar_index] == 0:
cigar_index += 2
continue
if cigar[cigar_index+1] == 'M':
aligned_base_cap = int(cigar[cigar_index])
aligned_bases = 0
while True:
try:
aligned_bases += int(md[md_index])
if aligned_bases <= aligned_base_cap:
md_index += 1
except ValueError:
# Not an int, but should not have reached a deletion
assert md[md_index] != '^', '\n'.join(['cigar and md:', ''.join(cigar), ''.join(md)])
if not junctions_only:
mismatches.append((pos + aligned_bases, seq[seq_index + aligned_bases]))
correction_length = len(md[md_index])
m_length = aligned_base_cap - aligned_bases
if correction_length > m_length:
md[md_index] = md[md_index][:m_length]
aligned_bases = aligned_base_cap
else:
aligned_bases += correction_length
md_index += 1
if aligned_bases > aligned_base_cap:
md[md_index] = aligned_bases - aligned_base_cap
break
elif aligned_bases == aligned_base_cap:
break
# Add exon
exons.append((pos, pos + aligned_base_cap))
pos += aligned_base_cap
seq_index += aligned_base_cap
elif cigar[cigar_index+1] == 'N':
skip_increment = int(cigar[cigar_index])
# Add junction
junctions.append((pos, pos + skip_increment, seq_index, seq_size - seq_index))
# Skip region of reference
pos += skip_increment
elif cigar[cigar_index+1] == 'I':
# Insertion
insert_size = int(cigar[cigar_index])
insertions.append((pos - 1, seq[seq_index:seq_index+insert_size]))
seq_index += insert_size
elif cigar[cigar_index+1] == 'D':
assert md[md_index] == '^', '\n'.join(['cigar and md:', ''.join(cigar), ''.join(md)])
# Deletion
delete_size = int(cigar[cigar_index])
md_delete_size = len(md[md_index+1])
assert md_delete_size >= delete_size
deletions.append((pos, md[md_index+1][:delete_size]))
if not drop_deletions: exons.append((pos, pos + delete_size))
if md_delete_size > delete_size:
md[md_index+1] = md[md_index+1][delete_size:]
else:
md_index += 2
# Skip deleted part of reference
pos += delete_size
else:
assert cigar[cigar_index+1] == 'S'
seq_index += int(cigar[cigar_index])
cigar_index += 2
# Merge exonic chunks/deletions; insertions/junctions could have chopped them up
new_exons = []
last_exon = exons[0]
for exon in exons[1:]:
if exon[0] == last_exon[1]:
# Merge ECs
last_exon = (last_exon[0], exon[1])
else:
# Push last exon to new exon list
new_exons.append(last_exon)
last_exon = exon
new_exons.append(last_exon)
return insertions, deletions, junctions, new_exons, mismatches
def get_expressed_transcripts(bam, reference_index, interval_dict, bed_path, remove_files=True):
''' Gets transcripts that are expressed by at least one read pair
bam: path to RNA-seq BAM file
reference_index: bowtie reference index
interval_dict: dictionary linking contigs to IntervalTree objects
connecting genomic intervals to overlapping transcripts
bed_path: path to BED file describing coordinates of transcripts of interest
Return value: set of transcript IDs that are expressed
'''
# Set up BAM reader/transcript set
transcript_set = set()
# Produce reduced BAM file
new_bam = bam.replace('.bam', '.reduced.bam')
subprocess.call(['samtools', 'view', '-b', '-L', bed_path, '-o', new_bam, bam])
subprocess.call(['samtools', 'index', new_bam, ''.join([new_bam, '.bai'])])
# Process BAM file
bam_reader = pysam.AlignmentFile(new_bam, 'rb')
for contig in reference_index.recs.keys():
for read1, read2 in read_pair_generator(bam_reader, contig):
# Get read info
r1_tokens = str(read1).split('\t')
r2_tokens = str(read2).split('\t')
r1_tags = ast.literal_eval(r1_tokens[11])
r2_tags = ast.literal_eval(r2_tokens[11])
r1_md = [x[1] for x in r1_tags if x[0] == 'MD'][0]
r2_md = [x[1] for x in r2_tags if x[0] == 'MD'][0]
r1_cigar = r1_tokens[5]
r2_cigar = r2_tokens[5]
r1_pos = int(r1_tokens[3])+1
r2_pos = int(r2_tokens[3])+1
r1_seq = r1_tokens[9]
r2_seq = r2_tokens[9]
# Extract data from each read
r1_insertions, r1_deletions, r1_junctions, r1_exons, r1_mismatches = indels_junctions_exons_mismatches(r1_cigar, r1_md, r1_pos, r1_seq)
r2_insertions, r2_deletions, r2_junctions, r2_exons, r2_mismatches = indels_junctions_exons_mismatches(r2_cigar, r2_md, r2_pos, r2_seq)
# Find genomic intervals covered, accounting for junctions
intervals = []
for e in r1_exons + r2_exons:
intervals.extend([e[0], e[1]+1])
# Find and store transcripts that overlap intervals
for i in range(0, len(intervals), 2):
overlapping_transcripts = interval_dict[contig].overlap(intervals[i], intervals[i+1])
for tx in [x.data for x in overlapping_transcripts]:
transcript_set.add(tx)
# Remove temporary files
if remove_files:
for file_to_remove in [new_bam, ''.join([new_bam, '.bai']), bed_path]:
os.remove(file_to_remove)
# Return trancript IDs
return transcript_set
def process_tcga_expression(tcga_dict, disease):
''' Produces set of transcripts expressed among TCGA samples
for a given cancer type
tcga_dict: TCGA expression dictionary - nested dictionary where
keys are TCGA disease types and values are sets of
transcript IDs for transcripts that are expressed at
a TPM of at least 1 for the 75th percentile expression
value for that transcript among tumor samples
disease: disease type, one of [melanoma, NSCLC, colon,
endometrial, thyroid, prostate, RCC]
Return value: set of expressed transcripts
'''
# ID TCGA disease
if disease == 'melanoma':
disease_list = ['SKCM']
elif disease == 'NSCLC':
disease_list = ['LUAD', 'LUSC']
elif disease == 'colon':
disease_list = ['COAD']
elif disease == 'endometrial':
disease_list = ['UCEC']
elif disease == 'thyroid':
disease_list = ['THCA']
elif disease == 'prostate':
disease_list = ['PRAD']
elif disease == 'RCC':
disease_list = ['KIRC']
# Identify TCGA-expressed transcripts
tx_set = set()
for disease_id in disease_list:
for transcript in tcga_dict[disease_id]:
# transcript is expressed
tx_set.add(transcript)
# Return transcript set
return tx_set
def extended_burden(patient, tumor, rna_id, epitope_dir, output_dir, blastp, db, rna_dir,
tcga_dict_path, disease, available_alleles, chain_file, liftover, binding_dir):
''' Generates extended burden information for each neoepitope
patient: patient identifier (string)
tumor: tumor sample identifier (string)
rna_id: tumor RNA sample identifier (string)
epitope_dir: path to directory with neoepitope predictions from neoepiscope (string)
output_dir: path to directory for script to write files to (string)
blastp: path to blastp executable (string)
db: path to human protein database (string)
rna_dir: path to directory with with RNA alignment subdirectories by tumor (string)
tcga_dict_path: path to dictionary with TCGA transcript expression values (string)
disease: disease of sample (string)
available alleles: path to neoepiscope's available alleles dictionary (string)
chain_file: path to chain file for liftover (string)
liftover: path to liftover executable (string)
binding_dir: path to directory with binding affinity dictionaries (string)
No return value, output file written.
'''
# Establish neoepitope directory/file
epitope_file = os.path.join(epitope_dir, ''.join([patient, '.', tumor, '.neoepiscope.comprehensive.out']))
# Find liftover coordinates
hg19_bed = os.path.join(output_dir, ''.join([patient, '.', tumor, '.hg19.bed']))
hg38_bed = os.path.join(output_dir, ''.join([patient, '.', tumor, '.hg38.bed']))
hg38_coordinates = liftover_coordinates(epitope_file, hg19_bed, hg38_bed, liftover, chain_file)
# Process neoepiscope data
with open(os.path.join(paths.gencode_v29, 'intervals_to_transcript.pickle'), 'rb') as interval_stream:
v29_intervals = pickle.load(interval_stream)
wildcard_path = ''.join([binding_dir, patient, '.', tumor, '.mhcflurry.*.pickle'])
epitope_dict, allele_list, relevant_transcripts = process_epitope_file(epitope_file, v29_intervals, hg38_coordinates, wildcard_path)
# Produce FASTA file for blast
fasta_path = os.path.join(output_dir, ''.join([patient, '.', tumor, '.epitopes.fasta']))
make_epitope_fasta(epitope_dict.keys(), fasta_path)
# Run blast
blast_output = os.path.join(output_dir, ''.join([patient, '.', tumor, '.blast.out']))
run_blast(fasta_path, db, blastp, blast_output)
# Process blast results
blast_data = process_blast(blast_output)
# Get matched normal binding scores
normal_binding = get_normal_binding_scores(blast_data, allele_list, available_alleles,
output_dir, '.'.join([patient, tumor]))
# Process RNA-seq data, if available
rna_bam = os.path.join(rna_dir, rna_id, ''.join([rna_id, '.sorted.bam']))
if os.path.isfile(rna_bam):
# Patient specific expression data available - load annotation info
with open(os.path.join(paths.gencode_v19, 'intervals_to_transcript.pickle'), 'rb') as interval_stream:
interval_dict = pickle.load(interval_stream)
with open(os.path.join(paths.gencode_v19, 'transcript_to_CDS.pickle'), 'rb') as cds_stream:
cds_dict = pickle.load(cds_stream)
reference_index = bowtie_index.BowtieIndexReference(paths.bowtie_hg19)
bed_file = os.path.join(output_dir, ''.join([patient, '.', tumor, '.bed']))
create_bed_file(bed_file, relevant_transcripts, cds_dict)
# Identify expressed transcripts
expressed_transcripts = get_expressed_transcripts(rna_bam, reference_index, interval_dict, bed_file)
else:
# No patient-specific expression data available
expressed_transcripts = None
# Process TCGA expression data
with open(tcga_dict_path, 'rb') as f:
tcga_dict = pickle.load(f)
tcga_expressed_transcripts = process_tcga_expression(tcga_dict, disease)
# Write output
output_file = os.path.join(output_dir, ''.join([patient, '.', tumor, '.extended_epitope_burden.tsv']))
with open(output_file, 'w') as o:
header = ['Patient_ID', 'Tumor_ID', 'Epitope', 'Mutation', 'Mutation_type', 'Paired_normal', 'Paired_mismatches',
'Blast_normal', 'Blast_mismatches', 'Anchor_mismatches', 'Transcript_IDs', 'Match_transcripts',
'HLA_binders', 'Alleles_bound', 'Matched_binders', 'Match_alleles', 'Patient_expression',
'Patient_transcripts', 'TCGA_expression', 'TCGA_transcripts']
print('\t'.join(header), file=o)
output_lines = set()
for epitope in epitope_dict:
# Determine matched normal data from blast, if available
if epitope in blast_data:
blast_normal = blast_data[epitope][3]
mismatches = str(sum(x1!=x2 for x1,x2 in zip(epitope,blast_normal)))
# Get anchor residue mismatch count
if mismatches != '0':
anchor_count = 0
if len(epitope) <= 11:
for i in [1, 4, 5, -1]:
if epitope[i] != blast_normal[i]:
anchor_count += 1
else:
for i in [0, 3, 5, 8]:
if epitope[i] != blast_normal[i]:
anchor_count += 1
anchor_mismatches = str(anchor_count)
else:
anchor_mismatches = '0'
normal_transcripts = ','.join(blast_data[epitope][1])
normal_genes = blast_data[epitope][1]
# Get matched normal binding scores
normal_alleles = [x for x in normal_binding[blast_normal] if normal_binding[blast_normal][x] <= 500.0]
if len(normal_alleles) > 0:
matched_binders = str(len(normal_alleles))
matched_alleles = ','.join(normal_alleles)
else:
matched_binders = '0'
matched_alleles = 'NA'
else:
blast_normal = 'NA'
mismatches = 'NA'
normal_transcripts = 'NA'
normal_genes = 'NA'
matched_binders = 'NA'
matched_alleles = 'NA'
anchor_mismatches = 'NA'
# Process through mutations associated with epitope
for mutation in epitope_dict[epitope]:
mut_id = ','.join(mutation[0:-1])
mut_type = mutation[-1]
for ep_data in epitope_dict[epitope][mutation]:
paired_normal = ep_data[0]
if paired_normal != 'NA':
paired_mismatches = str(sum(x1!=x2 for x1,x2 in zip(epitope,paired_normal)))
else:
paired_mismatches = 'NA'
transcripts = ep_data[1]
# Establish binding alleles
binding_alleles = []
for i in range(3, len(ep_data)):
if ep_data[i] == 1:
binding_alleles.append(allele_list[i-3])
if len(binding_alleles) > 0:
binders = str(len(binding_alleles))
alleles = ','.join(binding_alleles)
else:
binders = '0'
alleles = 'NA'
# Establish transcript expression if RNA-seq is available
patient_transcripts = []
for tx in transcripts:
if expressed_transcripts is not None:
if tx in expressed_transcripts:
patient_transcripts.append(tx)
if len(patient_transcripts) > 0:
patient_expression = str(len(patient_transcripts))
patient_tx_ids = ','.join(patient_transcripts)
elif expressed_transcripts is not None:
patient_expression = '0'
patient_tx_ids = 'NA'
else:
patient_expression = 'NA'
patient_tx_ids = 'NA'
# Establish TCGA expression if liftover was successful
tcga_transcripts = []
hg38_transcripts = ep_data[2]
for tx in hg38_transcripts:
if tx in tcga_expressed_transcripts:
tcga_transcripts.append(tx)
if len(tcga_transcripts) > 0:
tcga_expression = str(len(tcga_transcripts))
tcga_tx_ids = ','.join(tcga_transcripts)
else:
tcga_expression = '0'
tcga_tx_ids = 'NA'
out_line = [patient, tumor, epitope, mut_id, mut_type, paired_normal, paired_mismatches,
blast_normal, mismatches, anchor_mismatches, ','.join(transcripts), normal_transcripts,
binders, alleles, matched_binders, matched_alleles, patient_expression, patient_tx_ids,
tcga_expression, tcga_tx_ids]
print('\t'.join(out_line), file=o)
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--patient', type=str, required=True,
help='patient ID'
)
parser.add_argument('-w', '--wes', type=str, required=True,
help='tumor WES ID'
)
parser.add_argument('-r', '--rna', type=str, required=True,
help='tumor RNA-seq ID'
)
parser.add_argument('-n', '--neoepitope-directory', type=str, required=True,
help='path to directory containing neoepiscope output (comprehensive mode)'
)
parser.add_argument('-o', '--output-directory', type=str, required=True,
help='path to output directory'
)
parser.add_argument('-e', '--blastp-executable', type=str, required=True,
help='path to blastp executable'
)
parser.add_argument('-b', '--blastp-database', type=str, required=True,
help='path to blastp peptide database'
)
parser.add_argument('-a', '--rna-alignments', type=str, required=True,
help='path to directory containing RNA-seq alignment subdirectories'
)
parser.add_argument('-t', '--tcga-expression', type=str, required=True,
help='path to pickled dictionary with TCGA expression info'
)
parser.add_argument('-s', '--disease-site', type=str, required=True,
help='disease site (melanoma, NSCLC, colon, endometrial, thyroid, prostate, RCC)'
)
parser.add_argument('-d', '--available-alleles-dict', type=str, required=True,
help='path to pickled dictionary with available alleles for MHC binding predictors'
)
parser.add_argument('-l', '--liftover', type=str, required=True,
help='path to liftOver executable'
)
parser.add_argument('-c', '--chain-file', type=str, required=True,
help='path to hg19-to-hg38 chain file'
)
parser.add_argument('-m', '--mhc-binding-score-dir', type=str, required=True,
help='path to MHC binding score dict directory'
)
args = parser.parse_args()
extended_burden(args.patient, args.wes, args.rna, args.neoepitope_directory, args.output_directory,
args.blastp_executable, args.blastp_database, args.rna_alignments,
args.tcga_expression, args.disease_site, args.available_alleles_dict,
args.chain_file, args.liftover, args.mhc_binding_score_dir)