Skip to content

Commit

Permalink
Improved the epitopes generation
Browse files Browse the repository at this point in the history
  • Loading branch information
jfnavarro committed Jan 27, 2021
1 parent 76cd2f0 commit 8cdacb4
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 166 deletions.
90 changes: 3 additions & 87 deletions hlapipeline/epitopes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,96 +3,12 @@
"""
import re
from Bio.Seq import translate
from varcode import Variant


def translate_dna(seq):
return translate(seq, to_stop=True)


def create_epitope_varcode(chrm, start, ref, alt, transcript, db='GRCh37'):
"""
This function computes an epitope (mutated peptide) from a given mutation (variant)
It uses the package Varcode to obtain the WT and mutated sequences as well as the
mutation position.
:param db: the Ensembl database to use (GRCh37 or GRCh38)
:param chrm: the chromosome of the variant
:param start: the position of the variant
:param transcript: the transcript ID of the variant
:param ref: REF (reference allele) of the variant
:param alt: ALT (alternative allele) of the variant
:return:
The AA position
Error creating flags if any
The WT peptide (+12 -12)
The mutated peptide (+12 -12)
"""
errors = list()
wt_mer = '-'
mut_mer = '-'
pos = -1

# Retrieve variant info
vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db)
effects = vinfo.effects()
effect = None
for e in effects:
if e is not None and e.transcript_id == transcript \
and e.short_description is not None and e.short_description.startswith('p.'):
effect = e
if effect is None:
errors.append('Could not find effects for this transcript (using top effect)')
effect = effects.top_priority_effect()

# Retrieve effect type
protein_mut = effect.short_description
if protein_mut is None:
errors.append('Could not retrieve AA mutation for this effect')
elif not protein_mut.startswith('p.'):
errors.append('Invalid AA mutation: {}'.format(protein_mut))
elif protein_mut.startswith('p.X'):
errors.append('AA mutation occurs in stop codon')
else:
# Retrieve AA mut pos (it is already ajudted to 0-based)
pos = effect.aa_mutation_start_offset
if pos is None:
errors.append('Could not find the position for this mutation')
elif pos < 0:
errors.append('Can not code for this mutated position')
elif pos == 0:
errors.append('Mutation occurs in start codon')
else:
if effect.mutant_protein_sequence is None or effect.original_protein_sequence is None:
errors.append('Could not retrieve protein sequences')
else:
# Type of effect
effect_type = type(effect).__name__
if 'Stop' in effect_type:
errors.append('Stop mutation')
elif 'FrameShift' in effect_type or 'Substitution' in effect_type \
or 'Insertion' in effect_type or 'Deletion' in effect_type:
end_wt = pos + 13
if end_wt > len(effect.original_protein_sequence):
errors.append('End of sequence is shorter than 12aa from mutation (WT)')
end_wt = len(effect.original_protein_sequence)
start = pos - 12
if start < 0:
errors.append('Start of sequence is shorter than 12aa from mutation')
start = 0
wt_mer = effect.original_protein_sequence[start:end_wt]
if 'FrameShift' in effect_type:
mut_mer = effect.mutant_protein_sequence[start:]
else:
end_mut = pos + 13
if end_mut > len(effect.mutant_protein_sequence):
errors.append('End of sequence is shorter than 12aa from mutation (MUT)')
end_mut = len(effect.mutant_protein_sequence)
mut_mer = effect.mutant_protein_sequence[start:end_mut]
else:
errors.append('Unknown exonic function {}'.format(effect_type))
return pos, ';'.join(errors), wt_mer, mut_mer


def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, protein_seq):
"""
This function computes an epitope (mutated peptide) from a given mutation (variant)
Expand Down Expand Up @@ -134,7 +50,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
else:
end = position + 13
if end > len(protein_seq):
errors.append('End of sequence is shorter than 12aa from mutation')
errors.append('Stop codon has been reached before 12aa')
end = len(protein_seq)
start = position - 12
if start < 0:
Expand Down Expand Up @@ -181,7 +97,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
mut_FASTA = str(translate_dna(mut_cDNA_seq.replace(' ', '')))
end_wt = position + 13
if end_wt > len(ref_FASTA):
errors.append('End of sequence is shorter than 12aa from mutation')
errors.append('Stop codon has been reached before 12aa (WT)')
end_wt = len(ref_FASTA)
start = position - 12
if start < 0:
Expand All @@ -191,7 +107,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
if 'nonframeshift' in exonic_func:
end_mut = position + 13
if end_mut > len(mut_FASTA):
errors.append('End of sequence is shorter than 12aa from mutation (MUT)')
errors.append('Stop codon has been reached before 12aa (MUT)')
end_mut = len(mut_FASTA)
Mut_25mer = mut_FASTA[start:end_mut]
else:
Expand Down
2 changes: 1 addition & 1 deletion hlapipeline/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
OPTITYPE = 'OptiTypePipeline.py'
YARAI = 'yara_indexer'
YARAM = 'yara_mapper'
# ANNOVAR location must be in $ANNOVAR
# ANNOVAR location must be in $ANNOVAR_PATH
ANNOVAR_PATH = os.environ['ANNOVAR_PATH']
ANNOVAR_ANNO = 'refGene,knownGene,ensGene,avsnp150,gnomad211_exome,cosmic70 -operation g,g,g,f,f,f -nastring .'
VCFTOOLS = 'vcftools'
Expand Down
98 changes: 45 additions & 53 deletions hlapipeline/variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,12 @@
@author: Jose Fernandez Navarro <[email protected]
"""
from hlapipeline.epitopes import create_epitope
from collections import defaultdict, namedtuple
from collections import namedtuple
import numpy as np
import vcfpy

Epitope = namedtuple('Epitope', 'transcript dnamut aamut flags wtseq mutseq')
#  A convenience namedtuple to store the informatin of an epitope
Epitope = namedtuple('Epitope', 'transcript gene func dnamut aamut flags wtseq mutseq')


class Variant:
Expand All @@ -15,7 +16,6 @@ def __init__(self):
self.start = None
self.ref = None
self.alt = None
self.effects = None
self.epitopes = None
self.callers = None
self.num_callers = None
Expand All @@ -24,12 +24,13 @@ def __init__(self):
self.dbsnp = None
self.gnomad = None
self.cosmic = None

@property
def key(self):
return "{}:{} {}>{}".format(self.chrom, self.start, self.ref, self.alt)

@property
def ensemblGene(self):
def ensGene(self):
gene = None
for effect in self.effects:
if 'ensGene' in effect:
Expand All @@ -38,10 +39,19 @@ def ensemblGene(self):
return gene

@property
def geneName(self):
def knownGene(self):
gene = None
for effect in self.effects:
if 'knownGene' in effect or 'refGene' in effect:
if 'knownGene' in effect:
gene = effect.split("_")[1]
break
return gene

@property
def refGene(self):
gene = None
for effect in self.effects:
if 'refGene' in effect:
gene = effect.split("_")[1]
break
return gene
Expand All @@ -50,75 +60,59 @@ def __str__(self):
return '{}:{} {}>{} {} {}'.format(self.chrom, self.start, self.ref, self.alt, self.type, self.status)


def effects(record, cDNA_seq_dict, AA_seq_dict):
def epitopes(record, cDNA_seq_dict, AA_seq_dict):
"""
This function computes the effects of an Annovar annotated variant (record from vcfpy)
using 3 databases (Ensembl, NCBI and UCSC). The function will also compute the
mutated peptide of the variant for each effect/transcript.
This function computes the epitopes (mutated and wt peptides) of
an Annovar annotated variant (record from vcfpy) using the effects and
their isoforms from 3 databases (Ensembl, NCBI and UCSC).
The function only considers nonsynonymous and framshift effects.
:param record: A vcfpy record containing the variant information from Annovar
:param cDNA_seq_dic: a dictionary of cDNA sequences of the transcripts
:param AA_seq: a dictionary of AA sequences of the transcripts
:param cDNA_seq_dic: a dictionary of cDNA sequences of the transcripts ids
:param AA_seq: a dictionary of AA sequences of the transcripts ids
:return:
A list of unique epitopes detected in the variant (dna_mut, aa_mut, flags, wt_peptide, mut_peptide)
A dict of unique effects detected in each of the variants unique mutated peptide
A list of unique epitopes detected in the variant
Epitope (transcript gene func dnamut aamut flags wtseq mutseq)
"""
funcensGene = ''.join(record.INFO['ExonicFunc.ensGene'])
has_func_ens = 'nonsynonymous' in funcensGene or 'frame' in funcensGene
funcknownGene = ''.join(record.INFO['ExonicFunc.knownGene'])
has_func_known = 'nonsynonymous' in funcknownGene or 'frame' in funcknownGene
funcRefGene = ''.join(record.INFO['ExonicFunc.refGene'])
has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
already_processed = set()

epitopes = list()
effects_seen_in = defaultdict(set)
if has_func_ens:
if 'nonsynonymous' in funcensGene or 'frame' in funcensGene:
for mutation in record.INFO['AAChange.ensGene']:
if len(mutation.split(':')) == 5:
gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
AA_seq = AA_seq_dict.get(transcript, 'None').strip()
pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
funcensGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
if mutmer not in already_processed and mutmer != "-":
already_processed.add(mutmer)
epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
if mutmer != "-":
effects_seen_in[mutmer].add('ensGene_{}_{}'.format(gene, funcensGene))
if has_func_known:
epitopes.append(Epitope(transcript, gene, funcensGene, mut_dna, mut_aa, flags, wtmer, mutmer))
if 'nonsynonymous' in funcknownGene or 'frame' in funcknownGene:
for mutation in record.INFO['AAChange.knownGene']:
if len(mutation.split(':')) == 5:
gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
AA_seq = AA_seq_dict.get(transcript, 'None').strip()
pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
funcknownGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
if mutmer not in already_processed and mutmer != "-":
already_processed.add(mutmer)
epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
if mutmer != "-":
effects_seen_in[mutmer].add('knownGene_{}_{}'.format(gene, funcknownGene))
if has_func_ref:
epitopes.append(Epitope(transcript, gene, funcknownGene, mut_dna, mut_aa, flags, wtmer, mutmer))
if 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene:
for mutation in record.INFO['AAChange.refGene']:
if len(mutation.split(':')) == 5:
gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
AA_seq = AA_seq_dict.get(transcript, 'None').strip()
pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
funcRefGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
if mutmer not in already_processed and mutmer != "-":
already_processed.add(mutmer)
epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
if mutmer != "-":
effects_seen_in[mutmer].add('refGene_{}_{}'.format(gene, funcRefGene))
epitopes.append(Epitope(transcript, gene, funcRefGene, mut_dna, mut_aa, flags, wtmer, mutmer))

return epitopes, effects_seen_in
return epitopes


def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
tumor_var_freq, num_callers, cDNA_seq_dict, AA_seq_dict):
"""
This function parses a list of annotated RNA variants from Annovar.
This function processes a list of annotated RNA variants from Annovar (VCF).
It then applies some filters to the variants and computes the epitopes of each of
the variants nonsynonymous and frameshift effects.
The input is expected to contain HaplotypeCaller and Varscan RNA variants.
Expand All @@ -145,7 +139,8 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
avsnp150 = record.INFO['avsnp150'][0] if record.INFO['avsnp150'] != [] else 'NA'
gnomad_AF = record.INFO['AF'][0] if record.INFO['AF'] != [] else 'NA'
cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO['cosmic70'] != [] else 'NA'
cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO[
'cosmic70'] != [] else 'NA'
if has_func_ens or has_func_known or has_func_ref:
called = {x.sample: x.data for x in record.calls if x.called}
filtered = dict()
Expand All @@ -168,18 +163,16 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
except KeyError:
continue

is_valid = pass_variants >= num_callers
variant_effects, dbs_seen_in = effects(record, cDNA_seq_dict, AA_seq_dict)
variant_epitopes = epitopes(record, cDNA_seq_dict, AA_seq_dict)
variant = Variant()
variant.chrom = record.CHROM
variant.start = record.POS
variant.ref = record.REF
variant.alt = record.ALT[0].serialize()
variant.callers = '|'.join(['{}:{}'.format(key, value) for key, value in filtered.items()])
variant.num_callers = len(filtered)
variant.status = is_valid
variant.effects = [';'.join(dbs_seen_in[r[-1]]) for r in variant_effects]
variant.epitopes = variant_effects
variant.status = pass_variants >= num_callers
variant.epitopes = variant_epitopes
variant.dbsnp = avsnp150
variant.gnomad = gnomad_AF
variant.cosmic = cosmic70
Expand All @@ -191,9 +184,9 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,

def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
tumor_var_freq, normal_var_freq, t2n_ratio, num_callers,
num_callers_indel, cDNA_seq, AA_seq):
num_callers_indel, cDNA_seq_dict, AA_seq_dict):
"""
This function parses a list of annotated DNA variants from Annovar.
This function processes a list of annotated DNA variants from Annovar (VCF).
It then applies some filters to the variants and computes the epitopes of each of
the variants nonsynonymous and frameshift effects.
The input is expected to contain Mutect2, Strelka, SomaticSniper and Varscan DNA variants.
Expand Down Expand Up @@ -223,7 +216,8 @@ def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
avsnp150 = record.INFO['avsnp150'][0] if record.INFO['avsnp150'] != [] else 'NA'
gnomad_AF = record.INFO['AF'][0] if record.INFO['AF'] != [] else 'NA'
cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO['cosmic70'] != [] else 'NA'
cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO[
'cosmic70'] != [] else 'NA'

if has_func_ens or has_func_known or has_func_ref:
called = {x.sample: x.data for x in record.calls if x.called}
Expand Down Expand Up @@ -343,18 +337,16 @@ def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
except KeyError:
continue

is_valid = pass_snp >= num_callers or pass_indel >= num_callers_indel
variant_effects, dbs_seen_in = effects(record, cDNA_seq, AA_seq)
variant_epitopes = epitopes(record, cDNA_seq_dict, AA_seq_dict)
variant = Variant()
variant.chrom = record.CHROM
variant.start = record.POS
variant.ref = record.REF
variant.alt = record.ALT[0].serialize()
variant.callers = '|'.join(['{}:{}'.format(key, value) for key, value in filtered.items()])
variant.num_callers = len(filtered)
variant.status = is_valid
variant.effects = [';'.join(dbs_seen_in[r[-1]]) for r in variant_effects]
variant.epitopes = variant_effects
variant.status = pass_snp >= num_callers or pass_indel >= num_callers_indel
variant.epitopes = variant_epitopes
variant.dbsnp = avsnp150
variant.gnomad = gnomad_AF
variant.cosmic = cosmic70
Expand Down
2 changes: 1 addition & 1 deletion hlapipeline/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
version_number = "0.5.5"
version_number = "0.6.0"
Loading

0 comments on commit 8cdacb4

Please sign in to comment.