Improved the epitopes generation

jfnavarro · Jan 27, 2021 · 8cdacb4 · 8cdacb4
1 parent 76cd2f0
commit 8cdacb4
Show file tree

Hide file tree

Showing 5 changed files with 84 additions and 166 deletions.
diff --git a/hlapipeline/epitopes.py b/hlapipeline/epitopes.py
@@ -3,96 +3,12 @@
 """
 import re
 from Bio.Seq import translate
-from varcode import Variant
 
 
 def translate_dna(seq):
     return translate(seq, to_stop=True)
 
 
-def create_epitope_varcode(chrm, start, ref, alt, transcript, db='GRCh37'):
-    """
-    This function computes an epitope (mutated peptide) from a given mutation (variant)
-    It uses the package Varcode to obtain the WT and mutated sequences as well as the
-    mutation position.
-    :param db: the Ensembl database to use (GRCh37 or GRCh38)
-    :param chrm: the chromosome of the variant
-    :param start: the position of the variant
-    :param transcript: the transcript ID of the variant
-    :param ref: REF (reference allele) of the variant
-    :param alt: ALT (alternative allele) of the variant
-    :return:
-        The AA position
-        Error creating flags if any
-        The WT peptide (+12 -12)
-        The mutated peptide (+12 -12)
-    """
-    errors = list()
-    wt_mer = '-'
-    mut_mer = '-'
-    pos = -1
-
-    # Retrieve variant info
-    vinfo = Variant(contig=chrm, start=start, ref=ref, alt=alt, ensembl=db)
-    effects = vinfo.effects()
-    effect = None
-    for e in effects:
-        if e is not None and e.transcript_id == transcript \
-                and e.short_description is not None and e.short_description.startswith('p.'):
-            effect = e
-    if effect is None:
-        errors.append('Could not find effects for this transcript (using top effect)')
-        effect = effects.top_priority_effect()
-
-    # Retrieve effect type
-    protein_mut = effect.short_description
-    if protein_mut is None:
-        errors.append('Could not retrieve AA mutation for this effect')
-    elif not protein_mut.startswith('p.'):
-        errors.append('Invalid AA mutation: {}'.format(protein_mut))
-    elif protein_mut.startswith('p.X'):
-        errors.append('AA mutation occurs in stop codon')
-    else:
-        # Retrieve AA mut pos (it is already ajudted to 0-based)
-        pos = effect.aa_mutation_start_offset
-        if pos is None:
-            errors.append('Could not find the position for this mutation')
-        elif pos < 0:
-            errors.append('Can not code for this mutated position')
-        elif pos == 0:
-            errors.append('Mutation occurs in start codon')
-        else:
-            if effect.mutant_protein_sequence is None or effect.original_protein_sequence is None:
-                errors.append('Could not retrieve protein sequences')
-            else:
-                # Type of effect
-                effect_type = type(effect).__name__
-                if 'Stop' in effect_type:
-                    errors.append('Stop mutation')
-                elif 'FrameShift' in effect_type or 'Substitution' in effect_type \
-                        or 'Insertion' in effect_type or 'Deletion' in effect_type:
-                    end_wt = pos + 13
-                    if end_wt > len(effect.original_protein_sequence):
-                        errors.append('End of sequence is shorter than 12aa from mutation (WT)')
-                        end_wt = len(effect.original_protein_sequence)
-                    start = pos - 12
-                    if start < 0:
-                        errors.append('Start of sequence is shorter than 12aa from mutation')
-                        start = 0
-                    wt_mer = effect.original_protein_sequence[start:end_wt]
-                    if 'FrameShift' in effect_type:
-                        mut_mer = effect.mutant_protein_sequence[start:]
-                    else:
-                        end_mut = pos + 13
-                        if end_mut > len(effect.mutant_protein_sequence):
-                            errors.append('End of sequence is shorter than 12aa from mutation (MUT)')
-                            end_mut = len(effect.mutant_protein_sequence)
-                        mut_mer = effect.mutant_protein_sequence[start:end_mut]
-                else:
-                    errors.append('Unknown exonic function {}'.format(effect_type))
-    return pos, ';'.join(errors), wt_mer, mut_mer
-
-
 def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, protein_seq):
     """
     This function computes an epitope (mutated peptide) from a given mutation (variant)
@@ -134,7 +50,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
             else:
                 end = position + 13
                 if end > len(protein_seq):
-                    errors.append('End of sequence is shorter than 12aa from mutation')
+                    errors.append('Stop codon has been reached before 12aa')
                     end = len(protein_seq)
                 start = position - 12
                 if start < 0:
@@ -181,7 +97,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
                 mut_FASTA = str(translate_dna(mut_cDNA_seq.replace(' ', '')))
                 end_wt = position + 13
                 if end_wt > len(ref_FASTA):
-                    errors.append('End of sequence is shorter than 12aa from mutation')
+                    errors.append('Stop codon has been reached before 12aa (WT)')
                     end_wt = len(ref_FASTA)
                 start = position - 12
                 if start < 0:
@@ -191,7 +107,7 @@ def create_epitope(ref, alt, exonic_func, cDNA_mut, protein_mut, cDNA_seq, prote
                 if 'nonframeshift' in exonic_func:
                     end_mut = position + 13
                     if end_mut > len(mut_FASTA):
-                        errors.append('End of sequence is shorter than 12aa from mutation (MUT)')
+                        errors.append('Stop codon has been reached before 12aa (MUT)')
                         end_mut = len(mut_FASTA)
                     Mut_25mer = mut_FASTA[start:end_mut]
                 else:

diff --git a/hlapipeline/tools.py b/hlapipeline/tools.py
@@ -22,7 +22,7 @@
 OPTITYPE = 'OptiTypePipeline.py'
 YARAI = 'yara_indexer'
 YARAM = 'yara_mapper'
-# ANNOVAR location must be in $ANNOVAR
+# ANNOVAR location must be in $ANNOVAR_PATH
 ANNOVAR_PATH = os.environ['ANNOVAR_PATH']
 ANNOVAR_ANNO = 'refGene,knownGene,ensGene,avsnp150,gnomad211_exome,cosmic70 -operation g,g,g,f,f,f -nastring .'
 VCFTOOLS = 'vcftools'

diff --git a/hlapipeline/variants.py b/hlapipeline/variants.py
@@ -2,11 +2,12 @@
 @author: Jose Fernandez Navarro <[email protected]
 """
 from hlapipeline.epitopes import create_epitope
-from collections import defaultdict, namedtuple
+from collections import namedtuple
 import numpy as np
 import vcfpy
 
-Epitope = namedtuple('Epitope', 'transcript dnamut aamut flags wtseq mutseq')
+#  A convenience namedtuple to store the informatin of an epitope
+Epitope = namedtuple('Epitope', 'transcript gene func dnamut aamut flags wtseq mutseq')
 
 
 class Variant:
@@ -15,7 +16,6 @@ def __init__(self):
         self.start = None
         self.ref = None
         self.alt = None
-        self.effects = None
         self.epitopes = None
         self.callers = None
         self.num_callers = None
@@ -24,12 +24,13 @@ def __init__(self):
         self.dbsnp = None
         self.gnomad = None
         self.cosmic = None
+
     @property
     def key(self):
         return "{}:{} {}>{}".format(self.chrom, self.start, self.ref, self.alt)
 
     @property
-    def ensemblGene(self):
+    def ensGene(self):
         gene = None
         for effect in self.effects:
             if 'ensGene' in effect:
@@ -38,10 +39,19 @@ def ensemblGene(self):
         return gene
 
     @property
-    def geneName(self):
+    def knownGene(self):
         gene = None
         for effect in self.effects:
-            if 'knownGene' in effect or 'refGene' in effect:
+            if 'knownGene' in effect:
+                gene = effect.split("_")[1]
+                break
+        return gene
+
+    @property
+    def refGene(self):
+        gene = None
+        for effect in self.effects:
+            if 'refGene' in effect:
                 gene = effect.split("_")[1]
                 break
         return gene
@@ -50,75 +60,59 @@ def __str__(self):
         return '{}:{} {}>{} {} {}'.format(self.chrom, self.start, self.ref, self.alt, self.type, self.status)
 
 
-def effects(record, cDNA_seq_dict, AA_seq_dict):
+def epitopes(record, cDNA_seq_dict, AA_seq_dict):
     """
-    This function computes the effects of an Annovar annotated variant (record from vcfpy)
-    using 3 databases (Ensembl, NCBI and UCSC). The function will also compute the
-    mutated peptide of the variant for each effect/transcript.
+    This function computes the epitopes (mutated and wt peptides) of
+    an Annovar annotated variant (record from vcfpy) using the effects and
+    their isoforms from 3 databases (Ensembl, NCBI and UCSC).
     The function only considers nonsynonymous and framshift effects.
     :param record: A vcfpy record containing the variant information from Annovar
-    :param cDNA_seq_dic: a dictionary of cDNA sequences of the transcripts
-    :param AA_seq: a dictionary of AA sequences of the transcripts
+    :param cDNA_seq_dic: a dictionary of cDNA sequences of the transcripts ids
+    :param AA_seq: a dictionary of AA sequences of the transcripts ids
     :return:
-        A list of unique epitopes detected in the variant (dna_mut, aa_mut, flags, wt_peptide, mut_peptide)
-        A dict of unique effects detected in each of the variants unique mutated peptide
+        A list of unique epitopes detected in the variant
+        Epitope (transcript gene func dnamut aamut flags wtseq mutseq)
     """
     funcensGene = ''.join(record.INFO['ExonicFunc.ensGene'])
-    has_func_ens = 'nonsynonymous' in funcensGene or 'frame' in funcensGene
     funcknownGene = ''.join(record.INFO['ExonicFunc.knownGene'])
-    has_func_known = 'nonsynonymous' in funcknownGene or 'frame' in funcknownGene
     funcRefGene = ''.join(record.INFO['ExonicFunc.refGene'])
-    has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
-    already_processed = set()
+
     epitopes = list()
-    effects_seen_in = defaultdict(set)
-    if has_func_ens:
+    if 'nonsynonymous' in funcensGene or 'frame' in funcensGene:
         for mutation in record.INFO['AAChange.ensGene']:
             if len(mutation.split(':')) == 5:
                 gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
                 cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
                 AA_seq = AA_seq_dict.get(transcript, 'None').strip()
                 pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
                                                            funcensGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
-                if mutmer not in already_processed and mutmer != "-":
-                    already_processed.add(mutmer)
-                    epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
-                if mutmer != "-":
-                    effects_seen_in[mutmer].add('ensGene_{}_{}'.format(gene, funcensGene))
-    if has_func_known:
+                epitopes.append(Epitope(transcript, gene, funcensGene, mut_dna, mut_aa, flags, wtmer, mutmer))
+    if 'nonsynonymous' in funcknownGene or 'frame' in funcknownGene:
         for mutation in record.INFO['AAChange.knownGene']:
             if len(mutation.split(':')) == 5:
                 gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
                 cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
                 AA_seq = AA_seq_dict.get(transcript, 'None').strip()
                 pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
                                                            funcknownGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
-                if mutmer not in already_processed and mutmer != "-":
-                    already_processed.add(mutmer)
-                    epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
-                if mutmer != "-":
-                    effects_seen_in[mutmer].add('knownGene_{}_{}'.format(gene, funcknownGene))
-    if has_func_ref:
+                epitopes.append(Epitope(transcript, gene, funcknownGene, mut_dna, mut_aa, flags, wtmer, mutmer))
+    if 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene:
         for mutation in record.INFO['AAChange.refGene']:
             if len(mutation.split(':')) == 5:
                 gene, transcript, exon, mut_dna, mut_aa = mutation.split(':')
                 cDNA_seq = cDNA_seq_dict.get(transcript, 'None').strip()
                 AA_seq = AA_seq_dict.get(transcript, 'None').strip()
                 pos, flags, wtmer, mutmer = create_epitope(record.REF, record.ALT[0].serialize(),
                                                            funcRefGene, mut_dna, mut_aa, cDNA_seq, AA_seq)
-                if mutmer not in already_processed and mutmer != "-":
-                    already_processed.add(mutmer)
-                    epitopes.append(Epitope(transcript, mut_dna, mut_aa, flags, wtmer, mutmer))
-                if mutmer != "-":
-                    effects_seen_in[mutmer].add('refGene_{}_{}'.format(gene, funcRefGene))
+                epitopes.append(Epitope(transcript, gene, funcRefGene, mut_dna, mut_aa, flags, wtmer, mutmer))
 
-    return epitopes, effects_seen_in
+    return epitopes
 
 
 def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
                         tumor_var_freq, num_callers, cDNA_seq_dict, AA_seq_dict):
     """
-    This function parses a list of annotated RNA variants from Annovar.
+    This function processes a list of annotated RNA variants from Annovar (VCF).
     It then applies some filters to the variants and computes the epitopes of each of
     the variants nonsynonymous and frameshift effects.
     The input is expected to contain HaplotypeCaller and Varscan RNA variants.
@@ -145,7 +139,8 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
         has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
         avsnp150 = record.INFO['avsnp150'][0] if record.INFO['avsnp150'] != [] else 'NA'
         gnomad_AF = record.INFO['AF'][0] if record.INFO['AF'] != [] else 'NA'
-        cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO['cosmic70'] != [] else 'NA'
+        cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO[
+                                                                                        'cosmic70'] != [] else 'NA'
         if has_func_ens or has_func_known or has_func_ref:
             called = {x.sample: x.data for x in record.calls if x.called}
             filtered = dict()
@@ -168,18 +163,16 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
             except KeyError:
                 continue
 
-            is_valid = pass_variants >= num_callers
-            variant_effects, dbs_seen_in = effects(record, cDNA_seq_dict, AA_seq_dict)
+            variant_epitopes = epitopes(record, cDNA_seq_dict, AA_seq_dict)
             variant = Variant()
             variant.chrom = record.CHROM
             variant.start = record.POS
             variant.ref = record.REF
             variant.alt = record.ALT[0].serialize()
             variant.callers = '|'.join(['{}:{}'.format(key, value) for key, value in filtered.items()])
             variant.num_callers = len(filtered)
-            variant.status = is_valid
-            variant.effects = [';'.join(dbs_seen_in[r[-1]]) for r in variant_effects]
-            variant.epitopes = variant_effects
+            variant.status = pass_variants >= num_callers
+            variant.epitopes = variant_epitopes
             variant.dbsnp = avsnp150
             variant.gnomad = gnomad_AF
             variant.cosmic = cosmic70
@@ -191,9 +184,9 @@ def filter_variants_rna(file, tumor_coverage, tumor_var_depth,
 
 def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
                         tumor_var_freq, normal_var_freq, t2n_ratio, num_callers,
-                        num_callers_indel, cDNA_seq, AA_seq):
+                        num_callers_indel, cDNA_seq_dict, AA_seq_dict):
     """
-    This function parses a list of annotated DNA variants from Annovar.
+    This function processes a list of annotated DNA variants from Annovar (VCF).
     It then applies some filters to the variants and computes the epitopes of each of
     the variants nonsynonymous and frameshift effects.
     The input is expected to contain Mutect2, Strelka, SomaticSniper and Varscan DNA variants.
@@ -223,7 +216,8 @@ def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
         has_func_ref = 'nonsynonymous' in funcRefGene or 'frame' in funcRefGene
         avsnp150 = record.INFO['avsnp150'][0] if record.INFO['avsnp150'] != [] else 'NA'
         gnomad_AF = record.INFO['AF'][0] if record.INFO['AF'] != [] else 'NA'
-        cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO['cosmic70'] != [] else 'NA'
+        cosmic70 = ';'.join(record.INFO['cosmic70']).split(":")[1].split("-")[0] if record.INFO[
+                                                                                        'cosmic70'] != [] else 'NA'
 
         if has_func_ens or has_func_known or has_func_ref:
             called = {x.sample: x.data for x in record.calls if x.called}
@@ -343,18 +337,16 @@ def filter_variants_dna(file, normal_coverage, tumor_coverage, tumor_var_depth,
             except KeyError:
                 continue
 
-            is_valid = pass_snp >= num_callers or pass_indel >= num_callers_indel
-            variant_effects, dbs_seen_in = effects(record, cDNA_seq, AA_seq)
+            variant_epitopes = epitopes(record, cDNA_seq_dict, AA_seq_dict)
             variant = Variant()
             variant.chrom = record.CHROM
             variant.start = record.POS
             variant.ref = record.REF
             variant.alt = record.ALT[0].serialize()
             variant.callers = '|'.join(['{}:{}'.format(key, value) for key, value in filtered.items()])
             variant.num_callers = len(filtered)
-            variant.status = is_valid
-            variant.effects = [';'.join(dbs_seen_in[r[-1]]) for r in variant_effects]
-            variant.epitopes = variant_effects
+            variant.status = pass_snp >= num_callers or pass_indel >= num_callers_indel
+            variant.epitopes = variant_epitopes
             variant.dbsnp = avsnp150
             variant.gnomad = gnomad_AF
             variant.cosmic = cosmic70

diff --git a/hlapipeline/version.py b/hlapipeline/version.py
@@ -1 +1 @@
-version_number = "0.5.5"
+version_number = "0.6.0"