Support exon curation

oncokb · Sep 3, 2024 · c40dfb1 · c40dfb1
1 parent 61f38b9
commit c40dfb1
Show file tree

Hide file tree

Showing 11 changed files with 308 additions and 149 deletions.
diff --git a/src/main/java/org/mskcc/oncokb/curation/domain/enumeration/CNAConsequence.java b/src/main/java/org/mskcc/oncokb/curation/domain/enumeration/CNAConsequence.java
@@ -1,9 +1,9 @@
 package org.mskcc.oncokb.curation.domain.enumeration;
 
 public enum CNAConsequence {
-    AMPLIFICATION,
-    DELETION,
-    GAIN,
-    LOSS,
-    UNKNOWN,
+    CNA_AMPLIFICATION,
+    CNA_DELETION,
+    CNA_GAIN,
+    CNA_LOSS,
+    CNA_UNKNOWN,
 }
diff --git a/src/main/java/org/mskcc/oncokb/curation/domain/enumeration/SVConsequence.java b/src/main/java/org/mskcc/oncokb/curation/domain/enumeration/SVConsequence.java
@@ -1,11 +1,11 @@
 package org.mskcc.oncokb.curation.domain.enumeration;
 
 public enum SVConsequence {
-    DELETION,
-    TRANSLOCATION,
-    DUPLICATION,
-    INSERTION,
-    INVERSION,
-    FUSION,
-    UNKNOWN,
+    SV_DELETION,
+    SV_TRANSLOCATION,
+    SV_DUPLICATION,
+    SV_INSERTION,
+    SV_INVERSION,
+    SV_FUSION,
+    SV_UNKNOWN,
 }
diff --git a/src/main/java/org/mskcc/oncokb/curation/service/MainService.java b/src/main/java/org/mskcc/oncokb/curation/service/MainService.java
@@ -16,7 +16,6 @@
 import org.mskcc.oncokb.curation.domain.dto.HotspotInfoDTO;
 import org.mskcc.oncokb.curation.domain.dto.ProteinExonDTO;
 import org.mskcc.oncokb.curation.domain.enumeration.*;
-import org.mskcc.oncokb.curation.model.IntegerRange;
 import org.mskcc.oncokb.curation.service.dto.TranscriptDTO;
 import org.mskcc.oncokb.curation.service.mapper.TranscriptMapper;
 import org.mskcc.oncokb.curation.util.AlterationUtils;
@@ -280,87 +279,51 @@ public AlterationAnnotationStatus annotateAlteration(ReferenceGenome referenceGe
         }
         annotationDTO.setHotspot(hotspotInfoDTO);
 
-        if (
-            annotatedGenes.size() == 1 &&
-            PROTEIN_CHANGE.equals(alteration.getType()) &&
-            alteration.getStart() != null &&
-            alteration.getEnd() != null
-        ) {
-            Optional<TranscriptDTO> transcriptOptional = transcriptService.findByGeneAndReferenceGenomeAndCanonicalIsTrue(
-                annotatedGenes.stream().iterator().next(),
-                referenceGenome
-            );
-            if (transcriptOptional.isPresent()) {
-                List<GenomeFragment> utrs = transcriptOptional.orElseThrow().getUtrs();
-                List<GenomeFragment> exons = transcriptOptional.orElseThrow().getExons();
-                exons.sort((o1, o2) -> {
-                    int diff = o1.getStart() - o2.getStart();
-                    if (diff == 0) {
-                        diff = o1.getEnd() - o2.getEnd();
-                    }
-                    if (diff == 0) {
-                        diff = (int) (o1.getId() - o2.getId());
-                    }
-                    return diff;
-                });
-
-                List<GenomeFragment> codingExons = new ArrayList<>();
-                exons.forEach(exon -> {
-                    Integer start = exon.getStart();
-                    Integer end = exon.getEnd();
-                    for (GenomeFragment utr : utrs) {
-                        if (utr.getStart().equals(exon.getStart())) {
-                            start = utr.getEnd() + 1;
-                        }
-                        if (utr.getEnd().equals(exon.getEnd())) {
-                            end = utr.getStart() - 1;
-                        }
-                    }
-                    if (start < end) {
-                        GenomeFragment genomeFragment = new GenomeFragment();
-                        genomeFragment.setType(GenomeFragmentType.EXON);
-                        genomeFragment.setStart(start);
-                        genomeFragment.setEnd(end);
-                        codingExons.add(genomeFragment);
-                    } else {
-                        GenomeFragment genomeFragment = new GenomeFragment();
-                        genomeFragment.setType(GenomeFragmentType.EXON);
-                        genomeFragment.setStart(0);
-                        genomeFragment.setEnd(0);
-                        codingExons.add(genomeFragment);
-                    }
-                });
-
-                if (transcriptOptional.orElseThrow().getStrand() == -1) {
-                    Collections.reverse(codingExons);
-                }
-
-                List<ProteinExonDTO> proteinExons = new ArrayList<>();
-                int startAA = 1;
-                int previousExonCodonResidues = 0;
-                for (int i = 0; i < codingExons.size(); i++) {
-                    GenomeFragment genomeFragment = codingExons.get(i);
-                    if (genomeFragment.getStart() == 0) {
-                        continue;
-                    }
-                    int proteinLength = (previousExonCodonResidues + (genomeFragment.getEnd() - genomeFragment.getStart() + 1)) / 3;
-                    previousExonCodonResidues = (previousExonCodonResidues + (genomeFragment.getEnd() - genomeFragment.getStart() + 1)) % 3;
-                    ProteinExonDTO proteinExonDTO = new ProteinExonDTO();
-                    proteinExonDTO.setExon(i + 1);
-                    IntegerRange integerRange = new IntegerRange();
-                    integerRange.setStart(startAA);
-                    integerRange.setEnd(startAA + proteinLength - 1 + (previousExonCodonResidues > 0 ? 1 : 0));
-                    proteinExonDTO.setRange(integerRange);
-                    proteinExons.add(proteinExonDTO);
-                    startAA += proteinLength;
-                }
+        if (annotatedGenes.size() == 1) {
+            List<ProteinExonDTO> proteinExons = transcriptService.getExons(annotatedGenes.stream().iterator().next(), referenceGenome);
+            if (PROTEIN_CHANGE.equals(alteration.getType()) && alteration.getStart() != null && alteration.getEnd() != null) {
+                // Filter exons based on alteration range
                 List<ProteinExonDTO> overlap = proteinExons
                     .stream()
                     .filter(exon -> alteration.getStart() <= exon.getRange().getEnd() && alteration.getEnd() >= exon.getRange().getStart())
                     .collect(Collectors.toList());
                 annotationDTO.setExons(overlap);
+            } else if (AlterationUtils.isExon(alteration.getAlteration())) {
+                List<ProteinExonDTO> overlap = new ArrayList<>();
+                List<String> problematicExonAlts = new ArrayList<>();
+                for (String exonAlterationString : Arrays.asList(alteration.getAlteration().split("\\s*\\+\\s*"))) {
+                    Integer exonNumber = Integer.parseInt(exonAlterationString.replaceAll("\\D*", ""));
+                    if (exonNumber > 0 && exonNumber < proteinExons.size() + 1) {
+                        overlap.add(proteinExons.get(exonNumber - 1));
+                    } else {
+                        problematicExonAlts.add(exonAlterationString);
+                    }
+                }
+                if (problematicExonAlts.isEmpty()) {
+                    overlap.sort(Comparator.comparingInt(ProteinExonDTO::getExon));
+                    Boolean isConsecutiveExonRange =
+                        overlap
+                            .stream()
+                            .map(ProteinExonDTO::getExon)
+                            .reduce((prev, curr) -> (curr - prev == 1) ? curr : Integer.MIN_VALUE)
+                            .orElse(Integer.MIN_VALUE) !=
+                        Integer.MIN_VALUE;
+                    if (isConsecutiveExonRange && overlap.size() > 0) {
+                        alteration.setStart(overlap.get(0).getRange().getStart());
+                        alteration.setEnd(overlap.get(overlap.size() - 1).getRange().getEnd());
+                    }
+
+                    annotationDTO.setExons(overlap);
+                } else {
+                    StringBuilder sb = new StringBuilder();
+                    sb.append("The following exon(s) do not exist: ");
+                    sb.append(problematicExonAlts.stream().collect(Collectors.joining(", ")));
+                    alterationWithStatus.setMessage(sb.toString());
+                    alterationWithStatus.setType(EntityStatusType.ERROR);
+                }
             }
         }
+
         alterationWithStatus.setAnnotation(annotationDTO);
         return alterationWithStatus;
     }

diff --git a/src/main/java/org/mskcc/oncokb/curation/service/TranscriptService.java b/src/main/java/org/mskcc/oncokb/curation/service/TranscriptService.java
@@ -3,8 +3,6 @@
 import static org.mskcc.oncokb.curation.config.Constants.ENSEMBL_POST_THRESHOLD;
 
 import java.util.*;
-import java.util.List;
-import java.util.Optional;
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.genome_nexus.ApiException;
@@ -13,9 +11,11 @@
 import org.mskcc.oncokb.curation.config.cache.CacheCategory;
 import org.mskcc.oncokb.curation.config.cache.CacheNameResolver;
 import org.mskcc.oncokb.curation.domain.*;
+import org.mskcc.oncokb.curation.domain.dto.ProteinExonDTO;
 import org.mskcc.oncokb.curation.domain.enumeration.GenomeFragmentType;
 import org.mskcc.oncokb.curation.domain.enumeration.ReferenceGenome;
 import org.mskcc.oncokb.curation.domain.enumeration.SequenceType;
+import org.mskcc.oncokb.curation.model.IntegerRange;
 import org.mskcc.oncokb.curation.repository.TranscriptRepository;
 import org.mskcc.oncokb.curation.service.dto.ClustalOResp;
 import org.mskcc.oncokb.curation.service.dto.TranscriptDTO;
@@ -582,6 +582,77 @@ public List<EnrichedAlignmentResult> getAlignmentResult(
         }
     }
 
+    /**
+     * Computes the protein-level coordinates of the coding exons of a gene's canonical transcript.
+     *
+     * <p>Exons are ordered by genomic start, then end, then id (and reversed when the transcript strand is -1) and
+     * numbered from 1 in that order. A UTR that begins at an exon's start or finishes at an exon's end is trimmed
+     * off that exon. Exons left without coding sequence still consume an exon number but are omitted from the
+     * result, so the position of an element in the returned list is NOT necessarily {@code exon number - 1};
+     * use {@link ProteinExonDTO#getExon()} to identify an exon.
+     *
+     * <p>When an exon ends in the middle of a codon, that codon's position is included both at the end of this
+     * exon's range and at the start of the next exon's range.
+     *
+     * @param gene gene whose canonical transcript is looked up
+     * @param referenceGenome reference genome the canonical transcript belongs to
+     * @return coding exons in transcript order with 1-based, inclusive protein ranges; an empty list when no
+     *     canonical transcript is found
+     */
+    public List<ProteinExonDTO> getExons(Gene gene, ReferenceGenome referenceGenome) {
+        Optional<TranscriptDTO> transcriptOptional = this.findByGeneAndReferenceGenomeAndCanonicalIsTrue(gene, referenceGenome);
+        if (!transcriptOptional.isPresent()) {
+            return new ArrayList<>();
+        }
+        TranscriptDTO transcript = transcriptOptional.orElseThrow();
+        List<GenomeFragment> utrs = transcript.getUtrs();
+
+        // Sort a copy so the list owned by the transcript DTO is not reordered as a side effect.
+        List<GenomeFragment> exons = new ArrayList<>(transcript.getExons());
+        // Integer.compare/Long.compare instead of subtraction: a subtracted (and int-narrowed) difference can
+        // overflow and flip the ordering.
+        exons.sort((left, right) -> {
+            int order = Integer.compare(left.getStart(), right.getStart());
+            if (order == 0) {
+                order = Integer.compare(left.getEnd(), right.getEnd());
+            }
+            if (order == 0) {
+                order = Long.compare(left.getId(), right.getId());
+            }
+            return order;
+        });
+
+        // One entry per exon so that list position keeps tracking the exon number; exons without coding
+        // sequence are kept as 0-0 placeholders and skipped later.
+        List<GenomeFragment> codingExons = new ArrayList<>();
+        for (GenomeFragment exon : exons) {
+            Integer start = exon.getStart();
+            Integer end = exon.getEnd();
+            for (GenomeFragment utr : utrs) {
+                if (utr.getStart().equals(exon.getStart())) {
+                    start = utr.getEnd() + 1;
+                }
+                if (utr.getEnd().equals(exon.getEnd())) {
+                    end = utr.getStart() - 1;
+                }
+            }
+            // NOTE(review): start == end (a single coding base) is treated as non-coding here; confirm whether
+            // "start <= end" is the intended condition.
+            boolean hasCodingSequence = start < end;
+            GenomeFragment codingExon = new GenomeFragment();
+            codingExon.setType(GenomeFragmentType.EXON);
+            codingExon.setStart(hasCodingSequence ? start : 0);
+            codingExon.setEnd(hasCodingSequence ? end : 0);
+            codingExons.add(codingExon);
+        }
+
+        if (transcript.getStrand() == -1) {
+            Collections.reverse(codingExons);
+        }
+
+        List<ProteinExonDTO> proteinExons = new ArrayList<>();
+        int startAA = 1;
+        // Bases of an unfinished codon carried over from the previous coding exon (0, 1 or 2).
+        int previousExonCodonResidues = 0;
+        for (int i = 0; i < codingExons.size(); i++) {
+            GenomeFragment genomeFragment = codingExons.get(i);
+            if (genomeFragment.getStart() == 0) {
+                // Placeholder for an exon without coding sequence.
+                continue;
+            }
+            int codingBases = previousExonCodonResidues + (genomeFragment.getEnd() - genomeFragment.getStart() + 1);
+            int proteinLength = codingBases / 3;
+            previousExonCodonResidues = codingBases % 3;
+
+            IntegerRange integerRange = new IntegerRange();
+            integerRange.setStart(startAA);
+            // A trailing partial codon extends the range by one position; the next exon starts on that position.
+            integerRange.setEnd(startAA + proteinLength - 1 + (previousExonCodonResidues > 0 ? 1 : 0));
+
+            ProteinExonDTO proteinExonDTO = new ProteinExonDTO();
+            proteinExonDTO.setExon(i + 1);
+            proteinExonDTO.setRange(integerRange);
+            proteinExons.add(proteinExonDTO);
+            startAA += proteinLength;
+        }
+        return proteinExons;
+    }
+
     private Optional<EnsemblTranscript> getEnsemblTranscriptBySequence(
         List<EnsemblTranscript> availableEnsemblTranscripts,
         EnsemblSequence sequence

diff --git a/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java b/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java
@@ -8,7 +8,6 @@
 import java.util.stream.Collectors;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.commons.text.similarity.JaroWinklerSimilarity;
-import org.checkerframework.checker.regex.qual.Regex;
 import org.mskcc.oncokb.curation.domain.*;
 import org.mskcc.oncokb.curation.domain.enumeration.*;
 import org.springframework.stereotype.Component;
@@ -21,11 +20,15 @@ public class AlterationUtils {
     private static final String FUSION_REGEX = "\\s*(\\w*)" + FUSION_SEPARATOR + "(\\w*)\\s*(?i)(fusion)?\\s*";
     private static final String FUSION_ALT_REGEX = "\\s*(\\w*)" + FUSION_ALTERNATIVE_SEPARATOR + "(\\w*)\\s+(?i)fusion\\s*";
 
+    // A single exon alteration such as "Exon 14 Deletion" or "Exon 2-4 Duplication" (matching is case-sensitive).
+    // Capture groups: 1 = first exon number, 3 = last exon number of a range (null when no range is given),
+    // 4 = consequence word (Deletion, Insertion or Duplication).
+    private static final String EXON_ALT_REGEX = "Exon\\s+(\\d+)(-(\\d+))?\\s+(Deletion|Insertion|Duplication)";
+
+    // One or more exon alterations joined by "+", e.g. "Exon 2 Deletion + Exon 4 Insertion".
+    private static final String EXON_ALTS_REGEX = "(" + EXON_ALT_REGEX + ")(\\s*\\+\\s*" + EXON_ALT_REGEX + ")*";
+
     private Alteration parseFusion(String alteration) {
         Alteration alt = new Alteration();
 
         Consequence consequence = new Consequence();
-        consequence.setTerm(SVConsequence.FUSION.name());
+        consequence.setTerm(SVConsequence.SV_FUSION.name());
         alt.setType(AlterationType.STRUCTURAL_VARIANT);
         alt.setConsequence(consequence);
 
@@ -49,7 +52,7 @@ private Alteration parseFusion(String alteration) {
     }
 
     private Alteration parseCopyNumberAlteration(String alteration) {
-        CNAConsequence cnaTerm = CNAConsequence.UNKNOWN;
+        CNAConsequence cnaTerm = CNAConsequence.CNA_UNKNOWN;
 
         Optional<CNAConsequence> cnaConsequenceOptional = getCNAConsequence(alteration);
         if (cnaConsequenceOptional.isPresent()) {
@@ -90,6 +93,55 @@ private Alteration parseGenomicChange(String genomicChange) {
         return alt;
     }
 
+    /**
+     * Parses an exon-level alteration string into a structural-variant {@link Alteration}.
+     *
+     * <p>Every segment matching {@code EXON_ALT_REGEX} is expanded to one entry per exon, so
+     * {@code "Exon 2-4 Deletion"} becomes {@code "Exon 2 Deletion + Exon 3 Deletion + Exon 4 Deletion"}. A range
+     * whose last exon is lower than its first contributes no entries. The consequence is SV_INSERTION,
+     * SV_DUPLICATION or SV_DELETION when all segments share that term, and SV_UNKNOWN when the terms are mixed
+     * or nothing matches.
+     *
+     * @param alteration raw alteration text; it is kept unchanged as the alteration name
+     * @return alteration of type STRUCTURAL_VARIANT with the expanded exon list as its alteration string
+     * @throws NumberFormatException if an exon number does not fit in an {@code int}
+     */
+    private Alteration parseExonAlteration(String alteration) {
+        Alteration alt = new Alteration();
+        Consequence consequence = new Consequence();
+        consequence.setTerm(SVConsequence.SV_UNKNOWN.name());
+        alt.setType(AlterationType.STRUCTURAL_VARIANT);
+        alt.setConsequence(consequence);
+
+        Matcher matcher = Pattern.compile(EXON_ALT_REGEX).matcher(alteration);
+        List<String> splitResults = new ArrayList<>();
+        Set<String> consequenceTermSet = new HashSet<>();
+
+        while (matcher.find()) {
+            String startExonStr = matcher.group(1); // first exon number
+            String endExonStr = matcher.group(3); // last exon number, null when the segment is not a range
+            String consequenceTerm = matcher.group(4); // Deletion, Insertion or Duplication
+
+            // Locale.ROOT keeps the lowercasing locale-independent; with a Turkish default locale "Insertion"
+            // would become "ınsertion" and miss its case label.
+            switch (consequenceTerm.toLowerCase(java.util.Locale.ROOT)) {
+                case "insertion":
+                    consequence.setTerm(SVConsequence.SV_INSERTION.name());
+                    break;
+                case "duplication":
+                    consequence.setTerm(SVConsequence.SV_DUPLICATION.name());
+                    break;
+                case "deletion":
+                    consequence.setTerm(SVConsequence.SV_DELETION.name());
+                    break;
+                default:
+                    break;
+            }
+
+            // Segments with different consequence terms cannot be described by a single term.
+            consequenceTermSet.add(consequenceTerm);
+            if (consequenceTermSet.size() > 1) {
+                consequence.setTerm(SVConsequence.SV_UNKNOWN.name());
+            }
+
+            int startExon = Integer.parseInt(startExonStr);
+            int endExon = (endExonStr != null) ? Integer.parseInt(endExonStr) : startExon;
+
+            for (int exon = startExon; exon <= endExon; exon++) {
+                splitResults.add("Exon " + exon + " " + consequenceTerm);
+            }
+        }
+
+        alt.setAlteration(String.join(" + ", splitResults));
+
+        alt.setName(alteration);
+        return alt;
+    }
+
     public EntityStatus<Alteration> parseAlteration(String alteration) {
         EntityStatus<Alteration> entityWithStatus = new EntityStatus<>();
         String message = "";
@@ -130,6 +182,14 @@ public EntityStatus<Alteration> parseAlteration(String alteration) {
             return entityWithStatus;
         }
 
+        if (isExon(alteration)) {
+            Alteration alt = parseExonAlteration(alteration);
+            entityWithStatus.setEntity(alt);
+            entityWithStatus.setType(status);
+            entityWithStatus.setMessage(message);
+            return entityWithStatus;
+        }
+
         // the following is to parse the alteration as protein change
         MutationConsequence term = UNKNOWN;
         String ref = null;
@@ -474,6 +534,12 @@ public static Boolean isGenomicChange(String alteration) {
         return m.matches();
     }
 
+    /**
+     * Tells whether the whole string is an exon-level alteration as defined by {@code EXON_ALTS_REGEX}, i.e. one
+     * or more "Exon N[-M] Deletion|Insertion|Duplication" segments joined by "+".
+     *
+     * @param alteration alteration string to test; may be {@code null}
+     * @return {@code true} only when the entire string matches; {@code false} otherwise, including for {@code null}
+     */
+    public static Boolean isExon(String alteration) {
+        if (alteration == null) {
+            // Pattern.matcher(null) throws NullPointerException; a missing alteration is simply not an exon alteration.
+            return false;
+        }
+        return Pattern.compile(EXON_ALTS_REGEX).matcher(alteration).matches();
+    }
+
     public static String removeExclusionCriteria(String proteinChange) {
         Matcher exclusionMatch = getExclusionCriteriaMatcher(proteinChange);
         if (exclusionMatch.matches()) {