Skip to content

Commit

Permalink
Support exon curation
Browse files Browse the repository at this point in the history
  • Loading branch information
calvinlu3 committed Sep 3, 2024
1 parent 61f38b9 commit c40dfb1
Show file tree
Hide file tree
Showing 11 changed files with 308 additions and 149 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
package org.mskcc.oncokb.curation.domain.enumeration;

/**
 * Consequence terms used for copy number alterations.
 *
 * NOTE(review): this listing contains both unprefixed and CNA_-prefixed constants. It looks like a
 * flattened diff in which the unprefixed names were replaced by the prefixed ones - confirm against
 * the actual file before relying on the unprefixed values.
 */
public enum CNAConsequence {
AMPLIFICATION,
DELETION,
GAIN,
LOSS,
UNKNOWN,
CNA_AMPLIFICATION,
CNA_DELETION,
CNA_GAIN,
CNA_LOSS,
// Used as the initial/default term when parsing a copy number alteration.
CNA_UNKNOWN,
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
package org.mskcc.oncokb.curation.domain.enumeration;

/**
 * Consequence terms used for structural variants (fusions and exon-level alterations).
 *
 * NOTE(review): this listing contains both unprefixed and SV_-prefixed constants. It looks like a
 * flattened diff in which the unprefixed names were replaced by the prefixed ones - confirm against
 * the actual file before relying on the unprefixed values.
 */
public enum SVConsequence {
DELETION,
TRANSLOCATION,
DUPLICATION,
INSERTION,
INVERSION,
FUSION,
UNKNOWN,
SV_DELETION,
SV_TRANSLOCATION,
SV_DUPLICATION,
SV_INSERTION,
SV_INVERSION,
SV_FUSION,
// Used as the initial/default term when parsing an exon-level alteration.
SV_UNKNOWN,
}
113 changes: 38 additions & 75 deletions src/main/java/org/mskcc/oncokb/curation/service/MainService.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
import org.mskcc.oncokb.curation.domain.dto.HotspotInfoDTO;
import org.mskcc.oncokb.curation.domain.dto.ProteinExonDTO;
import org.mskcc.oncokb.curation.domain.enumeration.*;
import org.mskcc.oncokb.curation.model.IntegerRange;
import org.mskcc.oncokb.curation.service.dto.TranscriptDTO;
import org.mskcc.oncokb.curation.service.mapper.TranscriptMapper;
import org.mskcc.oncokb.curation.util.AlterationUtils;
Expand Down Expand Up @@ -280,87 +279,51 @@ public AlterationAnnotationStatus annotateAlteration(ReferenceGenome referenceGe
}
annotationDTO.setHotspot(hotspotInfoDTO);

if (
annotatedGenes.size() == 1 &&
PROTEIN_CHANGE.equals(alteration.getType()) &&
alteration.getStart() != null &&
alteration.getEnd() != null
) {
Optional<TranscriptDTO> transcriptOptional = transcriptService.findByGeneAndReferenceGenomeAndCanonicalIsTrue(
annotatedGenes.stream().iterator().next(),
referenceGenome
);
if (transcriptOptional.isPresent()) {
List<GenomeFragment> utrs = transcriptOptional.orElseThrow().getUtrs();
List<GenomeFragment> exons = transcriptOptional.orElseThrow().getExons();
exons.sort((o1, o2) -> {
int diff = o1.getStart() - o2.getStart();
if (diff == 0) {
diff = o1.getEnd() - o2.getEnd();
}
if (diff == 0) {
diff = (int) (o1.getId() - o2.getId());
}
return diff;
});

List<GenomeFragment> codingExons = new ArrayList<>();
exons.forEach(exon -> {
Integer start = exon.getStart();
Integer end = exon.getEnd();
for (GenomeFragment utr : utrs) {
if (utr.getStart().equals(exon.getStart())) {
start = utr.getEnd() + 1;
}
if (utr.getEnd().equals(exon.getEnd())) {
end = utr.getStart() - 1;
}
}
if (start < end) {
GenomeFragment genomeFragment = new GenomeFragment();
genomeFragment.setType(GenomeFragmentType.EXON);
genomeFragment.setStart(start);
genomeFragment.setEnd(end);
codingExons.add(genomeFragment);
} else {
GenomeFragment genomeFragment = new GenomeFragment();
genomeFragment.setType(GenomeFragmentType.EXON);
genomeFragment.setStart(0);
genomeFragment.setEnd(0);
codingExons.add(genomeFragment);
}
});

if (transcriptOptional.orElseThrow().getStrand() == -1) {
Collections.reverse(codingExons);
}

List<ProteinExonDTO> proteinExons = new ArrayList<>();
int startAA = 1;
int previousExonCodonResidues = 0;
for (int i = 0; i < codingExons.size(); i++) {
GenomeFragment genomeFragment = codingExons.get(i);
if (genomeFragment.getStart() == 0) {
continue;
}
int proteinLength = (previousExonCodonResidues + (genomeFragment.getEnd() - genomeFragment.getStart() + 1)) / 3;
previousExonCodonResidues = (previousExonCodonResidues + (genomeFragment.getEnd() - genomeFragment.getStart() + 1)) % 3;
ProteinExonDTO proteinExonDTO = new ProteinExonDTO();
proteinExonDTO.setExon(i + 1);
IntegerRange integerRange = new IntegerRange();
integerRange.setStart(startAA);
integerRange.setEnd(startAA + proteinLength - 1 + (previousExonCodonResidues > 0 ? 1 : 0));
proteinExonDTO.setRange(integerRange);
proteinExons.add(proteinExonDTO);
startAA += proteinLength;
}
if (annotatedGenes.size() == 1) {
List<ProteinExonDTO> proteinExons = transcriptService.getExons(annotatedGenes.stream().iterator().next(), referenceGenome);
if (PROTEIN_CHANGE.equals(alteration.getType()) && alteration.getStart() != null && alteration.getEnd() != null) {
// Filter exons based on alteration range
List<ProteinExonDTO> overlap = proteinExons
.stream()
.filter(exon -> alteration.getStart() <= exon.getRange().getEnd() && alteration.getEnd() >= exon.getRange().getStart())
.collect(Collectors.toList());
annotationDTO.setExons(overlap);
} else if (AlterationUtils.isExon(alteration.getAlteration())) {
List<ProteinExonDTO> overlap = new ArrayList<>();
List<String> problematicExonAlts = new ArrayList<>();
for (String exonAlterationString : Arrays.asList(alteration.getAlteration().split("\\s*\\+\\s*"))) {
Integer exonNumber = Integer.parseInt(exonAlterationString.replaceAll("\\D*", ""));
if (exonNumber > 0 && exonNumber < proteinExons.size() + 1) {
overlap.add(proteinExons.get(exonNumber - 1));
} else {
problematicExonAlts.add(exonAlterationString);
}
}
if (problematicExonAlts.isEmpty()) {
overlap.sort(Comparator.comparingInt(ProteinExonDTO::getExon));
Boolean isConsecutiveExonRange =
overlap
.stream()
.map(ProteinExonDTO::getExon)
.reduce((prev, curr) -> (curr - prev == 1) ? curr : Integer.MIN_VALUE)
.orElse(Integer.MIN_VALUE) !=
Integer.MIN_VALUE;
if (isConsecutiveExonRange && overlap.size() > 0) {
alteration.setStart(overlap.get(0).getRange().getStart());
alteration.setEnd(overlap.get(overlap.size() - 1).getRange().getEnd());
}

annotationDTO.setExons(overlap);
} else {
StringBuilder sb = new StringBuilder();
sb.append("The following exon(s) do not exist: ");
sb.append(problematicExonAlts.stream().collect(Collectors.joining(", ")));
alterationWithStatus.setMessage(sb.toString());
alterationWithStatus.setType(EntityStatusType.ERROR);
}
}
}

alterationWithStatus.setAnnotation(annotationDTO);
return alterationWithStatus;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import static org.mskcc.oncokb.curation.config.Constants.ENSEMBL_POST_THRESHOLD;

import java.util.*;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.genome_nexus.ApiException;
Expand All @@ -13,9 +11,11 @@
import org.mskcc.oncokb.curation.config.cache.CacheCategory;
import org.mskcc.oncokb.curation.config.cache.CacheNameResolver;
import org.mskcc.oncokb.curation.domain.*;
import org.mskcc.oncokb.curation.domain.dto.ProteinExonDTO;
import org.mskcc.oncokb.curation.domain.enumeration.GenomeFragmentType;
import org.mskcc.oncokb.curation.domain.enumeration.ReferenceGenome;
import org.mskcc.oncokb.curation.domain.enumeration.SequenceType;
import org.mskcc.oncokb.curation.model.IntegerRange;
import org.mskcc.oncokb.curation.repository.TranscriptRepository;
import org.mskcc.oncokb.curation.service.dto.ClustalOResp;
import org.mskcc.oncokb.curation.service.dto.TranscriptDTO;
Expand Down Expand Up @@ -582,6 +582,77 @@ public List<EnrichedAlignmentResult> getAlignmentResult(
}
}

/**
 * Computes, for the transcript returned by
 * {@code findByGeneAndReferenceGenomeAndCanonicalIsTrue(gene, referenceGenome)}, the protein
 * (amino acid) range covered by each exon that still has coding bases after UTR trimming.
 *
 * <p>Steps:
 * <ol>
 *   <li>Exons are sorted (in place) by start, then end, then id.</li>
 *   <li>Each exon is trimmed by any UTR that shares its start or its end coordinate. Exons with no
 *       bases left are kept as {@code 0..0} placeholders so that exon numbering is not shifted.</li>
 *   <li>When the transcript's strand is {@code -1} the exon list is reversed.</li>
 *   <li>Coding lengths are accumulated into amino acid ranges. A codon split across two exons is
 *       included in the range of both exons.</li>
 * </ol>
 *
 * @param gene            gene whose transcript is looked up
 * @param referenceGenome reference genome used for the lookup
 * @return one {@link ProteinExonDTO} per exon with coding bases, where {@code exon} is the 1-based
 *         position of the exon in the (possibly reversed) exon list; an empty mutable list when no
 *         transcript is found
 */
public List<ProteinExonDTO> getExons(Gene gene, ReferenceGenome referenceGenome) {
    Optional<TranscriptDTO> transcriptOptional = this.findByGeneAndReferenceGenomeAndCanonicalIsTrue(gene, referenceGenome);
    if (!transcriptOptional.isPresent()) {
        return new ArrayList<>();
    }
    TranscriptDTO transcript = transcriptOptional.orElseThrow();

    List<GenomeFragment> utrs = transcript.getUtrs();
    List<GenomeFragment> exons = transcript.getExons();
    // Comparator chain instead of subtraction: subtracting ints can overflow and the former
    // (int) cast of an id difference could truncate, both of which break the ordering contract.
    exons.sort(
        Comparator.comparingInt(GenomeFragment::getStart)
            .thenComparingInt(GenomeFragment::getEnd)
            .thenComparingLong(GenomeFragment::getId)
    );

    List<GenomeFragment> codingExons = new ArrayList<>();
    for (GenomeFragment exon : exons) {
        int codingStart = exon.getStart();
        int codingEnd = exon.getEnd();
        for (GenomeFragment utr : utrs) {
            // A UTR anchored at the exon start trims the front of the exon.
            if (utr.getStart().equals(exon.getStart())) {
                codingStart = utr.getEnd() + 1;
            }
            // A UTR anchored at the exon end trims the back of the exon.
            if (utr.getEnd().equals(exon.getEnd())) {
                codingEnd = utr.getStart() - 1;
            }
        }

        // Coordinates are inclusive (length is end - start + 1), so start == end is a valid
        // one-base coding segment and must not be discarded.
        boolean hasCodingBases = codingStart <= codingEnd;

        // Exons without coding bases become 0..0 placeholders to preserve exon numbering.
        GenomeFragment codingExon = new GenomeFragment();
        codingExon.setType(GenomeFragmentType.EXON);
        codingExon.setStart(hasCodingBases ? codingStart : 0);
        codingExon.setEnd(hasCodingBases ? codingEnd : 0);
        codingExons.add(codingExon);
    }

    if (transcript.getStrand() == -1) {
        Collections.reverse(codingExons);
    }

    List<ProteinExonDTO> proteinExons = new ArrayList<>();
    int startAA = 1;
    // Number of coding bases (0-2) left over from previous exons that do not yet form a full codon.
    int previousExonCodonResidues = 0;
    for (int i = 0; i < codingExons.size(); i++) {
        GenomeFragment codingExon = codingExons.get(i);
        if (codingExon.getStart() == 0) {
            // Placeholder for an exon without coding bases.
            continue;
        }
        int codingBases = previousExonCodonResidues + (codingExon.getEnd() - codingExon.getStart() + 1);
        int proteinLength = codingBases / 3;
        previousExonCodonResidues = codingBases % 3;

        IntegerRange integerRange = new IntegerRange();
        integerRange.setStart(startAA);
        // A trailing partial codon extends this exon's range by one residue; the next exon
        // starts on that same residue.
        integerRange.setEnd(startAA + proteinLength - 1 + (previousExonCodonResidues > 0 ? 1 : 0));

        ProteinExonDTO proteinExonDTO = new ProteinExonDTO();
        proteinExonDTO.setExon(i + 1);
        proteinExonDTO.setRange(integerRange);
        proteinExons.add(proteinExonDTO);

        startAA += proteinLength;
    }
    return proteinExons;
}

private Optional<EnsemblTranscript> getEnsemblTranscriptBySequence(
List<EnsemblTranscript> availableEnsemblTranscripts,
EnsemblSequence sequence
Expand Down
72 changes: 69 additions & 3 deletions src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import java.util.stream.Collectors;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.text.similarity.JaroWinklerSimilarity;
import org.checkerframework.checker.regex.qual.Regex;
import org.mskcc.oncokb.curation.domain.*;
import org.mskcc.oncokb.curation.domain.enumeration.*;
import org.springframework.stereotype.Component;
Expand All @@ -21,11 +20,15 @@ public class AlterationUtils {
private static final String FUSION_REGEX = "\\s*(\\w*)" + FUSION_SEPARATOR + "(\\w*)\\s*(?i)(fusion)?\\s*";
private static final String FUSION_ALT_REGEX = "\\s*(\\w*)" + FUSION_ALTERNATIVE_SEPARATOR + "(\\w*)\\s+(?i)fusion\\s*";

// Matches a single exon-level alteration such as "Exon 14 Deletion" or "Exon 2-4 Duplication".
// Group 1 is the first exon number, group 3 the optional last exon number of a range and
// group 4 the consequence word (Deletion, Insertion or Duplication). Matching is case-sensitive.
private static final String EXON_ALT_REGEX = "Exon\\s+(\\d+)(-(\\d+))?\\s+(Deletion|Insertion|Duplication)";

// Matches one or more exon-level alterations joined by "+", e.g. "Exon 2 Deletion + Exon 5 Insertion".
private static final String EXON_ALTS_REGEX = "(" + EXON_ALT_REGEX + ")(\\s*\\+\\s*" + EXON_ALT_REGEX + ")*";

private Alteration parseFusion(String alteration) {
Alteration alt = new Alteration();

Consequence consequence = new Consequence();
consequence.setTerm(SVConsequence.FUSION.name());
consequence.setTerm(SVConsequence.SV_FUSION.name());
alt.setType(AlterationType.STRUCTURAL_VARIANT);
alt.setConsequence(consequence);

Expand All @@ -49,7 +52,7 @@ private Alteration parseFusion(String alteration) {
}

private Alteration parseCopyNumberAlteration(String alteration) {
CNAConsequence cnaTerm = CNAConsequence.UNKNOWN;
CNAConsequence cnaTerm = CNAConsequence.CNA_UNKNOWN;

Optional<CNAConsequence> cnaConsequenceOptional = getCNAConsequence(alteration);
if (cnaConsequenceOptional.isPresent()) {
Expand Down Expand Up @@ -90,6 +93,55 @@ private Alteration parseGenomicChange(String genomicChange) {
return alt;
}

/**
 * Parses an exon-level alteration string (for example {@code "Exon 2-4 Deletion"} or
 * {@code "Exon 2 Deletion + Exon 5 Insertion"}) into a structural variant {@link Alteration}.
 *
 * <p>Every match of {@code EXON_ALT_REGEX} is expanded into one entry per exon, so
 * {@code "Exon 2-4 Deletion"} becomes {@code "Exon 2 Deletion + Exon 3 Deletion + Exon 4 Deletion"}.
 * A range whose end is lower than its start contributes no entries.
 *
 * <p>The consequence term is the SV term of the matched consequence word when all matches share
 * the same word, and {@code SV_UNKNOWN} when they differ or when nothing matches.
 *
 * @param alteration raw alteration string; stored unchanged as the alteration name
 * @return the alteration with type, consequence, expanded alteration string and name set
 */
private Alteration parseExonAlteration(String alteration) {
    Alteration alt = new Alteration();
    Consequence consequence = new Consequence();
    consequence.setTerm(SVConsequence.SV_UNKNOWN.name());
    alt.setType(AlterationType.STRUCTURAL_VARIANT);
    alt.setConsequence(consequence);

    Matcher matcher = Pattern.compile(EXON_ALT_REGEX).matcher(alteration);
    List<String> splitResults = new ArrayList<>();
    Set<String> consequenceTermSet = new HashSet<>();

    while (matcher.find()) {
        String startExonStr = matcher.group(1); // first exon number
        String endExonStr = matcher.group(3); // last exon number of a range, null for a single exon
        String consequenceTerm = matcher.group(4); // Deletion, Insertion or Duplication

        // Locale.ROOT keeps the lower-casing independent of the JVM default locale
        // (e.g. in a Turkish locale "Insertion" would not lower-case to "insertion").
        switch (consequenceTerm.toLowerCase(java.util.Locale.ROOT)) {
            case "insertion":
                consequence.setTerm(SVConsequence.SV_INSERTION.name());
                break;
            case "duplication":
                consequence.setTerm(SVConsequence.SV_DUPLICATION.name());
                break;
            case "deletion":
                consequence.setTerm(SVConsequence.SV_DELETION.name());
                break;
            default:
                break;
        }

        // Mixed consequence words cannot be summarised by a single term.
        consequenceTermSet.add(consequenceTerm);
        if (consequenceTermSet.size() > 1) {
            consequence.setTerm(SVConsequence.SV_UNKNOWN.name());
        }

        int startExon = Integer.parseInt(startExonStr);
        int endExon = (endExonStr != null) ? Integer.parseInt(endExonStr) : startExon;

        // Expand a range into one entry per exon.
        for (int exon = startExon; exon <= endExon; exon++) {
            splitResults.add("Exon " + exon + " " + consequenceTerm);
        }
    }

    alt.setAlteration(String.join(" + ", splitResults));

    alt.setName(alteration);
    return alt;
}

public EntityStatus<Alteration> parseAlteration(String alteration) {
EntityStatus<Alteration> entityWithStatus = new EntityStatus<>();
String message = "";
Expand Down Expand Up @@ -130,6 +182,14 @@ public EntityStatus<Alteration> parseAlteration(String alteration) {
return entityWithStatus;
}

if (isExon(alteration)) {
Alteration alt = parseExonAlteration(alteration);
entityWithStatus.setEntity(alt);
entityWithStatus.setType(status);
entityWithStatus.setMessage(message);
return entityWithStatus;
}

// the following is to parse the alteration as protein change
MutationConsequence term = UNKNOWN;
String ref = null;
Expand Down Expand Up @@ -474,6 +534,12 @@ public static Boolean isGenomicChange(String alteration) {
return m.matches();
}

/**
 * Tells whether the whole string is an exon-level alteration, i.e. one or more
 * "Exon N[-M] Deletion|Insertion|Duplication" parts joined by "+".
 *
 * @param alteration alteration string to test; may be {@code null}
 * @return {@code true} when the entire string matches {@code EXON_ALTS_REGEX};
 *         {@code false} otherwise, including for {@code null}
 */
public static Boolean isExon(String alteration) {
    // Pattern.matcher(null) throws NullPointerException; a missing alteration is simply not an exon alteration.
    if (alteration == null) {
        return false;
    }
    return Pattern.compile(EXON_ALTS_REGEX).matcher(alteration).matches();
}

public static String removeExclusionCriteria(String proteinChange) {
Matcher exclusionMatch = getExclusionCriteriaMatcher(proteinChange);
if (exclusionMatch.matches()) {
Expand Down
Loading

0 comments on commit c40dfb1

Please sign in to comment.