diff --git a/src/main/java/org/mskcc/oncokb/curation/service/MainService.java b/src/main/java/org/mskcc/oncokb/curation/service/MainService.java index 46f4bf8d0..863b84733 100644 --- a/src/main/java/org/mskcc/oncokb/curation/service/MainService.java +++ b/src/main/java/org/mskcc/oncokb/curation/service/MainService.java @@ -149,7 +149,7 @@ public AlterationAnnotationStatus annotateAlteration(ReferenceGenome referenceGe // update associated genes Set genes = alteration.getGenes(); - if (parsedAlteration.getType().equals(STRUCTURAL_VARIANT) && !parsedAlteration.getGenes().isEmpty()) { + if (STRUCTURAL_VARIANT.equals(parsedAlteration.getType()) && !parsedAlteration.getGenes().isEmpty()) { genes = parsedAlteration.getGenes(); } Set annotatedGenes = genes @@ -230,7 +230,8 @@ public AlterationAnnotationStatus annotateAlteration(ReferenceGenome referenceGe ) { String refRe = String.valueOf(canonicalSequenceOptional.orElseThrow().getSequence().charAt(alteration.getStart() - 1)); if (!StringUtils.isEmpty(refRe)) { - if (StringUtils.isEmpty(alteration.getRefResidues())) { + // only set the reference AA when the alteration happens on one position + if (StringUtils.isEmpty(alteration.getRefResidues()) && alteration.getStart().equals(alteration.getEnd())) { alteration.setRefResidues(refRe); } else { // If The AA in alteration is differed from the canonical transcript, and it's not X, we give warning diff --git a/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java b/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java index 0165c61b2..336bb9e4f 100644 --- a/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java +++ b/src/main/java/org/mskcc/oncokb/curation/util/AlterationUtils.java @@ -1,16 +1,17 @@ package org.mskcc.oncokb.curation.util; +import static java.util.regex.Pattern.CASE_INSENSITIVE; import static org.mskcc.oncokb.curation.domain.enumeration.MutationConsequence.*; +import static org.mskcc.oncokb.curation.util.parser.ProteinChangeParser.*; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; import org.apache.commons.lang3.StringUtils; -import org.apache.commons.text.similarity.JaroWinklerSimilarity; -import org.checkerframework.checker.regex.qual.Regex; import org.mskcc.oncokb.curation.domain.*; import org.mskcc.oncokb.curation.domain.enumeration.*; +import org.mskcc.oncokb.curation.util.parser.ParsingStatus; import org.springframework.stereotype.Component; @Component @@ -90,6 +91,119 @@ private Alteration parseGenomicChange(String genomicChange) { return alt; } + private static ParsingStatus parseCategoricalAlterations(String proteinChange) { + ParsingStatus parsedAlteration = new ParsingStatus<>(); + + // truncating + if (proteinChange.toLowerCase().matches("truncating mutations?")) { + Alteration alteration = new Alteration(); + Consequence consequence = new Consequence(); + consequence.setTerm(FEATURE_TRUNCATION.name()); + alteration.setConsequence(consequence); + alteration.setAlteration(proteinChange); + alteration.setName(proteinChange); + parsedAlteration.setEntity(alteration); + parsedAlteration.setStatus(EntityStatusType.OK); + } + return parsedAlteration; + } + + private static ParsingStatus parseProteinChangeWithStatus(String proteinChange, String excludedStr) { + ParsingStatus parsedAlteration = parseProteinChangeThroughAllTypes(proteinChange, excludedStr); + + if (parsedAlteration.getEntity() != null) { + Alteration alteration = parsedAlteration.getEntity(); + alteration.setType(AlterationType.PROTEIN_CHANGE); + if (StringUtils.isEmpty(alteration.getAlteration())) { + alteration.setAlteration(alteration.getProteinChange()); + } + if (StringUtils.isEmpty(alteration.getName())) { + alteration.setName(alteration.getProteinChange()); + } + // Change the positional name + if (isPositionedAlteration(alteration)) { + if (StringUtils.isEmpty(excludedStr)) { + alteration.setName(alteration.getAlteration() + " Missense Mutations"); + } else { + alteration.setName(proteinChange + " Missense Mutations, excluding " + excludedStr); + } + } + if (alteration.getConsequence() == null) { + Consequence consequence = new Consequence(); + consequence.setTerm(MutationConsequence.UNKNOWN.name()); + alteration.setConsequence(consequence); + } + } + return parsedAlteration; + } + + private static ParsingStatus parseProteinChangeThroughAllTypes(String proteinChange, String excludedStr) { + ParsingStatus parsedAlteration = new ParsingStatus<>(); + + parsedAlteration = parseInframe(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + parsedAlteration = parseSplice(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + parsedAlteration = parseFrameshift(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + parsedAlteration = parseExtension(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + parsedAlteration = parseRange(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + parsedAlteration = parseSynonymous(proteinChange); + if (parsedAlteration.isParsed()) return parsedAlteration; + + return parseGeneral(proteinChange); + } + + public static void parseProteinChange(EntityStatus alterationEntityStatus, String proteinChange) { + if (proteinChange == null) { + proteinChange = ""; + } + + if (proteinChange.startsWith("p.")) { + proteinChange = proteinChange.substring(2); + } + + if (proteinChange.indexOf("[") != -1) { + proteinChange = proteinChange.substring(0, proteinChange.indexOf("[")); + } + + // we need to deal with the exclusion format so the protein change can properly be interpreted. + String excludedStr = ""; + Matcher exclusionMatch = getExclusionCriteriaMatcher(proteinChange); + if (exclusionMatch.matches()) { + proteinChange = exclusionMatch.group(1); + excludedStr = exclusionMatch.group(3).trim(); + } + + proteinChange = proteinChange.trim(); + + ParsingStatus parsedAlteration; + + parsedAlteration = parseProteinChangeWithStatus(proteinChange, excludedStr); + if (!parsedAlteration.isParsed()) parsedAlteration = parseCategoricalAlterations(proteinChange); + + if (!parsedAlteration.isParsed()) { + Alteration alteration = new Alteration(); + alteration.setAlteration(proteinChange); + alteration.setName(proteinChange); + Consequence consequence = new Consequence(); + consequence.setTerm(UNKNOWN.name()); + alteration.setConsequence(consequence); + parsedAlteration.setEntity(alteration); + parsedAlteration.setStatus(EntityStatusType.OK); + } + alterationEntityStatus.setEntity(parsedAlteration.getEntity()); + alterationEntityStatus.setType(parsedAlteration.getStatus()); + alterationEntityStatus.setMessage(parsedAlteration.getMessage()); + } + public EntityStatus parseAlteration(String alteration) { EntityStatus entityWithStatus = new EntityStatus<>(); String message = ""; @@ -130,267 +244,8 @@ public EntityStatus parseAlteration(String alteration) { return entityWithStatus; } - // the following is to parse the alteration as protein change - MutationConsequence term = UNKNOWN; - String ref = null; - String var = null; - Integer start = null; - Integer end = null; - - if (alteration == null) { - alteration = ""; - } - - if (alteration.startsWith("p.")) { - alteration = alteration.substring(2); - } - - if (alteration.indexOf("[") != -1) { - alteration = alteration.substring(0, alteration.indexOf("[")); - } - - String altStr = alteration; - - // we need to deal with the exclusion format so the protein change can properly be interpreted. - String excludedStr = ""; - Matcher exclusionMatch = getExclusionCriteriaMatcher(alteration); - if (exclusionMatch.matches()) { - alteration = exclusionMatch.group(1); - excludedStr = exclusionMatch.group(3).trim(); - } - - alteration = alteration.trim(); - - Pattern p = Pattern.compile("^([A-Z\\*]+)([0-9]+)([A-Z\\*\\?]*)$"); - Matcher m = p.matcher(alteration); - if (m.matches()) { - ref = m.group(1); - start = Integer.valueOf(m.group(2)); - end = start; - var = m.group(3); - - Integer refL = ref.length(); - Integer varL = var.length(); - - if (ref.equals("*")) { - term = STOP_LOST; - } else if (var.equals("*")) { - term = STOP_GAINED; - } else if (ref.equals(var)) { - term = SYNONYMOUS_VARIANT; - } else if (start == 1) { - term = START_LOST; - } else if (var.equals("?")) { - term = ANY; - } else { - end = start + refL - 1; - if (refL > 1 || varL > 1) { - // Handle in-frame insertion/deletion event. Exp: IK744K - if (refL > varL) { - term = INFRAME_DELETION; - } else if (refL < varL) { - term = INFRAME_INSERTION; - } else { - term = MISSENSE_VARIANT; - } - } else if (refL == 1 && varL == 1) { - term = MISSENSE_VARIANT; - } else { - status = EntityStatusType.WARNING; - message = "Unable to determine consequence"; - term = NA; - } - } - } else { - p = Pattern.compile("([A-Z]?)([0-9]+)(_[A-Z]?([0-9]+))?(delins|ins|del)([A-Z0-9]+)"); - m = p.matcher(alteration); - if (m.matches()) { - if (m.group(1) != null && m.group(3) == null) { - // we only want to specify reference when it's one position ins/del - ref = m.group(1); - } - start = Integer.valueOf(m.group(2)); - if (m.group(4) != null) { - end = Integer.valueOf(m.group(4)); - } else { - end = start; - } - String type = m.group(5); - if (type.equals("ins")) { - term = INFRAME_INSERTION; - } else if (type.equals("del")) { - term = INFRAME_DELETION; - } else { - Integer deletion = end - start + 1; - Integer insertion = m.group(6).length(); - - if (insertion - deletion > 0) { - term = INFRAME_INSERTION; - } else if (insertion - deletion == 0) { - term = MISSENSE_VARIANT; - } else { - term = INFRAME_DELETION; - } - } - } else { - p = Pattern.compile("([A-Z]?)([0-9]+)(_[A-Z]?([0-9]+))?(_)?splice"); - m = p.matcher(alteration); - if (m.matches()) { - if (m.group(1) != null && m.group(3) == null) { - // we only want to specify reference when it's one position splice - ref = m.group(1); - } - start = Integer.valueOf(m.group(2)); - if (m.group(4) != null) { - end = Integer.valueOf(m.group(4)); - } else { - end = start; - } - term = SPLICE_REGION_VARIANT; - } else { - p = Pattern.compile("([A-Z]?)([0-9]+)_([A-Z]?)([0-9]+)(.+)"); - m = p.matcher(alteration); - if (m.matches()) { - start = Integer.valueOf(m.group(2)); - end = Integer.valueOf(m.group(4)); - String v = m.group(5); - - HashMap termsToCheck = new HashMap<>(); - termsToCheck.put("mis", MISSENSE_VARIANT); - termsToCheck.put("ins", INFRAME_INSERTION); - termsToCheck.put("del", INFRAME_DELETION); - termsToCheck.put("fs", FEATURE_TRUNCATION); - termsToCheck.put("trunc", FEATURE_TRUNCATION); - termsToCheck.put("dup", INFRAME_INSERTION); - termsToCheck.put("mut", ANY); - - MutationConsequence consequence = termsToCheck.get(v); - if (consequence != null) { - term = consequence; - } else { - Double greatestSimilarity = -1.0; - String termWithGreatestSimilarity = ""; - JaroWinklerSimilarity jw = new JaroWinklerSimilarity(); - for (Map.Entry entry : termsToCheck.entrySet()) { - double similarity = jw.apply(v, entry.getKey()); - if (similarity > greatestSimilarity) { - greatestSimilarity = similarity; - termWithGreatestSimilarity = entry.getKey(); - } - } - status = EntityStatusType.ERROR; - message = "The alteration name is invalid, do you mean " + - m.group(1) + - m.group(2) + - "_" + - m.group(3) + - m.group(4) + - termWithGreatestSimilarity + - "?"; - } - } else { - p = Pattern.compile("([A-Z\\*])([0-9]+)[A-Z]?fs.*"); - m = p.matcher(alteration); - if (m.matches()) { - ref = m.group(1); - start = Integer.valueOf(m.group(2)); - end = start; - - term = FRAMESHIFT_VARIANT; - } else { - p = Pattern.compile("([A-Z]+)?([0-9]+)([A-Za-z]]+)"); - m = p.matcher(alteration); - if (m.matches()) { - ref = m.group(1); - start = Integer.valueOf(m.group(2)); - end = start; - String v = m.group(3); - switch (v) { - case "ins": - case "dup": - term = INFRAME_INSERTION; - break; - case "del": - term = INFRAME_DELETION; - break; - } - } else { - /** - * support extension variant (https://varnomen.hgvs.org/recommendations/protein/variant/extension/) - * the following examples are supported - * *959Qext*14 - * *110Gext*17 - * *315TextALGT* - * *327Aext*? - */ - p = Pattern.compile("(\\*)([0-9]+)[A-Z]ext([A-Z]+)?\\*([0-9]+)?(\\?)?"); - m = p.matcher(alteration); - if (m.matches()) { - ref = m.group(1); - start = Integer.valueOf(m.group(2)); - end = start; - term = STOP_LOST; - } else { - p = Pattern.compile("([A-Z\\*])?([0-9]+)="); - m = p.matcher(alteration); - if (m.matches()) { - var = ref = m.group(1); - start = Integer.valueOf(m.group(2)); - end = start; - if (ref.equals("*")) { - term = STOP_RETAINED_VARIANT; - } else { - term = SYNONYMOUS_VARIANT; - } - } else { - p = Pattern.compile("([0-9]+)"); - m = p.matcher(alteration); - if (m.matches()) { - start = Integer.valueOf(m.group(1)); - end = start; - term = UNKNOWN; - } - } - } - } - } - } - } - } - } - - // truncating - if (alteration.toLowerCase().matches("truncating mutations?")) { - term = FEATURE_TRUNCATION; - } - - Alteration alt = new Alteration(); - alt.setType(AlterationType.PROTEIN_CHANGE); - alt.setRefResidues(ref); - alt.setVariantResidues(var); - alt.setStart(start); - alt.setEnd(end); - alt.setAlteration(altStr); - alt.setProteinChange(alteration); - - Consequence consequence = new Consequence(); - consequence.setTerm(Optional.ofNullable(term).orElse(MutationConsequence.UNKNOWN).name()); - alt.setConsequence(consequence); - - // Change the positional name - if (isPositionedAlteration(alt)) { - if (StringUtils.isEmpty(excludedStr)) { - alt.setName(alt.getAlteration() + " Missense Mutations"); - } else { - alt.setName(alteration + " Missense Mutations, excluding " + excludedStr); - } - } else { - alt.setName(alteration); - } + parseProteinChange(entityWithStatus, alteration); - entityWithStatus.setEntity(alt); - entityWithStatus.setType(status); - entityWithStatus.setMessage(message); return entityWithStatus; } @@ -433,7 +288,7 @@ public static boolean isPositionedAlteration(Alteration alteration) { } private static Matcher getExclusionCriteriaMatcher(String proteinChange) { - Pattern exclusionPatter = Pattern.compile("(.*)\\{\\s*(exclude|excluding)(.*)\\}", Pattern.CASE_INSENSITIVE); + Pattern exclusionPatter = Pattern.compile("(.*)\\{\\s*(exclude|excluding)(.*)\\}", CASE_INSENSITIVE); Matcher exclusionMatch = exclusionPatter.matcher(proteinChange); return exclusionMatch; } diff --git a/src/main/java/org/mskcc/oncokb/curation/util/parser/ParsingStatus.java b/src/main/java/org/mskcc/oncokb/curation/util/parser/ParsingStatus.java new file mode 100644 index 000000000..f15e95ee2 --- /dev/null +++ b/src/main/java/org/mskcc/oncokb/curation/util/parser/ParsingStatus.java @@ -0,0 +1,38 @@ +package org.mskcc.oncokb.curation.util.parser; + +import org.mskcc.oncokb.curation.domain.enumeration.EntityStatusType; + +public class ParsingStatus { + + EntityStatusType status; + String message; + T entity; + + public Boolean isParsed() { + return this.status != null; + } + + public EntityStatusType getStatus() { + return status; + } + + public void setStatus(EntityStatusType status) { + this.status = status; + } + + public String getMessage() { + return message; + } + + public void setMessage(String message) { + this.message = message; + } + + public T getEntity() { + return entity; + } + + public void setEntity(T entity) { + this.entity = entity; + } +} diff --git a/src/main/java/org/mskcc/oncokb/curation/util/parser/ProteinChangeParser.java b/src/main/java/org/mskcc/oncokb/curation/util/parser/ProteinChangeParser.java new file mode 100644 index 000000000..99f8adb02 --- /dev/null +++ b/src/main/java/org/mskcc/oncokb/curation/util/parser/ProteinChangeParser.java @@ -0,0 +1,360 @@ +package org.mskcc.oncokb.curation.util.parser; + +import static java.util.regex.Pattern.CASE_INSENSITIVE; +import static org.mskcc.oncokb.curation.domain.enumeration.MutationConsequence.*; +import static org.mskcc.oncokb.curation.domain.enumeration.MutationConsequence.INFRAME_DELETION; + +import java.util.HashMap; +import java.util.Map; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import org.apache.commons.lang3.StringUtils; +import org.apache.commons.text.similarity.JaroWinklerSimilarity; +import org.mskcc.oncokb.curation.domain.Alteration; +import org.mskcc.oncokb.curation.domain.Consequence; +import org.mskcc.oncokb.curation.domain.enumeration.EntityStatusType; +import org.mskcc.oncokb.curation.domain.enumeration.MutationConsequence; + +public class ProteinChangeParser { + + /** + * support extension variant (https://varnomen.hgvs.org/recommendations/protein/variant/extension/) + * the following examples are supported + * *959Qext*14 + * *110Gext*17 + * *315TextALGT* + * *327Aext*? + */ + public static ParsingStatus parseExtension(String proteinChange) { + ParsingStatus parsingStatus = new ParsingStatus<>(); + Alteration alteration = new Alteration(); + + Pattern p = Pattern.compile("M?1ext(-[0-9]+)?", Pattern.CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + if (m.matches()) { + alteration.setStart(1); + alteration.setEnd(1); + Consequence consequence = new Consequence(); + consequence.setTerm(INFRAME_INSERTION.name()); + alteration.setConsequence(consequence); + parsingStatus.setStatus(EntityStatusType.OK); + } else { + p = Pattern.compile("(\\*)?([0-9]+)([A-Z])?ext([A-Z]+)?\\*(([0-9]+)?(\\?)?)", CASE_INSENSITIVE); + m = p.matcher(proteinChange); + if (m.matches()) { + String revisedProteinChange = ""; + + alteration.setRefResidues(Optional.ofNullable(m.group(1)).orElse("*")); + alteration.setStart(Integer.valueOf(m.group(2))); + alteration.setEnd(alteration.getStart()); + String var = Optional.ofNullable(m.group(3)).orElse("").toUpperCase(); + revisedProteinChange = alteration.getRefResidues() + alteration.getStart() + var + "ext"; + if (m.group(4) != null) { + revisedProteinChange += m.group(4).toUpperCase(); + } + revisedProteinChange += "*"; + if (m.group(5) != null) { + revisedProteinChange += m.group(5); + } + Consequence consequence = new Consequence(); + consequence.setTerm(STOP_LOST.name()); + alteration.setConsequence(consequence); + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setStatus(EntityStatusType.OK); + } + } + parsingStatus.setEntity(alteration); + return parsingStatus; + } + + public static ParsingStatus parseGeneral(String proteinChange) { + Pattern p = Pattern.compile("^([A-Z\\*]+)?([0-9]+)([A-Z\\*\\?]*)$", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + Alteration alteration = new Alteration(); + String revisedProteinChange = ""; + MutationConsequence term = null; + + String ref = Optional.ofNullable(m.group(1)).orElse("").toUpperCase(); + String var = m.group(3).toUpperCase(); + alteration.setRefResidues(ref); + alteration.setVariantResidues(var); + Integer start = Integer.valueOf(m.group(2)); + alteration.setStart(start); + Integer end = start; + revisedProteinChange = ref + start + var; + + Integer refL = ref.length(); + Integer varL = var.length(); + + if (ref.equals("*")) { + term = STOP_LOST; + } else if (var.equals("*")) { + term = STOP_GAINED; + } else if (ref.equals(var)) { + term = SYNONYMOUS_VARIANT; + } else if (start == 1) { + term = START_LOST; + } else if (var.equals("?")) { + term = ANY; + } else { + end = start + refL - 1; + if (refL > 1 || varL > 1) { + // Handle in-frame insertion/deletion event. Exp: IK744K + if (refL > varL) { + term = INFRAME_DELETION; + } else if (refL < varL) { + term = INFRAME_INSERTION; + } else { + term = MISSENSE_VARIANT; + } + } else if (refL == 1 && varL == 1) { + term = MISSENSE_VARIANT; + } else { + parsingStatus.setStatus(EntityStatusType.WARNING); + parsingStatus.setMessage("Unable to determine consequence"); + term = NA; + } + } + alteration.setEnd(end); + if (term != null) { + Consequence consequence = new Consequence(); + consequence.setTerm(term.name()); + alteration.setConsequence(consequence); + } + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } + return parsingStatus; + } + + public static ParsingStatus parseInframe(String proteinChange) { + Pattern p = Pattern.compile("([A-Z]?)([0-9]+)(_[A-Z]?([0-9]+))?(delins|ins|del|dup)(.*)?", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + Alteration alteration = new Alteration(); + String revisedProteinChange = ""; + MutationConsequence term = UNKNOWN; + if (m.group(1) != null && m.group(3) == null) { + // we only want to specify reference when it's one position ins/del + alteration.setRefResidues(m.group(1).toUpperCase()); + revisedProteinChange += alteration.getRefResidues(); + } + alteration.setStart(Integer.valueOf(m.group(2))); + revisedProteinChange += alteration.getStart(); + if (m.group(3) != null) { + revisedProteinChange += m.group(3).toUpperCase(); + } + alteration.setEnd(m.group(4) != null ? Integer.valueOf(m.group(4)) : alteration.getStart()); + String type = m.group(5); + String var = Optional.ofNullable(m.group(6)).orElse("").toUpperCase(); + revisedProteinChange += type + var; + if (StringUtils.isNotEmpty(var) && !var.matches("[A-Z]+")) { + var = ""; + } + if (type.equals("ins")) { + if (StringUtils.isNotEmpty(var)) { + term = INFRAME_INSERTION; + } + } else if (type.equals("dup")) { + term = INFRAME_INSERTION; + } else if (type.equals("del")) { + term = INFRAME_DELETION; + } else if (StringUtils.isNotEmpty(var)) { + Integer deletion = alteration.getEnd() - alteration.getStart() + 1; + Integer insertion = m.group(6).length(); + + if (insertion - deletion > 0) { + term = INFRAME_INSERTION; + } else if (insertion - deletion == 0) { + term = MISSENSE_VARIANT; + } else { + term = INFRAME_DELETION; + } + } + + if (term != null) { + Consequence consequence = new Consequence(); + consequence.setTerm(term.name()); + alteration.setConsequence(consequence); + } + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } + return parsingStatus; + } + + public static ParsingStatus parseFrameshift(String proteinChange) { + Pattern p = Pattern.compile("([A-Z])?([0-9]+)([A-Z])?(_[A-Z]?([0-9]+)[A-Z]?)?fs(.*)", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + Alteration alteration = new Alteration(); + String ref = Optional.ofNullable(m.group(1)).orElse("").toUpperCase(); + alteration.setStart(Integer.valueOf(m.group(2))); + if (m.group(5) != null) { + alteration.setEnd(Integer.valueOf(m.group(5))); + } else { + alteration.setRefResidues(ref); + alteration.setEnd(alteration.getStart()); + } + + String revisedProteinChange = ref + alteration.getStart(); + if (m.group(3) != null) { + revisedProteinChange += m.group(3).toUpperCase(); + } + if (m.group(4) != null) { + revisedProteinChange += m.group(4).toUpperCase(); + } + revisedProteinChange += "fs"; + if (m.group(6) != null) { + revisedProteinChange += m.group(6); + } + + Consequence consequence = new Consequence(); + consequence.setTerm(FRAMESHIFT_VARIANT.name()); + alteration.setConsequence(consequence); + + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } + return parsingStatus; + } + + public static ParsingStatus parseRange(String proteinChange) { + Pattern p = Pattern.compile("([A-Z]?)([0-9]+)_([A-Z]?)([0-9]+)(.+)", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + Alteration alteration = new Alteration(); + String revisedProteinChange = ""; + + alteration.setStart(Integer.valueOf(m.group(2))); + alteration.setEnd(Integer.valueOf(m.group(4))); + String variant = m.group(5); + + HashMap termsToCheck = new HashMap<>(); + termsToCheck.put("mis", MISSENSE_VARIANT); + termsToCheck.put("ins", INFRAME_INSERTION); + termsToCheck.put("del", INFRAME_DELETION); + termsToCheck.put("fs", FEATURE_TRUNCATION); + termsToCheck.put("trunc", FEATURE_TRUNCATION); + termsToCheck.put("dup", INFRAME_INSERTION); + termsToCheck.put("mut", ANY); + + MutationConsequence mutationConsequence = termsToCheck.get(variant); + if (mutationConsequence != null) { + revisedProteinChange += m.group(1).toUpperCase(); + revisedProteinChange += m.group(2) + "_"; + if (m.group(3) != null) { + revisedProteinChange += m.group(3).toUpperCase(); + } + revisedProteinChange += m.group(4) + variant; + + if (mutationConsequence != null) { + Consequence consequence = new Consequence(); + consequence.setTerm(mutationConsequence.name()); + alteration.setConsequence(consequence); + } + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } else { + Double greatestSimilarity = -1.0; + String termWithGreatestSimilarity = ""; + JaroWinklerSimilarity jw = new JaroWinklerSimilarity(); + for (Map.Entry entry : termsToCheck.entrySet()) { + double similarity = jw.apply(variant, entry.getKey()); + if (similarity > greatestSimilarity) { + greatestSimilarity = similarity; + termWithGreatestSimilarity = entry.getKey(); + } + } + parsingStatus.setStatus(EntityStatusType.ERROR); + parsingStatus.setMessage( + "The alteration name is invalid, do you mean " + + m.group(1) + + m.group(2) + + "_" + + m.group(3) + + m.group(4) + + termWithGreatestSimilarity + + "?" + ); + } + } + return parsingStatus; + } + + public static ParsingStatus parseSplice(String proteinChange) { + Pattern p = Pattern.compile("([A-Z]?)([0-9]+)(_[A-Z]?([0-9]+))?(_)?splice", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + String revisedProteinChange = ""; + Alteration alteration = new Alteration(); + if (m.group(1) != null && m.group(3) == null) { + // we only want to specify reference when it's one position splice + String var = m.group(1).toUpperCase(); + alteration.setRefResidues("X".equals(var) ? "" : var); + revisedProteinChange += alteration.getRefResidues().toUpperCase(); + } + alteration.setStart(Integer.valueOf(m.group(2))); + revisedProteinChange += alteration.getStart(); + if (m.group(3) != null) { + revisedProteinChange += m.group(3).toUpperCase() + "splice"; + } else { + revisedProteinChange += "_splice"; + } + alteration.setEnd(m.group(4) != null ? Integer.valueOf(m.group(4)) : alteration.getStart()); + + Consequence consequence = new Consequence(); + consequence.setTerm(SPLICE_REGION_VARIANT.name()); + alteration.setConsequence(consequence); + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } + return parsingStatus; + } + + public static ParsingStatus parseSynonymous(String proteinChange) { + Pattern p = Pattern.compile("([A-Z\\*])?([0-9]+)=", CASE_INSENSITIVE); + Matcher m = p.matcher(proteinChange); + ParsingStatus parsingStatus = new ParsingStatus<>(); + if (m.matches()) { + String revisedProteinChange = ""; + Alteration alteration = new Alteration(); + MutationConsequence term; + + if (m.group(1) != null) { + alteration.setRefResidues(m.group(1).toUpperCase()); + alteration.setVariantResidues(alteration.getRefResidues()); + revisedProteinChange += alteration.getRefResidues(); + } + alteration.setStart(Integer.valueOf(m.group(2))); + alteration.setEnd(alteration.getStart()); + revisedProteinChange += alteration.getStart() + "="; + if ("*".equals(alteration.getRefResidues())) { + term = STOP_RETAINED_VARIANT; + } else { + term = SYNONYMOUS_VARIANT; + } + + Consequence consequence = new Consequence(); + consequence.setTerm(term.name()); + alteration.setConsequence(consequence); + alteration.setProteinChange(StringUtils.isEmpty(revisedProteinChange) ? proteinChange : revisedProteinChange); + parsingStatus.setEntity(alteration); + parsingStatus.setStatus(EntityStatusType.OK); + } + return parsingStatus; + } +} diff --git a/src/test/java/org/mskcc/oncokb/curation/TestHelper.java b/src/test/java/org/mskcc/oncokb/curation/TestHelper.java new file mode 100644 index 000000000..5204c0f45 --- /dev/null +++ b/src/test/java/org/mskcc/oncokb/curation/TestHelper.java @@ -0,0 +1,20 @@ +package org.mskcc.oncokb.curation; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; + +public class TestHelper { + + public static BufferedReader getTestFileBufferedReader(String filePath) throws FileNotFoundException { + if (filePath == null) { + System.out.println("Please specify the testing file path"); + return null; + } + + File file = new File(filePath); + FileReader reader = new FileReader(file); + return new BufferedReader(reader); + } +} diff --git a/src/test/java/org/mskcc/oncokb/curation/test/AlterationUtilsTest.java b/src/test/java/org/mskcc/oncokb/curation/test/AlterationUtilsTest.java new file mode 100644 index 000000000..9a1e4819d --- /dev/null +++ b/src/test/java/org/mskcc/oncokb/curation/test/AlterationUtilsTest.java @@ -0,0 +1,25 @@ +package org.mskcc.oncokb.curation.test; + +import static org.junit.Assert.assertEquals; +import static org.mskcc.oncokb.curation.util.AlterationUtils.parseProteinChange; + +import org.junit.jupiter.api.Test; +import org.mskcc.oncokb.curation.domain.Alteration; +import org.mskcc.oncokb.curation.domain.AlterationAnnotationStatus; +import org.mskcc.oncokb.curation.domain.EntityStatus; + +public class AlterationUtilsTest { + + @Test + public void testRevisedProteinChangeInParseProteinChange() { + EntityStatus status = new AlterationAnnotationStatus(); + parseProteinChange(status, "v600e"); + assertEquals("V600E", status.getEntity().getProteinChange()); + + parseProteinChange(status, "*757kext*"); + assertEquals("*757Kext*", status.getEntity().getProteinChange()); + + parseProteinChange(status, "T599delinsip"); + assertEquals("T599delinsIP", status.getEntity().getProteinChange()); + } +} diff --git a/src/test/java/org/mskcc/oncokb/curation/test/ParseProteinChangeParameterizedTest.java b/src/test/java/org/mskcc/oncokb/curation/test/ParseProteinChangeParameterizedTest.java new file mode 100644 index 000000000..9be71f5ee --- /dev/null +++ b/src/test/java/org/mskcc/oncokb/curation/test/ParseProteinChangeParameterizedTest.java @@ -0,0 +1,144 @@ +package org.mskcc.oncokb.curation.test; + +import static org.junit.Assert.assertEquals; +import static org.mskcc.oncokb.curation.TestHelper.getTestFileBufferedReader; +import static org.mskcc.oncokb.curation.util.AlterationUtils.parseProteinChange; + +import java.io.BufferedReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import org.apache.commons.lang3.StringUtils; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.mskcc.oncokb.curation.domain.Alteration; +import org.mskcc.oncokb.curation.domain.AlterationAnnotationStatus; +import org.mskcc.oncokb.curation.domain.EntityStatus; + +@RunWith(Parameterized.class) +public class ParseProteinChangeParameterizedTest { + + private static String TEST_FILE_PATH = "src/test/resources/data/test_parse_protein_change.tsv"; + + private String proteinChange; + private String expectedConsequence; + private String expectedRefAllele; + private String expectedVarAllele; + private String expectedProteinStart; + private String expectedProteinEnd; + + public ParseProteinChangeParameterizedTest( + String proteinChange, + String expectedConsequence, + String expectedRefAllele, + String expectedVarAllele, + String expectedProteinStart, + String expectedProteinEnd + ) { + this.proteinChange = proteinChange; + this.expectedConsequence = expectedConsequence; + this.expectedRefAllele = expectedRefAllele; + this.expectedVarAllele = expectedVarAllele; + this.expectedProteinStart = expectedProteinStart; + this.expectedProteinEnd = expectedProteinEnd; + } + + @Parameterized.Parameters + public static Collection getParameters() throws IOException { + return importer(); + } + + private static List importer() throws IOException { + BufferedReader buf = getTestFileBufferedReader(TEST_FILE_PATH); + String line = buf.readLine(); + + List queries = new ArrayList<>(); + int count = 0; + while (line != null) { + if (!line.startsWith("#") && line.trim().length() > 0) { + try { + String parts[] = line.split("\t"); + if (parts.length < 1) { + throw new IllegalArgumentException("Test case should have at least protein change. Current case: " + line); + } + String proteinChange = parts[0]; + String expectedConsequence = parts.length > 1 ? parts[1].toUpperCase() : ""; + String expectedRefAllele = parts.length > 2 ? parts[2] : ""; + String expectedVarAllele = parts.length > 3 ? parts[3] : ""; + String expectedProteinStart = parts.length > 4 ? parts[4] : ""; + String expectedProteinEnd = parts.length > 5 ? parts[5] : ""; + String[] query = { + proteinChange, + expectedConsequence, + expectedRefAllele, + expectedVarAllele, + expectedProteinStart, + expectedProteinEnd, + }; + queries.add(query); + count++; + } catch (Exception e) { + System.err.println("Could not add line '" + line + "'. " + e); + } + } + line = buf.readLine(); + } + System.err.println("Contains " + count + " queries."); + System.err.println("Done."); + + return queries; + } + + private void testSuite( + Alteration annotatedAlteration, + String proteinChange, + String expectedConsequence, + String expectedRefAllele, + String expectedVarAllele, + String expectedProteinStart, + String expectedProteinEnd + ) { + assertEquals( + "Not expected consequence. Query: " + proteinChange, + expectedConsequence, + annotatedAlteration.getConsequence() == null ? "" : annotatedAlteration.getConsequence().getTerm() + ); + assertEquals( + "Not expected ref allele. Query: " + proteinChange, + expectedRefAllele, + StringUtils.isEmpty(annotatedAlteration.getRefResidues()) ? "" : annotatedAlteration.getRefResidues() + ); + assertEquals( + "Not expected var allele. Query: " + proteinChange, + expectedVarAllele, + StringUtils.isEmpty(annotatedAlteration.getVariantResidues()) ? "" : annotatedAlteration.getVariantResidues() + ); + assertEquals( + "Not expected protein start. Query: " + proteinChange, + expectedProteinStart, + annotatedAlteration.getStart() == null ? "" : Integer.toString(annotatedAlteration.getStart()) + ); + assertEquals( + "Not expected protein end. Query: " + proteinChange, + expectedProteinEnd, + annotatedAlteration.getEnd() == null ? "" : Integer.toString(annotatedAlteration.getEnd()) + ); + } + + @Test + public void testSummary() { + EntityStatus alterationEntityStatus = new AlterationAnnotationStatus(); + parseProteinChange(alterationEntityStatus, proteinChange); + testSuite( + alterationEntityStatus.getEntity(), + proteinChange, + expectedConsequence, + expectedRefAllele, + expectedVarAllele, + expectedProteinStart, + expectedProteinEnd + ); + } +} diff --git a/src/test/resources/data/test_parse_protein_change.tsv b/src/test/resources/data/test_parse_protein_change.tsv new file mode 100644 index 000000000..7bd536a77 --- /dev/null +++ b/src/test/resources/data/test_parse_protein_change.tsv @@ -0,0 +1,92 @@ +# +# List of alterations for testing. Each line is the following format: +# +# Protein Change\tExpected consequence\tExpected Reference Allele\tExpected Variant Allele\tExpected Protein Start\tExpected Protein End +# + +# missense variants +N505I missense_variant N I 505 505 +E323_D324delinsKN missense_variant 323 324 +814_852mis missense_variant 814 852 +IK744KI missense_variant IK KI 744 745 + +# inframes +Q58_Q59insL inframe_insertion 58 59 +58_Q59insL inframe_insertion 58 59 +Q58_59insL inframe_insertion 58 59 +58_59insL inframe_insertion 58 59 +P68_C77dup inframe_insertion 68 77 +P68_77dup inframe_insertion 68 77 +68_C77dup inframe_insertion 68 77 +68_77dup inframe_insertion 68 77 +T599delinsIP inframe_insertion T 599 599 +599delinsIP inframe_insertion 599 599 +599delins UNKNOWN 599 599 +599delins2 UNKNOWN 599 599 +I744KI inframe_insertion I KI 744 744 +M1ext-1 inframe_insertion 1 1 +1ext-1 inframe_insertion 1 1 +1ext inframe_insertion 1 1 +C359del inframe_deletion C 359 359 +359del inframe_deletion 359 359 +IK744A inframe_deletion IK A 744 745 +1459_1468del inframe_deletion 1459 1468 +A1459_D1468del inframe_deletion 1459 1468 +1459_D1468del inframe_deletion 1459 1468 +A1459_1468del inframe_deletion 1459 1468 +D1161_S1172delinsE inframe_deletion 1161 1172 +D1161_1172delinsE inframe_deletion 1161 1172 +1161_S1172delinsE inframe_deletion 1161 1172 +1161_1172delinsE inframe_deletion 1161 1172 + +# frame shifts +S859Afs*12 frameshift_variant S 859 859 +859Afs*12 frameshift_variant 859 859 +S859fs*12 frameshift_variant S 859 859 +G314fs frameshift_variant G 314 314 +314fs frameshift_variant 314 314 +S330_S352fs frameshift_variant 330 352 +330_S352fs frameshift_variant 330 352 +S330_352fs frameshift_variant 330 352 +330_352fs frameshift_variant 330 352 + +# truncating mutations +W143_A314trunc feature_truncation 143 314 +W143_314trunc feature_truncation 143 314 +143_A314trunc feature_truncation 143 314 +422_605trunc feature_truncation 422 605 +Truncating Mutations feature_truncation + +# splice variants +596_619splice splice_region_variant 596 619 +X1429_splice splice_region_variant 1429 1429 +1429_splice splice_region_variant 1429 1429 +M1I start_lost M I 1 1 +1I start_lost I 1 1 +M1? start_lost M ? 1 1 +A149* stop_gained A * 149 149 +149* stop_gained * 149 149 +*149* stop_lost * * 149 149 + +# extension +*757Kext*36 stop_lost * 757 757 +757Kext*36 stop_lost * 757 757 +*757ext*36 stop_lost * 757 757 +*757Kext* stop_lost * 757 757 +*757Kext*? stop_lost * 757 757 + +# synonymous variants +K24K synonymous_variant K K 24 24 + +# any +449_514mut any 449 514 + +# Unknown variants +D399 NA D 399 399 +MCUR1-AKT1 fusion UNKNOWN +Fusions UNKNOWN +Deletion UNKNOWN +Amplification UNKNOWN +Oncogenic Mutations UNKNOWN +Oncogenic Mutations {excluding V600} UNKNOWN +