diff --git a/.github/workflows/pull-request-approved.yml b/.github/workflows/pull-request-approved.yml index d339f65b..ba378d3a 100644 --- a/.github/workflows/pull-request-approved.yml +++ b/.github/workflows/pull-request-approved.yml @@ -24,7 +24,7 @@ jobs: chmod +x ./.github/workflows/scripts/get-xetabase-branch.sh echo "github.event.pull_request.base.ref: ${{ github.event.pull_request.base.ref }}" echo "github.event.pull_request.head.ref: ${{ github.event.pull_request.head.ref }}" - xetabase_branch=$(./.github/workflows/scripts/get-xetabase-branch.sh ${{ github.event.pull_request.head.ref }}) + xetabase_branch=$(./.github/workflows/scripts/get-xetabase-branch.sh ${{ github.event.pull_request.base.ref }}) echo "__Xetabase ref:__ \"${xetabase_branch}\"" | tee -a ${GITHUB_STEP_SUMMARY} echo "xetabase_branch=${xetabase_branch}" >> $GITHUB_OUTPUT env: diff --git a/.github/workflows/scripts/get-xetabase-branch.sh b/.github/workflows/scripts/get-xetabase-branch.sh index a1eb7e52..781e29a3 100644 --- a/.github/workflows/scripts/get-xetabase-branch.sh +++ b/.github/workflows/scripts/get-xetabase-branch.sh @@ -19,11 +19,11 @@ get_xetabase_branch() { return 0 fi - # Check if the branch name starts with "release-" and follows the patterns "release-a.b.x" or "release-a.b.c.x" - if [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.x$ ]] || [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.([0-9]+)\.x$ ]]; then + # Check if the branch name starts with "release-" and follows the patterns "release-a.x.x" or "release-a.b.x" + if [[ "$input_branch" =~ ^release-([0-9]+)\.x\.x$ ]] || [[ "$input_branch" =~ ^release-([0-9]+)\.([0-9]+)\.x$ ]]; then # Extract the MAJOR part of the branch name MAJOR=${BASH_REMATCH[1]} - # Calculate the XETABASE_MAJOR by subtracting 3 from MAJOR + # Calculate the XETABASE_MAJOR by subtracting 1 from MAJOR XETABASE_MAJOR=$((MAJOR - 1)) # Check if the XETABASE_MAJOR is negative if (( XETABASE_MAJOR < 0 )); then diff --git 
a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java new file mode 100644 index 00000000..157fa507 --- /dev/null +++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/VariantAnnotationUtils.java @@ -0,0 +1,82 @@ +/* + * Copyright 2015-2020 OpenCB + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.opencb.biodata.formats.variant; + +import org.opencb.biodata.models.variant.avro.AlleleOrigin; + +import java.util.HashMap; +import java.util.Map; + +/** + * Created by fjlopez on 22/06/15. 
+ */
+public class VariantAnnotationUtils {
+
+    /** ClinVar/COSMIC allele-origin label mapped to the matching {@link AlleleOrigin} constant. */
+    private static final Map<String, AlleleOrigin> ORIGIN_STRING_TO_ALLELE_ORIGIN = new HashMap<>();
+    /** Nucleotide mapped to its complement; covers A, C, G, T and N in upper and lower case. */
+    private static final Map<Character, Character> COMPLEMENTARY_NT = new HashMap<>();
+
+    static {
+
+        ///////////////////////////////////////////////////////////////////////
+        ///// ClinVar and Cosmic allele origins to SO terms ///////////////
+        ///////////////////////////////////////////////////////////////////////
+        ORIGIN_STRING_TO_ALLELE_ORIGIN.put("germline", AlleleOrigin.germline_variant);
+        ORIGIN_STRING_TO_ALLELE_ORIGIN.put("maternal", AlleleOrigin.maternal_variant);
+        ORIGIN_STRING_TO_ALLELE_ORIGIN.put("de novo", AlleleOrigin.de_novo_variant);
+        ORIGIN_STRING_TO_ALLELE_ORIGIN.put("paternal", AlleleOrigin.paternal_variant);
+        ORIGIN_STRING_TO_ALLELE_ORIGIN.put("somatic", AlleleOrigin.somatic_variant);
+
+        COMPLEMENTARY_NT.put('A', 'T');
+        COMPLEMENTARY_NT.put('a', 't');
+        COMPLEMENTARY_NT.put('C', 'G');
+        COMPLEMENTARY_NT.put('c', 'g');
+        COMPLEMENTARY_NT.put('G', 'C');
+        COMPLEMENTARY_NT.put('g', 'c');
+        COMPLEMENTARY_NT.put('T', 'A');
+        COMPLEMENTARY_NT.put('t', 'a');
+        COMPLEMENTARY_NT.put('N', 'N');
+        COMPLEMENTARY_NT.put('n', 'n');
+    }
+
+    /**
+     * Reverse-complements a nucleotide sequence without failing on unknown characters.
+     *
+     * @param string Sequence to reverse-complement; must not be null
+     * @return The reverse complement, or {@code null} if the sequence contains a character with no known complement
+     */
+    public static String reverseComplement(String string) {
+        return reverseComplement(string, false);
+    }
+
+    /**
+     * Reverse-complements a nucleotide sequence.
+     *
+     * @param string          Sequence to reverse-complement; must not be null
+     * @param failOnUnknownNt Whether to throw instead of returning {@code null} on a character with no known complement
+     * @return The reverse complement, or {@code null} if an unknown character is found and failOnUnknownNt is false
+     * @throws IllegalArgumentException if an unknown character is found and failOnUnknownNt is true
+     */
+    public static String reverseComplement(String string, boolean failOnUnknownNt) {
+        StringBuilder stringBuilder = new StringBuilder(string).reverse();
+        for (int i = 0; i < stringBuilder.length(); i++) {
+            char nextNt = stringBuilder.charAt(i);
+            // Protection against weird characters, e.g. alternate:"TBS" found in ClinVar
+            Character complement = COMPLEMENTARY_NT.get(nextNt);
+            if (complement != null) {
+                stringBuilder.setCharAt(i, complement);
+            } else if (failOnUnknownNt) {
+                throw new IllegalArgumentException("Unknown nucleotide: '" + nextNt + "'. "
+                        + "Unable to reverse-complement sequence '" + string + "'.");
+            } else {
+                return null;
+            }
+        }
+        return stringBuilder.toString();
+    }
+
+    /**
+     * Looks up the allele origin for a ClinVar/COSMIC origin label.
+     *
+     * @param alleleOrigin Origin label, e.g. "germline" or "de novo"; the lookup is an exact, case-sensitive match
+     * @return The mapped {@link AlleleOrigin}, or {@code null} if the label is not registered
+     */
+    public static AlleleOrigin parseAlleleOrigin(String alleleOrigin) {
+        return ORIGIN_STRING_TO_ALLELE_ORIGIN.get(alleleOrigin);
+    }
+
+}
diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java
new file mode 100755
index 00000000..85ff905f
--- /dev/null
+++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParser.java
@@ -0,0 +1,514 @@
+/*
+ *
+ *
+ */
+
+package org.opencb.biodata.formats.variant.cosmic;
+
+import org.apache.commons.lang3.StringUtils;
+import org.opencb.biodata.formats.io.FileFormatException;
+import org.opencb.biodata.formats.variant.VariantAnnotationUtils;
+import org.opencb.biodata.models.sequence.SequenceLocation;
+import org.opencb.biodata.models.variant.avro.*;
+import org.opencb.commons.utils.FileUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.file.Path;
+import java.text.NumberFormat;
+import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class CosmicParser {
+
+    private static final int GENE_NAMES_COLUMN = 0;
+    private static final int HGNC_COLUMN = 3;
+    private static final int PRIMARY_SITE_COLUMN = 7;
+    private static final int SITE_SUBTYPE_COLUMN = 8;
+    private static final int PRIMARY_HISTOLOGY_COLUMN = 11;
+    private static final int HISTOLOGY_SUBTYPE_COLUMN = 12;
+    private static final int ID_COLUMN = 16;
+    private static final int COSM_ID_COLUMN = 17;
+    private static final int HGVS_COLUMN = 19;
+    private static final int MUTATION_DESCRIPTION_COLUMN = 21;
+    private static final int MUTATION_ZYGOSITY_COLUMN = 22;
+    private static final int
FATHMM_PREDICTION_COLUMN = 29;
+    private static final int FATHMM_SCORE_COLUMN = 30;
+    private static final int MUTATION_SOMATIC_STATUS_COLUMN = 31;
+    private static final int PUBMED_PMID_COLUMN = 32;
+    private static final int SAMPLE_SOURCE_COLUMN = 34;
+    private static final int TUMOUR_ORIGIN_COLUMN = 35;
+
+    private static final String SYMBOL = "symbol";
+
+    private static final String HGVS_INSERTION_TAG = "ins";
+    private static final String HGVS_SNV_CHANGE_SYMBOL = ">";
+    private static final String HGVS_DELETION_TAG = "del";
+    private static final String HGVS_DUPLICATION_TAG = "dup";
+    private static final String CHROMOSOME = "CHR";
+    private static final String START = "START";
+    private static final String END = "END";
+    private static final String REF = "REF";
+    private static final String ALT = "ALT";
+
+    private static final String VARIANT_STRING_PATTERN = "[ACGT]*";
+
+    private static final Pattern mutationGRCh37GenomePositionPattern = Pattern.compile("(?<" + CHROMOSOME + ">\\S+):(?<" + START + ">\\d+)-(?<" + END + ">\\d+)");
+    private static final Pattern snvPattern = Pattern.compile("c\\.\\d+((\\+|\\-|_)\\d+)?(?<" + REF + ">([ACTG])+)>(?<" + ALT + ">([ACTG])+)");
+
+    private static final Logger logger = LoggerFactory.getLogger(CosmicParser.class);
+
+    private CosmicParser() {
+        throw new IllegalStateException("Utility class");
+    }
+
+    /**
+     * Parses a COSMIC file line by line, builds one evidence entry per record and hands the entries accumulated for a
+     * location to the callback whenever a record with a different start position or alternate allele is reached.
+     *
+     * @param cosmicFile Cosmic file to parse
+     * @param version    Cosmic version, e.g: v95
+     * @param name       Evidence source name, e.g.: cosmic
+     * @param assembly   Assembly, e.g.: GRCh38
+     * @param callback   Callback function to process the evidence entries for that location
+     * @throws IOException              if the file cannot be read
+     * @throws FileFormatException      if the file is empty, or the header or any record does not have the expected
+     *                                  number of tab-separated fields
+     * @throws IllegalArgumentException if the assembly found in a record does not match the given assembly
+     */
+    public static void parse(Path cosmicFile, String version, String name, String assembly, CosmicParserCallback callback)
+            throws IOException, FileFormatException {
+
+        int numCosmicFields = 39;
+        int assemblyFieldIndex = 24;
+
+        int totalNumberRecords = 0;
+        int ignoredCosmicLines = 0;
+        int numberProcessedRecords = 0;
+        int invalidPositionLines = 0;
+        int invalidSubstitutionLines = 0;
+        int invalidDeletionLines = 0;
+        int invalidInsertionLines = 0;
+        int invalidDuplicationLines = 0;
+        int invalidMutationCDSOtherReason = 0;
+
+        try (BufferedReader cosmicReader = new BufferedReader(new InputStreamReader(FileUtils.newInputStream(cosmicFile)))) {
+            long t0;
+            long t1 = 0;
+            long t2 = 0;
+            List<EvidenceEntry> evidenceEntries = new ArrayList<>();
+            SequenceLocation old = null;
+
+            String headerLine = cosmicReader.readLine(); // First line is the header -> ignore it
+            if (headerLine == null) {
+                // An empty file has no header; fail with a format error instead of a NullPointerException below
+                throw new FileFormatException("Invalid COSMIC format file. Expected a header line with " + numCosmicFields
+                        + " fields, got an empty file: " + cosmicFile);
+            }
+            logger.info("Skipping header line: {}", headerLine);
+            String[] headerFields = headerLine.split("\t", -1);
+            if (headerFields.length != numCosmicFields) {
+                throw new FileFormatException("Invalid COSMIC format file. Expected " + numCosmicFields + " fields, got "
+                        + headerFields.length + " at " + headerLine);
+            }
+
+            String line;
+            while ((line = cosmicReader.readLine()) != null) {
+                String[] fields = line.split("\t", -1);
+
+                // Check fields number of the current record (not of the header, which was validated above)
+                if (fields.length != numCosmicFields) {
+                    throw new FileFormatException("Invalid COSMIC format file. Expected " + numCosmicFields + " fields, got "
+                            + fields.length + " at " + line);
+                }
+                // Check assembly: the header cell is concatenated with the record value
+                // NOTE(review): presumably header "GRCh" + value "38" gives "GRCh38" -- confirm against the COSMIC export
+                String cosmicAssembly = headerFields[assemblyFieldIndex] + fields[assemblyFieldIndex];
+                if (!cosmicAssembly.equalsIgnoreCase(assembly)) {
+                    throw new IllegalArgumentException("Mismatch assembly: COSMIC file assembly is " + cosmicAssembly + " but input"
+                            + " assembly is " + assembly);
+                }
+
+                t0 = System.currentTimeMillis();
+                EvidenceEntry evidenceEntry = buildCosmic(name, version, assembly, fields);
+                t1 += System.currentTimeMillis() - t0;
+
+                SequenceLocation sequenceLocation = parseLocation(fields);
+                if (sequenceLocation == null) {
+                    invalidPositionLines++;
+                }
+                if (old == null) {
+                    old = sequenceLocation;
+                }
+
+                if (sequenceLocation != null) {
+                    // Parse variant
+                    boolean validVariant = false;
+                    String mutationCds = fields[HGVS_COLUMN];
+                    VariantType variantType = getVariantType(mutationCds);
+                    if (variantType != null) {
+                        switch (variantType) {
+                            case SNV:
+                                validVariant = parseSnv(mutationCds, sequenceLocation);
+                                if (!validVariant) {
+                                    invalidSubstitutionLines++;
+                                }
+                                break;
+                            case DELETION:
+                                validVariant = parseDeletion(mutationCds, sequenceLocation);
+                                if (!validVariant) {
+                                    invalidDeletionLines++;
+                                }
+                                break;
+                            case INSERTION:
+                                validVariant = parseInsertion(mutationCds, sequenceLocation);
+                                if (!validVariant) {
+                                    invalidInsertionLines++;
+                                }
+                                break;
+                            case DUPLICATION:
+                                validVariant = parseDuplication(mutationCds);
+                                if (!validVariant) {
+                                    invalidDuplicationLines++;
+                                }
+                                break;
+                            default:
+                                logger.warn("Skipping unkonwn variant type = {}", variantType);
+                                validVariant = false;
+                                invalidMutationCDSOtherReason++;
+                        }
+                    } else {
+                        // getVariantType() returns null for any CDS it does not recognise; count it in the summary
+                        invalidMutationCDSOtherReason++;
+                    }
+
+                    if (validVariant) {
+                        if (sequenceLocation.getStart() == old.getStart() && sequenceLocation.getAlternate().equals(old.getAlternate())) {
+                            evidenceEntries.add(evidenceEntry);
+                        } else {
+                            // Time only the callback itself; t0 was taken before buildCosmic()
+                            long callbackStart = System.currentTimeMillis();
+                            boolean success = callback.processEvidenceEntries(old, evidenceEntries);
+                            t2 += System.currentTimeMillis() - callbackStart;
+                            if (success) {
+                                numberProcessedRecords += evidenceEntries.size();
+                            } else {
+                                ignoredCosmicLines += evidenceEntries.size();
+                            }
+                            old = sequenceLocation;
+                            evidenceEntries.clear();
+                            evidenceEntries.add(evidenceEntry);
+                        }
+                    } else {
+                        ignoredCosmicLines++;
+                    }
+                } else {
+                    ignoredCosmicLines++;
+                }
+                totalNumberRecords++;
+
+                if (totalNumberRecords % 10000 == 0) {
+                    logger.info("totalNumberRecords = {}", totalNumberRecords);
+                    // 100L forces long arithmetic: an int product overflows beyond ~21.4 million processed records
+                    logger.info("numberIndexedRecords = {} ({} %)", numberProcessedRecords,
+                            (numberProcessedRecords * 100L / totalNumberRecords));
+                    logger.info("ignoredCosmicLines = {}", ignoredCosmicLines);
+                    logger.info("buildCosmic time = {}", t1);
+                    logger.info("callback time = {}", t2);
+
+                    t1 = 0;
+                    t2 = 0;
+                }
+            }
+            // NOTE(review): the entries still held in evidenceEntries when the file ends are never passed to the
+            // callback, so the last location looks like it is dropped -- confirm whether a final flush is intended.
+        } finally {
+            logger.info("Done");
+            logger.info("Total number of parsed Cosmic records: {}", totalNumberRecords);
+            logger.info("Number of processed Cosmic records: {}", numberProcessedRecords);
+            NumberFormat formatter = NumberFormat.getInstance();
+            if (logger.isInfoEnabled()) {
+                logger.info("{} cosmic lines ignored: ", formatter.format(ignoredCosmicLines));
+            }
+            if (invalidPositionLines > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines by invalid position", formatter.format(invalidPositionLines));
+            }
+            if (invalidSubstitutionLines > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines by invalid substitution CDS", formatter.format(invalidSubstitutionLines));
+            }
+            if (invalidInsertionLines > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines by invalid insertion CDS", formatter.format(invalidInsertionLines));
+            }
+            if (invalidDeletionLines > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines by invalid deletion CDS", formatter.format(invalidDeletionLines));
+            }
+            if (invalidDuplicationLines > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines because mutation CDS is a duplication", formatter.format(invalidDuplicationLines));
+            }
+            if (invalidMutationCDSOtherReason > 0 && logger.isInfoEnabled()) {
+                logger.info("\t- {} lines because mutation CDS is invalid for other reasons",
+                        formatter.format(invalidMutationCDSOtherReason));
+            }
+        }
+    }
+
+    /**
+     * Classifies a mutation CDS string by the first HGVS marker it contains, checked in the order
+     * ">" (SNV), "del", "ins", "dup".
+     *
+     * @param mutationCds Mutation CDS string
+     * @return The variant type, or {@code null} if none of the markers is present
+     */
+    private static VariantType getVariantType(String mutationCds) {
+        if (mutationCds.contains(HGVS_SNV_CHANGE_SYMBOL)) {
+            return VariantType.SNV;
+        } else if (mutationCds.contains(HGVS_DELETION_TAG)) {
+            return VariantType.DELETION;
+        } else if (mutationCds.contains(HGVS_INSERTION_TAG)) {
+            return VariantType.INSERTION;
+        } else if (mutationCds.contains(HGVS_DUPLICATION_TAG)) {
+            return VariantType.DUPLICATION;
+        } else {
+            return null;
+        }
+    }
+
+    /** Duplications are not supported yet, so every duplication is reported as invalid. */
+    private static boolean parseDuplication(String dup) {
+        // TODO: The only Duplication in Cosmic V70 is a structural variation that is not going to be serialized
+        return false;
+    }
+
+    /**
+     * Sets reference (empty) and alternate (inserted bases, on the positive strand) for an insertion.
+     *
+     * @return false if nothing follows "ins" or what follows is not made only of A, C, G, T (e.g. c.503_508ins30)
+     */
+    private static boolean parseInsertion(String mutationCds, SequenceLocation sequenceLocation) {
+        boolean validVariant = true;
+        String[] insParts = mutationCds.split("ins");
+
+        if (insParts.length > 1) {
+            String insertedNucleotides = insParts[1];
+            if (insertedNucleotides.matches("\\d+") || !insertedNucleotides.matches(VARIANT_STRING_PATTERN)) {
+                //c.503_508ins30
+                validVariant = false;
+            } else {
+                sequenceLocation.setReference("");
+                sequenceLocation.setAlternate(getPositiveStrandString(insertedNucleotides, sequenceLocation.getStrand()));
+            }
+        } else {
+            validVariant = false;
+        }
+
+        return validVariant;
+    }
+
+    /**
+     * Sets reference (deleted bases, on the positive strand) and alternate (empty) for a deletion.
+     *
+     * @return false if nothing follows "del" or what follows is not made only of A, C, G, T
+     */
+    private static boolean parseDeletion(String mutationCds, SequenceLocation sequenceLocation) {
+        boolean validVariant = true;
+        String[] mutationCDSArray = mutationCds.split("del");
+
+        // Only deletions that spell out the deleted nucleotides are supported
+        if (mutationCDSArray.length < 2) { // c.503_508del (usually, deletions of several nucleotides)
+            // TODO: allow these variants
+            validVariant = false;
+        } else if (mutationCDSArray[1].matches("\\d+")
+                || !mutationCDSArray[1].matches(VARIANT_STRING_PATTERN)) { // Avoid allele strings containing Ns, for example
+
validVariant = false;
+        } else {
+            sequenceLocation.setReference(getPositiveStrandString(mutationCDSArray[1], sequenceLocation.getStrand()));
+            sequenceLocation.setAlternate("");
+        }
+
+        return validVariant;
+    }
+
+    /**
+     * Sets reference and alternate (both on the positive strand) for a substitution matching the SNV pattern.
+     *
+     * @return false if the mutation CDS does not match the SNV pattern or either allele is "N"
+     */
+    private static boolean parseSnv(String mutationCds, SequenceLocation sequenceLocation) {
+        boolean validVariant = true;
+        Matcher snvMatcher = snvPattern.matcher(mutationCds);
+
+        if (snvMatcher.matches()) {
+            String ref = snvMatcher.group(REF);
+            String alt = snvMatcher.group(ALT);
+            if (!ref.equalsIgnoreCase("N") && !alt.equalsIgnoreCase("N")) {
+                sequenceLocation.setReference(getPositiveStrandString(ref, sequenceLocation.getStrand()));
+                sequenceLocation.setAlternate(getPositiveStrandString(alt, sequenceLocation.getStrand()));
+            } else {
+                validVariant = false;
+            }
+        } else {
+            validVariant = false;
+        }
+
+        return validVariant;
+    }
+
+    /**
+     * Returns the allele as read on the positive strand: reverse-complemented when strand is "-", unchanged otherwise
+     * (including when strand is null).
+     *
+     * @throws IllegalArgumentException if a "-" strand allele contains a nucleotide that cannot be complemented
+     */
+    private static String getPositiveStrandString(String alleleString, String strand) {
+        // Constant-first comparison so a missing (null) strand is treated as "not negative" instead of throwing
+        if ("-".equals(strand)) {
+            return VariantAnnotationUtils.reverseComplement(alleleString, true);
+        } else {
+            return alleleString;
+        }
+    }
+
+    /**
+     * Builds the evidence entry for one COSMIC record.
+     *
+     * @param name     Evidence source name
+     * @param version  Evidence source version
+     * @param assembly Assembly stored in the entry
+     * @param fields   Tab-separated fields of the record
+     * @return The evidence entry, including somatic information, genomic features, additional properties and bibliography
+     */
+    private static EvidenceEntry buildCosmic(String name, String version, String assembly, String[] fields) {
+        String id = fields[ID_COLUMN];
+        String url = "https://cancer.sanger.ac.uk/cosmic/search?q=" + id;
+
+        EvidenceSource evidenceSource = new EvidenceSource(name, version, null);
+        SomaticInformation somaticInformation = getSomaticInformation(fields);
+        List<GenomicFeature> genomicFeatureList = getGenomicFeature(fields);
+
+        List<Property> additionalProperties = new ArrayList<>();
+        additionalProperties.add(new Property("COSM_ID", "Legacy COSM ID", fields[COSM_ID_COLUMN]));
+        additionalProperties.add(new Property("MUTATION_DESCRIPTION", "Description", fields[MUTATION_DESCRIPTION_COLUMN]));
+        if (StringUtils.isNotEmpty(fields[MUTATION_ZYGOSITY_COLUMN])) {
+            additionalProperties.add(new Property("MUTATION_ZYGOSITY", "Mutation Zygosity", fields[MUTATION_ZYGOSITY_COLUMN]));
+        }
+        additionalProperties.add(new Property("FATHMM_PREDICTION", "FATHMM Prediction", fields[FATHMM_PREDICTION_COLUMN]));
+        // NOTE(review): a literal "0" is prepended to the raw score; presumably the column omits the leading zero -- confirm
+        additionalProperties.add(new Property("FATHMM_SCORE", "FATHMM Score", "0" + fields[FATHMM_SCORE_COLUMN]));
+        additionalProperties.add(new Property("MUTATION_SOMATIC_STATUS", "Mutation Somatic Status",
+                fields[MUTATION_SOMATIC_STATUS_COLUMN]));
+
+        List<String> bibliography = getBibliography(fields[PUBMED_PMID_COLUMN]);
+
+        return new EvidenceEntry(evidenceSource, Collections.emptyList(), somaticInformation,
+                url, id, assembly,
+                getAlleleOriginList(Collections.singletonList(fields[MUTATION_SOMATIC_STATUS_COLUMN])),
+                Collections.emptyList(), genomicFeatureList, null, null, null, null,
+                EthnicCategory.Z, null, null, null, additionalProperties, bibliography);
+    }
+
+    /** Builds the somatic information of a record; each missing value becomes null and underscores become spaces. */
+    private static SomaticInformation getSomaticInformation(String[] fields) {
+        String primarySite = underscoresToSpacesOrNull(fields[PRIMARY_SITE_COLUMN]);
+        String siteSubtype = underscoresToSpacesOrNull(fields[SITE_SUBTYPE_COLUMN]);
+        String primaryHistology = underscoresToSpacesOrNull(fields[PRIMARY_HISTOLOGY_COLUMN]);
+        String histologySubtype = underscoresToSpacesOrNull(fields[HISTOLOGY_SUBTYPE_COLUMN]);
+        String tumourOrigin = underscoresToSpacesOrNull(fields[TUMOUR_ORIGIN_COLUMN]);
+        String sampleSource = underscoresToSpacesOrNull(fields[SAMPLE_SOURCE_COLUMN]);
+
+        return new SomaticInformation(primarySite, siteSubtype, primaryHistology, histologySubtype, tumourOrigin, sampleSource);
+    }
+
+    /** Returns null for a missing value, otherwise the value with every underscore replaced by a space. */
+    private static String underscoresToSpacesOrNull(String value) {
+        return isMissing(value) ? null : value.replace("_", " ");
+    }
+
+    /** Returns a single "PMID:" reference for the given PubMed id, or an empty list when the id is missing. */
+    private static List<String> getBibliography(String bibliographyString) {
+        if (!isMissing(bibliographyString)) {
+            return Collections.singletonList("PMID:" + bibliographyString);
+        }
+
+        return Collections.emptyList();
+    }
+
+    /**
+     * Collects the genomic features of a record: the gene symbol taken before the first underscore of the gene name
+     * column (only when that column contains an underscore), the transcript id from the second column, and the
+     * HGNC column value when it is present and differs from the gene name column.
+     */
+    private static List<GenomicFeature> getGenomicFeature(String[] fields) {
+        List<GenomicFeature> genomicFeatureList = new ArrayList<>(5);
+        if (fields[GENE_NAMES_COLUMN].contains("_")) {
+            genomicFeatureList.add(createGeneGenomicFeature(fields[GENE_NAMES_COLUMN].split("_")[0]));
+        }
+        // Add transcript ID
+        if (StringUtils.isNotEmpty(fields[1])) {
+            genomicFeatureList.add(createGeneGenomicFeature(fields[1], FeatureTypes.transcript));
+        }
+        if (!fields[HGNC_COLUMN].equalsIgnoreCase(fields[GENE_NAMES_COLUMN]) && !isMissing(fields[HGNC_COLUMN])) {
+            genomicFeatureList.add(createGeneGenomicFeature(fields[HGNC_COLUMN]));
+        }
+
+        return genomicFeatureList;
+    }
+
+    /**
+     * Parses the genome position column ("chromosome:start-end") and the strand column of a record.
+     *
+     * @return The location with chromosome, strand, start and end set, or {@code null} if the position is empty or
+     * does not match the expected pattern
+     */
+    private static SequenceLocation parseLocation(String[] fields) {
+        SequenceLocation sequenceLocation = null;
+        String locationString = fields[25];
+        if (StringUtils.isNotEmpty(locationString)) {
+            Matcher matcher = mutationGRCh37GenomePositionPattern.matcher(locationString);
+            if (matcher.matches()) {
+                sequenceLocation = new SequenceLocation();
+                sequenceLocation.setChromosome(getCosmicChromosome(matcher.group(CHROMOSOME)));
+                sequenceLocation.setStrand(fields[26]);
+
+                String mutationCds = fields[HGVS_COLUMN];
+                VariantType variantType = getVariantType(mutationCds);
+                if (VariantType.INSERTION.equals(variantType)) {
+                    // Insertions deliberately get the file's START as end and the file's END as start
+                    sequenceLocation.setEnd(Integer.parseInt(matcher.group(START)));
+                    sequenceLocation.setStart(Integer.parseInt(matcher.group(END)));
+                } else {
+                    sequenceLocation.setStart(Integer.parseInt(matcher.group(START)));
+                    sequenceLocation.setEnd(Integer.parseInt(matcher.group(END)));
+                }
+            }
+        }
+        return sequenceLocation;
+    }
+
+    /** Translates the numeric chromosome codes 23, 24 and 25 to X, Y and MT; any other value is returned unchanged. */
+    private static String getCosmicChromosome(String chromosome) {
+        switch (chromosome) {
+            case "23":
+                return "X";
+            case "24":
+                return "Y";
+            case "25":
+                return "MT";
+            default:
+                return chromosome;
+        }
+    }
+
+    /** Creates a gene feature whose cross-references hold the given symbol. */
+    private static GenomicFeature createGeneGenomicFeature(String gene) {
+        Map<String, String> map = new
HashMap<>(1);
+        map.put(SYMBOL, gene);
+
+        return new GenomicFeature(FeatureTypes.gene, null, map);
+    }
+
+    /** Creates a feature of the given type whose cross-references hold the given identifier under the symbol key. */
+    private static GenomicFeature createGeneGenomicFeature(String featureId, FeatureTypes featureTypes) {
+        Map<String, String> map = new HashMap<>(1);
+        map.put(SYMBOL, featureId);
+        return new GenomicFeature(featureTypes, null, map);
+    }
+
+    /**
+     * Translates origin labels to allele origins through {@link VariantAnnotationUtils#parseAlleleOrigin(String)};
+     * labels without a mapping are skipped.
+     */
+    private static List<AlleleOrigin> getAlleleOriginList(List<String> sourceOriginList) {
+        List<AlleleOrigin> alleleOrigin = new ArrayList<>(sourceOriginList.size());
+        for (String originString : sourceOriginList) {
+            AlleleOrigin alleleOriginValue = VariantAnnotationUtils.parseAlleleOrigin(originString);
+            if (alleleOriginValue != null) {
+                alleleOrigin.add(alleleOriginValue);
+            } else {
+                logger.debug("No SO term found for allele origin {}. Skipping.", originString);
+            }
+        }
+        return alleleOrigin;
+    }
+
+    /**
+     * Tells whether a value carries no information: null, empty, or made up only of the placeholder tokens
+     * "not specified", "NS", "NA", "na", "NULL", "null", spaces, tabs, dots and dashes.
+     */
+    private static boolean isMissing(String string) {
+        if (string == null || string.isEmpty()) {
+            return true;
+        }
+        // "not specified" must be removed before the spaces are, otherwise it can never match
+        return string.replace("not specified", "")
+                .replace(" ", "")
+                .replace("NS", "")
+                .replace("NA", "")
+                .replace("na", "")
+                .replace("NULL", "")
+                .replace("null", "")
+                .replace("\t", "")
+                .replace(".", "")
+                .replace("-", "").isEmpty();
+    }
+}
diff --git a/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java
new file mode 100644
index 00000000..36614184
--- /dev/null
+++ b/biodata-formats/src/main/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserCallback.java
@@ -0,0 +1,10 @@
+package org.opencb.biodata.formats.variant.cosmic;
+
+import org.opencb.biodata.models.sequence.SequenceLocation;
+import org.opencb.biodata.models.variant.avro.EvidenceEntry;
+
+import java.util.List;
+
+public interface CosmicParserCallback {
+    /**
+     * Receives the evidence entries that the parser accumulated for one location.
+     *
+     * @param sequenceLocation Location the entries were accumulated for
+     * @param evidenceEntries  Entries accumulated for that location; the parser clears and reuses this list after the call
+     * @return true if the entries were processed, false if they must be counted as ignored
+     */
+    boolean processEvidenceEntries(SequenceLocation sequenceLocation, List<EvidenceEntry> evidenceEntries);
+}
diff --git a/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java
new file mode 100644
index 00000000..9aba1123
--- /dev/null
+++ b/biodata-formats/src/test/java/org/opencb/biodata/formats/variant/cosmic/CosmicParserTest.java
@@ -0,0 +1,55 @@
+package org.opencb.biodata.formats.variant.cosmic;
+
+import org.junit.Assert;
+import org.junit.Test;
+import org.opencb.biodata.formats.io.FileFormatException;
+import org.opencb.biodata.models.sequence.SequenceLocation;
+import org.opencb.biodata.models.variant.avro.EvidenceEntry;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.List;
+
+public class CosmicParserTest {
+
+    // Implementation of the CosmicParserCallback function: prints what it receives and counts the evidence entries.
+    // Static because it does not use any state of the enclosing test instance.
+    public static class MyCallback implements CosmicParserCallback {
+        private String msg;
+        private int counter;
+
+        public MyCallback(String msg) {
+            this.msg = msg;
+            this.counter = 0;
+        }
+
+        @Override
+        public boolean processEvidenceEntries(SequenceLocation sequenceLocation, List<EvidenceEntry> evidenceEntries) {
+            System.out.println(msg);
+            System.out.println("Sequence location = " + sequenceLocation);
+            System.out.println("Num. evidences = " + evidenceEntries.size());
+            for (EvidenceEntry evidenceEntry : evidenceEntries) {
+                System.out.println("evidences = " + evidenceEntry);
+                counter++;
+            }
+            return true;
+        }
+
+        public int getCounter() {
+            return counter;
+        }
+    }
+
+    @Test
+    public void testCosmicParser() throws IOException, FileFormatException {
+        Path cosmicFile = Paths.get(getClass().getResource("/cosmic.small.tsv.gz").getPath());
+        String version = "v95";
+        String name = "cosmic";
+        String assembly = "GRCh38";
+
+        MyCallback callback = new MyCallback(">>> Testing message");
+
+        CosmicParser.parse(cosmicFile, version, name, assembly, callback);
+        Assert.assertEquals(90, callback.getCounter());
+    }
+}
\ No newline at end of file
diff --git a/biodata-formats/src/test/resources/cosmic.small.tsv.gz b/biodata-formats/src/test/resources/cosmic.small.tsv.gz
new file mode 100644
index 00000000..e1539f82
Binary files /dev/null and b/biodata-formats/src/test/resources/cosmic.small.tsv.gz differ
diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java b/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java
new file mode 100644
index 00000000..d5397a71
--- /dev/null
+++ b/biodata-models/src/main/java/org/opencb/biodata/models/common/DataVersion.java
@@ -0,0 +1,135 @@
+/*
+ * Copyright 2015-2020 OpenCB
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.opencb.biodata.models.common;
+
+import org.opencb.commons.datastore.core.ObjectMap;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Plain data holder describing one version of a data source. Every setter returns this instance so calls can be chained.
+ */
+public class DataVersion {
+
+    private String data;
+    private String name;
+    private String version;
+    private String date;
+    private String species;
+    private String assembly;
+    private List files;
+    private List urls;
+    private ObjectMap attributes;
+
+    /** Creates an instance with null text fields, empty files and urls lists and an empty attributes map. */
+    public DataVersion() {
+        this(null, null, null, null, null, null, new ArrayList<>(), new ArrayList<>(), new ObjectMap());
+    }
+
+    /** Creates an instance holding exactly the given values; the lists and the map are stored, not copied. */
+    public DataVersion(String data, String name, String version, String date, String species, String assembly, List files,
+                       List urls, ObjectMap attributes) {
+        this.data = data;
+        this.name = name;
+        this.version = version;
+        this.date = date;
+        this.species = species;
+        this.assembly = assembly;
+        this.files = files;
+        this.urls = urls;
+        this.attributes = attributes;
+    }
+
+    public String getData() {
+        return this.data;
+    }
+
+    public DataVersion setData(String data) {
+        this.data = data;
+        return this;
+    }
+
+    public String getName() {
+        return this.name;
+    }
+
+    public DataVersion setName(String name) {
+        this.name = name;
+        return this;
+    }
+
+    public String getVersion() {
+        return this.version;
+    }
+
+    public DataVersion setVersion(String version) {
+        this.version = version;
+        return this;
+    }
+
+    public String getDate() {
+        return this.date;
+    }
+
+    public DataVersion setDate(String date) {
+        this.date = date;
+        return this;
+    }
+
+    public String getSpecies() {
+        return this.species;
+    }
+
+    public DataVersion setSpecies(String species) {
+        this.species = species;
+        return this;
+    }
+
+    public String getAssembly() {
+        return this.assembly;
+    }
+
+    public DataVersion setAssembly(String assembly) {
+        this.assembly = assembly;
+        return this;
+    }
+
+    public List getFiles() {
+        return this.files;
+    }
+
+    public DataVersion setFiles(List files) {
+        this.files = files;
+        return this;
+    }
+
+    public List getUrls() {
+        return this.urls;
+    }
+
+    public DataVersion setUrls(List urls) {
+        this.urls = urls;
+        return this;
+    }
+
+    public ObjectMap getAttributes() {
+        return this.attributes;
+    }
+
+    public DataVersion setAttributes(ObjectMap attributes) {
+        this.attributes = attributes;
+        return this;
+    }
+}
diff --git a/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java
new file mode 100644
index 00000000..0d8a6cea
--- /dev/null
+++ b/biodata-models/src/main/java/org/opencb/biodata/models/sequence/SequenceLocation.java
@@ -0,0 +1,87 @@
+package org.opencb.biodata.models.sequence;
+
+/**
+ * Mutable holder for a chromosome, a start/end pair, reference and alternate alleles, and a strand.
+ */
+public class SequenceLocation {
+    private String chromosome;
+    private int start;
+    private int end;
+    private String reference;
+    private String alternate;
+    private String strand;
+
+    /** Creates an empty location: object fields are null and both positions are 0. */
+    public SequenceLocation() {
+    }
+
+    /** Creates a location on the "+" strand. */
+    public SequenceLocation(String chromosome, int start, int end, String reference, String alternate) {
+        this(chromosome, start, end, reference, alternate, "+");
+    }
+
+    /** Creates a location holding exactly the given values. */
+    public SequenceLocation(String chromosome, int start, int end, String reference, String alternate, String strand) {
+        this.chromosome = chromosome;
+        this.start = start;
+        this.end = end;
+        this.reference = reference;
+        this.alternate = alternate;
+        this.strand = strand;
+    }
+
+    @Override
+    public String toString() {
+        return "SequenceLocation{"
+                + "chromosome='" + chromosome + '\''
+                + ", start=" + start
+                + ", end=" + end
+                + ", reference='" + reference + '\''
+                + ", alternate='" + alternate + '\''
+                + ", strand='" + strand + '\''
+                + '}';
+    }
+
+    public String getChromosome() {
+        return this.chromosome;
+    }
+
+    public int getStart() {
+        return this.start;
+    }
+
+    public int getEnd() {
+        return this.end;
+    }
+
+    public String getReference() {
+        return this.reference;
+    }
+
+    public String getAlternate() {
+        return this.alternate;
+    }
+
+    public String getStrand() {
+        return this.strand;
+    }
+
+    public void setChromosome(String chromosome) {
+        this.chromosome = chromosome;
+    }
+
+    public void setStart(int start) {
+        this.start = start;
+    }
+
+    public void setEnd(int end) {
+        this.end = end;
+    }
+
+    public void setReference(String reference) {
+        this.reference = reference;
+    }
+
+    public void setAlternate(String alternate) {
+        this.alternate = alternate;
+    }
+
+    public void setStrand(String strand) {
+        this.strand = strand;
+    }
+}
diff --git a/pom.xml b/pom.xml
index 81ce0082..6dc030db 100644
--- a/pom.xml
+++ b/pom.xml
@@ -38,7 +38,7 @@
- 5.2.2 + 5.3.0-SNAPSHOT 2.14.3 4.4