From 8875ff0e4944c349f8aba13895de695c3de862c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 9 Apr 2024 00:55:40 +0100 Subject: [PATCH 1/4] storage: Allow hgvs filtering via xrefs. #TASK-5878 --- .../core/variant/adaptors/VariantQuery.java | 14 +- .../variant/adaptors/VariantQueryParam.java | 2 +- .../VariantAnnotationModelUtils.java | 114 ++++++++ .../variant/query/ParsedVariantQuery.java | 11 + .../variant/query/VariantQueryParser.java | 2 +- .../core/variant/query/VariantQueryUtils.java | 11 + ...AbstractTwoPhasedVariantQueryExecutor.java | 2 +- .../VariantSearchToVariantConverter.java | 66 ++--- .../adaptors/VariantDBAdaptorTest.java | 61 +++-- .../executors/VariantQueryExecutorTest.java | 246 ++++++++++++++++++ .../VariantAnnotationToPhoenixConverter.java | 29 +-- .../SampleIndexVariantQueryExecutor.java | 6 +- .../HadoopVariantQueryExecutorTest.java | 30 +++ 13 files changed, 490 insertions(+), 104 deletions(-) create mode 100644 opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java create mode 100644 opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java create mode 100644 opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/query/executors/HadoopVariantQueryExecutorTest.java diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQuery.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQuery.java index 65c2bda05e6..9c04ad9c9bf 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQuery.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQuery.java @@ -105,6 +105,15 @@ public VariantQuery sample(String value) { return sample(VariantQueryUtils.QueryOperation.OR, value); } + public VariantQuery sample(List value) { + return sample(VariantQueryUtils.QueryOperation.OR, value); + } + + public VariantQuery sample(VariantQueryUtils.QueryOperation operation, List value) { + put(VariantQueryParam.SAMPLE.key(), value.stream().collect(Collectors.joining(operation.separator()))); + return this; + } + public VariantQuery sample(String... value) { return sample(VariantQueryUtils.QueryOperation.OR, value); } @@ -178,8 +187,9 @@ public VariantQuery includeSampleId(boolean value) { put(VariantQueryParam.INCLUDE_SAMPLE_ID.key(), value); return this; } - public String includeSampleId() { - return getString(VariantQueryParam.INCLUDE_SAMPLE_ID.key()); + + public boolean includeSampleId() { + return getBoolean(VariantQueryParam.INCLUDE_SAMPLE_ID.key()); } public VariantQuery sampleMetadata(boolean value) { diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java index b021b0e1382..f2d03ba6ddb 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java @@ -215,7 +215,7 @@ public final class VariantQueryParam implements QueryParam { public static final String ANNOT_XREF_DESCR = "List of any external reference, these can be genes, proteins or variants. " - + "Accepted IDs include HGNC, Ensembl genes, dbSNP, ClinVar, HPO, Cosmic, ..."; + + "Accepted IDs include HGNC, Ensembl genes, dbSNP, ClinVar, HPO, Cosmic, HGVS ..."; public static final VariantQueryParam ANNOT_XREF = new VariantQueryParam("xref", TEXT_ARRAY, ANNOT_XREF_DESCR); public static final String GENE_DESCR diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java new file mode 100644 index 00000000000..cb317120362 --- /dev/null +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java @@ -0,0 +1,114 @@ +package org.opencb.opencga.storage.core.variant.annotation.converters; + +import org.apache.commons.collections4.CollectionUtils; +import org.opencb.biodata.models.variant.avro.*; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +public class VariantAnnotationModelUtils { + + /** + * Extracts all the XRefs from a VariantAnnotation object. + * Includes: + * - annotation.id + * - annotation.xrefs.id + * - annotation.cytoband.chromosome + cytoband.name + * - annotation.hgvs + * - annotation.consequenceTypes.geneName + * - annotation.consequenceTypes.geneId + * - annotation.consequenceTypes.ensemblGeneId + * - annotation.consequenceTypes.transcriptId + * - annotation.consequenceTypes.ensemblTranscriptId + * - annotation.consequenceTypes.hgvs + * - annotation.consequenceTypes.proteinVariantAnnotation.uniprotAccession + * - annotation.consequenceTypes.proteinVariantAnnotation.uniprotName + * - annotation.consequenceTypes.proteinVariantAnnotation.uniprotVariantId + * - annotation.consequenceTypes.proteinVariantAnnotation.features.id + * - annotation.traitAssociation.id + * - annotation.geneTraitAssociation.hpo + * - annotation.geneTraitAssociation.id + * + * @param variantAnnotation VariantAnnotation object + * @return Set of XRefs + */ + public Set extractXRefs(VariantAnnotation variantAnnotation) { + Set xrefs = new HashSet<>(); + + if (variantAnnotation == null) { + return xrefs; + } + + xrefs.add(variantAnnotation.getId()); + + if (variantAnnotation.getXrefs() != null) { + for (Xref xref : variantAnnotation.getXrefs()) { + if (xref != null) { + xrefs.add(xref.getId()); + } + } + } + + if (variantAnnotation.getCytoband() != null) { + for (Cytoband cytoband : variantAnnotation.getCytoband()) { + // TODO: Why do we need to add the chromosome name? + xrefs.add(cytoband.getChromosome() + cytoband.getName()); + } + } + + if (variantAnnotation.getHgvs() != null) { + xrefs.addAll(variantAnnotation.getHgvs()); + } + + List consequenceTypes = variantAnnotation.getConsequenceTypes(); + if (consequenceTypes != null) { + for (ConsequenceType conseqType : consequenceTypes) { + xrefs.add(conseqType.getGeneName()); + xrefs.add(conseqType.getGeneId()); + xrefs.add(conseqType.getEnsemblGeneId()); + xrefs.add(conseqType.getTranscriptId()); + xrefs.add(conseqType.getEnsemblTranscriptId()); + + if (conseqType.getHgvs() != null) { + xrefs.addAll(conseqType.getHgvs()); + } + + ProteinVariantAnnotation protVarAnnotation = conseqType.getProteinVariantAnnotation(); + if (protVarAnnotation != null) { + + xrefs.add(protVarAnnotation.getUniprotAccession()); + xrefs.add(protVarAnnotation.getUniprotName()); + xrefs.add(protVarAnnotation.getUniprotVariantId()); + + if (protVarAnnotation.getFeatures() != null) { + for (ProteinFeature proteinFeature : protVarAnnotation.getFeatures()) { + xrefs.add(proteinFeature.getId()); + } + } + } + } + + } + + if (CollectionUtils.isNotEmpty(variantAnnotation.getTraitAssociation())) { + for (EvidenceEntry evidenceEntry : variantAnnotation.getTraitAssociation()) { + xrefs.add(evidenceEntry.getId()); + } + } + + if (variantAnnotation.getGeneTraitAssociation() != null) { + for (GeneTraitAssociation geneTrait : variantAnnotation.getGeneTraitAssociation()) { + xrefs.add(geneTrait.getHpo()); + xrefs.add(geneTrait.getId()); + } + } + + // Remove empty strings and nulls + xrefs.remove(""); + xrefs.remove(null); + + return xrefs; + } + +} diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/ParsedVariantQuery.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/ParsedVariantQuery.java index 9c89ffb5b61..5c90df943fc 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/ParsedVariantQuery.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/ParsedVariantQuery.java @@ -226,5 +226,16 @@ public List getIDsAndXrefs() { public boolean isEmpty() { return genes.isEmpty() && variants.isEmpty() && ids.isEmpty() && otherXrefs.isEmpty(); } + + @Override + public String toString() { + final StringBuilder sb = new StringBuilder("VariantQueryXref{"); + sb.append("genes=").append(genes); + sb.append(", variants=").append(variants); + sb.append(", ids=").append(ids); + sb.append(", otherXrefs=").append(otherXrefs); + sb.append('}'); + return sb.toString(); + } } } diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java index 1564ad0fabf..21c1124c88e 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java @@ -851,7 +851,7 @@ public static ParsedVariantQuery.VariantQueryXref parseXrefs(Query query) { if (variant != null) { xrefs.getVariants().add(variant); } else { - if (isVariantAccession(value) || isClinicalAccession(value) || isGeneAccession(value)) { + if (isVariantAccession(value) || isClinicalAccession(value) || isGeneAccession(value) || isHGVS(value)) { xrefs.getOtherXrefs().add(value); } else { genes.add(value); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java index 114379c77a9..c11c3f64634 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java @@ -466,6 +466,17 @@ public static boolean isVariantAccession(String value) { return value.startsWith("rs") || value.startsWith("VAR_"); } + /** + * Determines if the given value is a HGVS. + * + * @param value Value to check + * @return If is a known accession + */ + public static boolean isHGVS(String value) { + // Check regex ':[cnpg].' + return value.contains(":c.") || value.contains(":n.") || value.contains(":p.") || value.contains(":g."); + } + /** * Determines if the given value is a known clinical accession or not. *

diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/AbstractTwoPhasedVariantQueryExecutor.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/AbstractTwoPhasedVariantQueryExecutor.java index a49fe60c3e4..5697998edb1 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/AbstractTwoPhasedVariantQueryExecutor.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/executors/AbstractTwoPhasedVariantQueryExecutor.java @@ -133,7 +133,7 @@ protected boolean shouldGetApproximateCount(QueryOptions options, boolean iterat // } protected int getLimit(QueryOptions options) { - return options.getInt(QueryOptions.LIMIT); + return options.getInt(QueryOptions.LIMIT, -1); } protected int getSkip(QueryOptions options) { diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/search/VariantSearchToVariantConverter.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/search/VariantSearchToVariantConverter.java index 1eef2a8ef5e..62841c0a3b9 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/search/VariantSearchToVariantConverter.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/search/VariantSearchToVariantConverter.java @@ -34,6 +34,7 @@ import org.opencb.opencga.core.api.ParamConstants; import org.opencb.opencga.core.common.JacksonUtils; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; +import org.opencb.opencga.storage.core.variant.annotation.converters.VariantAnnotationModelUtils; import org.opencb.opencga.storage.core.variant.annotation.converters.VariantTraitAssociationToEvidenceEntryConverter; import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; import org.slf4j.Logger; @@ -59,6 +60,8 @@ public class VariantSearchToVariantConverter implements ComplexTypeConverter includeFields; + private final VariantAnnotationModelUtils variantAnnotationModelUtils = new VariantAnnotationModelUtils(); + public VariantSearchToVariantConverter() { this.includeFields = null; } @@ -666,19 +669,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { variantSearchModel.setEnd(variant.getEnd()); variantSearchModel.setType(variant.getType().toString()); - // This field contains all possible IDs: id, dbSNP, names, genes, transcripts, protein, clinvar, hpo, ... - // This will help when searching by variant id. This is added at the end of the method after collecting all IDs - Set xrefs = new HashSet<>(); - xrefs.add(variantSearchModel.getId()); - xrefs.add(variantSearchModel.getVariantId()); - if (variant.getNames() != null && !variant.getNames().isEmpty()) { - variant.getNames().forEach(name -> { - if (name != null) { - xrefs.add(name); - } - }); - } - // convert Study related information convertStudies(variant, variantSearchModel, other); @@ -698,11 +688,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { // Process Variant Annotation VariantAnnotation variantAnnotation = variant.getAnnotation(); if (variantAnnotation != null) { - - if (StringUtils.isNotEmpty(variantAnnotation.getId())) { - xrefs.add(variantAnnotation.getId()); - } - // This object will store all info and descriptions for full-text search Set traits = new HashSet<>(); @@ -721,27 +706,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { } variantSearchModel.setRelease(release); - // Add cytoband names - if (variantAnnotation.getCytoband() != null) { - for (Cytoband cytoband : variantAnnotation.getCytoband()) { - xrefs.add(cytoband.getChromosome() + cytoband.getName()); - } - } - - // Add all XRefs coming from the variant annotation - if (variantAnnotation.getXrefs() != null && !variantAnnotation.getXrefs().isEmpty()) { - variantAnnotation.getXrefs().forEach(xref -> { - if (xref != null) { - xrefs.add(xref.getId()); - } - }); - } - - // Add all HGVS coming from the variant annotation - if (ListUtils.isNotEmpty(variantAnnotation.getHgvs())) { - xrefs.addAll(variantAnnotation.getHgvs()); - } - // Set Genes and Consequence Types List consequenceTypes = variantAnnotation.getConsequenceTypes(); if (consequenceTypes != null) { @@ -782,10 +746,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { } } - xrefs.add(gene); - xrefs.add(conseqType.getGeneId()); - xrefs.add(conseqType.getTranscriptId()); - if (StringUtils.isNotEmpty(conseqType.getBiotype())) { biotypes.add(conseqType.getBiotype()); @@ -855,19 +815,16 @@ public VariantSearchModel convertToStorageType(Variant variant) { trans.append(FIELD_SEP); if (StringUtils.isNotEmpty(protVarAnnotation.getUniprotAccession())) { trans.append(protVarAnnotation.getUniprotAccession()); - xrefs.add(protVarAnnotation.getUniprotAccession()); } trans.append(FIELD_SEP); if (StringUtils.isNotEmpty(protVarAnnotation.getUniprotName())) { trans.append(protVarAnnotation.getUniprotName()); - xrefs.add(protVarAnnotation.getUniprotName()); } trans.append(FIELD_SEP); if (StringUtils.isNotEmpty(protVarAnnotation.getUniprotVariantId())) { trans.append(protVarAnnotation.getUniprotVariantId()); - xrefs.add(protVarAnnotation.getUniprotVariantId()); } trans.append(FIELD_SEP).append(protVarAnnotation.getPosition() == null @@ -908,8 +865,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { if (protVarAnnotation.getFeatures() != null) { for (ProteinFeature proteinFeature : protVarAnnotation.getFeatures()) { if (StringUtils.isNotEmpty(proteinFeature.getId())) { - // We store them in xrefs and traits, the number of these IDs is very small - xrefs.add(proteinFeature.getId()); traits.add("PD" + FIELD_SEP + proteinFeature.getId() + FIELD_SEP + proteinFeature.getDescription()); } @@ -992,9 +947,6 @@ public VariantSearchModel convertToStorageType(Variant variant) { List clinical = VariantQueryUtils.buildClinicalCombinations(variantAnnotation); for (EvidenceEntry ev : variantAnnotation.getTraitAssociation()) { if (ev.getSource() != null && StringUtils.isNotEmpty(ev.getSource().getName())) { - if (StringUtils.isNotEmpty(ev.getId())) { - xrefs.add(ev.getId()); - } if ("clinvar".equalsIgnoreCase(ev.getSource().getName())) { String clinSigSuffix = ""; if (ev.getVariantClassification() != null @@ -1064,6 +1016,18 @@ public VariantSearchModel convertToStorageType(Variant variant) { variantSearchModel.setOther(other); } + // This field contains all possible IDs: id, dbSNP, names, genes, transcripts, protein, clinvar, hpo, ... + // This will help when searching by variant id. This is added at the end of the method after collecting all IDs + Set xrefs = variantAnnotationModelUtils.extractXRefs(variant.getAnnotation()); + xrefs.add(variantSearchModel.getId()); + xrefs.add(variantSearchModel.getVariantId()); + if (variant.getNames() != null && !variant.getNames().isEmpty()) { + variant.getNames().forEach(name -> { + if (name != null) { + xrefs.add(name); + } + }); + } variantSearchModel.setXrefs(new ArrayList<>(xrefs)); return variantSearchModel; } diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java index 1f305574a64..010c72b1ea4 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java @@ -30,9 +30,7 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantFileMetadata; import org.opencb.biodata.models.variant.avro.*; -import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.models.variant.stats.VariantStats; -import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -558,28 +556,45 @@ public void testGetAllVariants_variantId() { @Test public void testGetAllVariants_xref() { - Query query = new Query(ANNOT_XREF.key(), "3:108634973:C:A,rs2032582,HP:0001250,VAR_048225,Q9BY64,ENSG00000250026,TMPRSS11B,COSM1421316"); - queryResult = query(query, null); - assertThat(queryResult, everyResult(allVariantsSummary, anyOf( - hasAnnotation(at("3:108634973:C:A")), - with("id", Variant::getId, is("rs2032582")), - hasAnnotation(with("GeneTraitAssociation", VariantAnnotation::getGeneTraitAssociation, - hasItem(with("HPO", GeneTraitAssociation::getHpo, is("HP:0001250"))))), - hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, - hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, - with("UniprotVariantId", ProteinVariantAnnotation::getUniprotVariantId, is("VAR_048225")))))), - hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, - hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, - with("UniprotName", ProteinVariantAnnotation::getUniprotAccession, is("Q9BY64")))))), - hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, - hasItem(with("EnsemblGene", ConsequenceType::getGeneId, is("ENSG00000250026"))))), - hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, - hasItem(with("GeneName", ConsequenceType::getGeneName, is("TMPRSS11B"))))) -// hasAnnotation(with("VariantTraitAssociation", VariantAnnotation::getVariantTraitAssociation, -// with("Cosmic", VariantTraitAssociation::getCosmic, -// hasItem(with("MutationId", Cosmic::getMutationId, is("COSM1421316")))))) - ))); + Map> matchers = new HashMap<>(); + matchers.put("3:108634973:C:A", hasAnnotation(at("3:108634973:C:A"))); + matchers.put("rs2032582", with("id", Variant::getId, is("rs2032582"))); + matchers.put("HP:0001250", hasAnnotation(with("GeneTraitAssociation", VariantAnnotation::getGeneTraitAssociation, + hasItem(with("HPO", GeneTraitAssociation::getHpo, is("HP:0001250")))))); + matchers.put("VAR_048225", hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, + hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("UniprotVariantId", ProteinVariantAnnotation::getUniprotVariantId, is("VAR_048225"))))))); + matchers.put("Q9BY64", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( + with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("UniprotAccession", ProteinVariantAnnotation::getUniprotAccession, + is("Q9BY64"))))))); + matchers.put("ENSG00000250026", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( + with("GeneId", ConsequenceType::getGeneId, + is("ENSG00000250026")))))); + matchers.put("TMPRSS11B", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( + with("GeneName", ConsequenceType::getGeneName, + is("TMPRSS11B")))))); + matchers.put("COSM1421316", hasAnnotation(with("TraitAssociation", VariantAnnotation::getTraitAssociation, hasItem( + with("Cosmic", EvidenceEntry::getId, is("COSM1421316")))))); + + // Query one by one + for (Map.Entry> entry : matchers.entrySet()) { + Query query = new Query(ANNOT_XREF.key(), entry.getKey()); + queryResult = query(query, null); + assertThat(entry.getKey(), queryResult, everyResult(allVariantsSummary, allOf(entry.getValue()))); + query = new Query(ID.key(), entry.getKey()); + queryResult = query(query, null); + assertThat(entry.getKey(), queryResult, everyResult(allVariantsSummary, allOf(entry.getValue()))); + } + + // Query by all xrefs + Query query = new Query(ANNOT_XREF.key(), matchers.keySet()); + queryResult = query(query, null); + assertThat(queryResult, everyResult(allVariantsSummary, anyOf(matchers.values()))); } @Test diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java new file mode 100644 index 00000000000..9da19716cfd --- /dev/null +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java @@ -0,0 +1,246 @@ +package org.opencb.opencga.storage.core.variant.query.executors; + +import org.hamcrest.Matcher; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.opencb.biodata.models.variant.Variant; +import org.opencb.biodata.models.variant.VariantFileMetadata; +import org.opencb.biodata.models.variant.avro.*; +import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.commons.datastore.core.Query; +import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.opencga.core.response.VariantQueryResult; +import org.opencb.opencga.storage.core.StoragePipelineResult; +import org.opencb.opencga.storage.core.exceptions.StorageEngineException; +import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; +import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; +import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; +import org.opencb.opencga.storage.core.variant.VariantStorageOptions; +import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass; +import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; +import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery; +import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; +import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; +import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; +import org.opencb.opencga.storage.core.variant.query.projection.VariantQueryProjection; +import org.opencb.opencga.storage.core.variant.stats.DefaultVariantStatisticsManager; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.nio.file.Paths; +import java.util.*; + +import static org.hamcrest.CoreMatchers.hasItem; +import static org.hamcrest.CoreMatchers.is; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.opencb.opencga.storage.core.variant.adaptors.VariantMatchers.*; + +@Ignore +public abstract class VariantQueryExecutorTest extends VariantStorageBaseTest { + + private static Logger logger = LoggerFactory.getLogger(VariantQueryExecutorTest.class); + + protected boolean fileIndexed; + private StudyMetadata studyMetadata; + private VariantFileMetadata fileMetadata; + private int numVariants; + private DBAdaptorVariantQueryExecutor dbQueryExecutor; + private List variantQueryExecutors; + + @Before + public void setUp() throws Exception { + + VariantDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(); + VariantStorageMetadataManager metadataManager = dbAdaptor.getMetadataManager(); + if (!fileIndexed) { + studyMetadata = newStudyMetadata(); +// variantSource = new VariantSource(smallInputUri.getPath(), "testAlias", "testStudy", "Study for testing purposes"); + clearDB(DB_NAME); + ObjectMap params = new ObjectMap() + .append(VariantStorageOptions.ASSEMBLY.key(), "GRCH38") + .append(VariantStorageOptions.ANNOTATE.key(), true) + .append(VariantStorageOptions.ANNOTATION_CHECKPOINT_SIZE.key(), 500) + .append(VariantStorageOptions.STATS_CALCULATE.key(), true); + + StoragePipelineResult etlResult = runDefaultETL(smallInputUri, getVariantStorageEngine(), studyMetadata, params); + fileMetadata = variantStorageEngine.getVariantReaderUtils().readVariantFileMetadata(Paths.get(etlResult.getTransformResult().getPath()).toUri()); + numVariants = getExpectedNumLoadedVariants(fileMetadata); + fileIndexed = true; + Integer indexedFileId = metadataManager.getIndexedFiles(studyMetadata.getId()).iterator().next(); + + //Calculate stats + QueryOptions options = new QueryOptions(VariantStorageOptions.STUDY.key(), STUDY_NAME) + .append(VariantStorageOptions.LOAD_BATCH_SIZE.key(), 100) + .append(DefaultVariantStatisticsManager.OUTPUT, outputUri) + .append(DefaultVariantStatisticsManager.OUTPUT_FILE_NAME, "cohort1.cohort2.stats"); + Iterator iterator = metadataManager.getFileMetadata(studyMetadata.getId(), indexedFileId).getSamples().iterator(); + + /** Create cohorts **/ + HashSet cohort1 = new HashSet<>(); + cohort1.add(metadataManager.getSampleName(studyMetadata.getId(), iterator.next())); + cohort1.add(metadataManager.getSampleName(studyMetadata.getId(), iterator.next())); + + HashSet cohort2 = new HashSet<>(); + cohort2.add(metadataManager.getSampleName(studyMetadata.getId(), iterator.next())); + cohort2.add(metadataManager.getSampleName(studyMetadata.getId(), iterator.next())); + + Map> cohorts = new HashMap<>(); + cohorts.put("cohort1", cohort1); + cohorts.put("cohort2", cohort2); + metadataManager.registerCohorts(studyMetadata.getName(), cohorts); + + variantStorageEngine.calculateStats(studyMetadata.getName(), + new ArrayList<>(cohorts.keySet()), options); + + + variantQueryExecutors = variantStorageEngine.getVariantQueryExecutors(); + dbQueryExecutor = null; + for (VariantQueryExecutor variantQueryExecutor : variantQueryExecutors) { + if (variantQueryExecutor instanceof DBAdaptorVariantQueryExecutor) { + dbQueryExecutor = (DBAdaptorVariantQueryExecutor) variantQueryExecutor; + break; + } + } + Assert.assertNotNull(dbQueryExecutor); + } + } + + @Test + public void testXRefRs() throws StorageEngineException { + Map> matchers = new LinkedHashMap<>(); + matchers.put("3:108634973:C:A", hasAnnotation(at("3:108634973:C:A"))); +// matchers.put("rs2032582", with("id", Variant::getId, is("rs2032582"))); + matchers.put("HP:0001250", hasAnnotation(with("GeneTraitAssociation", VariantAnnotation::getGeneTraitAssociation, + hasItem(with("HPO", GeneTraitAssociation::getHpo, is("HP:0001250")))))); + matchers.put("VAR_031174", hasAnnotation(with("ConsequenceType", VariantAnnotation::getConsequenceTypes, + hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("UniprotVariantId", ProteinVariantAnnotation::getUniprotVariantId, is("VAR_031174"))))))); +// matchers.put("Q9BY64", hasAnnotation( +// with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( +// with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, +// with("UniprotAccession", ProteinVariantAnnotation::getUniprotAccession, +// is("Q9BY64"))))))); + matchers.put("ENSG00000170925", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( + with("GeneId", ConsequenceType::getGeneId, + is("ENSG00000170925")))))); + matchers.put("TEX13B", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, hasItem( + with("GeneName", ConsequenceType::getGeneName, + is("TEX13B")))))); + matchers.put("COSV60260399", hasAnnotation(with("TraitAssociation", VariantAnnotation::getTraitAssociation, hasItem( + with("Cosmic", EvidenceEntry::getId, is("COSV60260399")))))); + matchers.put("ENST00000341832.11(ENSG00000248333):c.356-1170A>G", hasAnnotation(with("HGVS", VariantAnnotation::getHgvs, hasItem( + is("ENST00000341832.11(ENSG00000248333):c.356-1170A>G"))))); + + for (Map.Entry> entry : matchers.entrySet()) { + testQuery(new VariantQuery().xref(entry.getKey()), new QueryOptions(), entry.getValue()); + VariantQueryResult result = testQuery(new VariantQuery().xref(entry.getKey()) + .study(studyMetadata.getName()) + .includeSampleId(true) + .includeSampleAll(), new QueryOptions(), entry.getValue()); + if (result.getNumResults() == 1) { + for (SampleEntry sample : result.first().getStudies().get(0).getSamples()) { + if (GenotypeClass.MAIN_ALT.test(sample.getData().get(0))) { + testQuery(new VariantQuery().xref(entry.getKey()) + .study(studyMetadata.getName()) + .includeSampleId(true) + .sample(sample.getSampleId()), + new QueryOptions(), entry.getValue()); + } + } + } else { + List samples = result.first().getStudies().get(0).getOrderedSamplesName(); + testQuery(new VariantQuery().xref(entry.getKey()) + .study(studyMetadata.getName()) + .includeSampleId(true) + .sample(samples), + new QueryOptions(), entry.getValue()); + } +// Variant v = result.first(); +// +// for (SampleEntry sample : v.getStudies().get(0).getSamples()) { +// if (GenotypeClass.MAIN_ALT.test(sample.getData().get(0))) { +// testQuery(new VariantQuery().xref(entry.getKey()) +// .study(studyMetadata.getName()) +// .includeSampleId(true) +// .sample(sample.getSampleId()), +// new QueryOptions(), entry.getValue()); +// } +// } + } + } + + public VariantQueryResult testQuery(Query query, QueryOptions options, Matcher matcher) throws StorageEngineException { + logger.info(""); + logger.info(""); + logger.info("####################################################"); + logger.info("########## Testing query " + query.toJson()); + logger.info("####################################################"); + Assert.assertTrue(dbQueryExecutor.canUseThisExecutor(query, options)); + + ParsedVariantQuery variantQuery = variantStorageEngine.parseQuery(query, options); + VariantQueryResult expected = dbQueryExecutor.get(new Query(variantQuery.getQuery()), new QueryOptions(options)); + + VariantQueryResult unfilteredResult = null; + VariantQueryResult result = null; + if (matcher != null) { + Query emptyQuery = new Query(); + List fileNames = new LinkedList<>(); + List sampleNames = new LinkedList<>(); + List studyNames = new LinkedList<>(); + emptyQuery.put(VariantQueryParam.INCLUDE_FILE.key(), VariantQueryUtils.NONE); + emptyQuery.put(VariantQueryParam.INCLUDE_SAMPLE.key(), VariantQueryUtils.NONE); + emptyQuery.put(VariantQueryParam.INCLUDE_STUDY.key(), VariantQueryUtils.NONE); + emptyQuery.putIfNotNull(VariantQueryParam.INCLUDE_SAMPLE_ID.key(), query.get(VariantQueryParam.INCLUDE_SAMPLE_ID.key())); + emptyQuery.putIfNotNull(VariantQueryParam.INCLUDE_GENOTYPE.key(), query.get(VariantQueryParam.INCLUDE_GENOTYPE.key())); + emptyQuery.putIfNotNull(VariantQueryParam.INCLUDE_SAMPLE_DATA.key(), query.get(VariantQueryParam.INCLUDE_SAMPLE_DATA.key())); + for (VariantQueryProjection.StudyVariantQueryProjection study : variantQuery.getProjection().getStudies().values()) { + studyNames.add(study.getStudyMetadata().getName()); + for (Integer file : study.getFiles()) { + String fileName = metadataManager.getFileName(study.getStudyMetadata().getId(), file); + fileNames.add(fileName); + } + for (Integer sample : study.getSamples()) { + String sampleName = metadataManager.getSampleName(study.getStudyMetadata().getId(), sample); + sampleNames.add(sampleName); + } + } + if (!studyNames.isEmpty()) { + emptyQuery.put(VariantQueryParam.INCLUDE_STUDY.key(), studyNames); + } + if (!fileNames.isEmpty()) { + emptyQuery.put(VariantQueryParam.INCLUDE_FILE.key(), fileNames); + } + if (!sampleNames.isEmpty()) { + emptyQuery.put(VariantQueryParam.INCLUDE_SAMPLE.key(), sampleNames); + } + QueryOptions emptyOptions = new QueryOptions(); + emptyOptions.putIfNotEmpty(QueryOptions.INCLUDE, options.getString(QueryOptions.INCLUDE)); + emptyOptions.putIfNotEmpty(QueryOptions.EXCLUDE, options.getString(QueryOptions.EXCLUDE)); + unfilteredResult = dbQueryExecutor.get(emptyQuery, emptyOptions); + } + + for (VariantQueryExecutor variantQueryExecutor : variantQueryExecutors) { + if (variantQueryExecutor.canUseThisExecutor(query, options)) { + logger.info("##########################"); + logger.info("########## Testing " + variantQueryExecutor.getClass().getSimpleName() + " with query " + query.toJson()); + result = variantQueryExecutor.get(new Query(variantQuery.getQuery()), new QueryOptions(options)); + logger.info("########## Num results : " + result.getNumResults()); + logger.info("##########################"); + Assert.assertEquals(expected.getResults(), result.getResults()); + + assertThat(result, numResults(gt(0))); + + if (matcher != null) { + assertThat(result, everyResult(unfilteredResult, matcher)); + } + } + } + // Return any result. + return result; + } + +} diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/annotation/VariantAnnotationToPhoenixConverter.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/annotation/VariantAnnotationToPhoenixConverter.java index c9bb8646c04..d991c8a3c30 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/annotation/VariantAnnotationToPhoenixConverter.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/converters/annotation/VariantAnnotationToPhoenixConverter.java @@ -16,7 +16,6 @@ package org.opencb.opencga.storage.hadoop.variant.converters.annotation; -import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.StringUtils; import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.util.Bytes; @@ -24,6 +23,7 @@ import org.opencb.biodata.models.variant.avro.*; import org.opencb.biodata.tools.commons.Converter; import org.opencb.commons.utils.CompressionUtils; +import org.opencb.opencga.storage.core.variant.annotation.converters.VariantAnnotationModelUtils; import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.PhoenixHelper; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; @@ -62,6 +62,7 @@ public class VariantAnnotationToPhoenixConverter extends AbstractPhoenixConverte } private int annotationId; + private VariantAnnotationModelUtils variantAnnotationModelUtils = new VariantAnnotationModelUtils(); @Deprecated public VariantAnnotationToPhoenixConverter(byte[] columnFamily) { @@ -114,10 +115,6 @@ public VariantAnnotationToPhoenixConverter(byte[] columnFamily, int annotationId // Set hpo = new HashSet<>(); Set drugs = new HashSet<>(); Set proteinKeywords = new HashSet<>(); - // Contains all the xrefs, and the id, the geneNames and transcripts - Set xrefs = new HashSet<>(); - - addNotNull(xrefs, variantAnnotation.getId()); List consequenceTypes = variantAnnotation.getConsequenceTypes() == null ? Collections.emptyList() @@ -125,7 +122,9 @@ public VariantAnnotationToPhoenixConverter(byte[] columnFamily, int annotationId for (ConsequenceType consequenceType : consequenceTypes) { addNotNull(genes, consequenceType.getGeneName()); addNotNull(genes, consequenceType.getGeneId()); + addNotNull(genes, consequenceType.getEnsemblGeneId()); addNotNull(transcripts, consequenceType.getTranscriptId()); + addNotNull(transcripts, consequenceType.getEnsemblTranscriptId()); addNotNull(biotype, consequenceType.getBiotype()); addAllNotNull(flags, consequenceType.getTranscriptFlags()); @@ -192,32 +191,14 @@ public VariantAnnotationToPhoenixConverter(byte[] columnFamily, int annotationId if (proteinVariantAnnotation.getKeywords() != null) { proteinKeywords.addAll(proteinVariantAnnotation.getKeywords()); } - addNotNull(xrefs, proteinVariantAnnotation.getUniprotName()); - addNotNull(xrefs, proteinVariantAnnotation.getUniprotAccession()); - addNotNull(xrefs, proteinVariantAnnotation.getUniprotVariantId()); } } - if (CollectionUtils.isNotEmpty(variantAnnotation.getTraitAssociation())) { - for (EvidenceEntry evidenceEntry : variantAnnotation.getTraitAssociation()) { - addNotNull(xrefs, evidenceEntry.getId()); - } - } - - xrefs.addAll(genes); - xrefs.addAll(transcripts); - if (variantAnnotation.getXrefs() != null) { - for (Xref xref : variantAnnotation.getXrefs()) { - addNotNull(xrefs, xref.getId()); - } - } if (variantAnnotation.getGeneTraitAssociation() != null) { for (GeneTraitAssociation geneTrait : variantAnnotation.getGeneTraitAssociation()) { addNotNull(geneTraitName, geneTrait.getName()); addNotNull(geneTraitId, geneTrait.getId()); - addNotNull(xrefs, geneTrait.getHpo()); - addNotNull(xrefs, geneTrait.getId()); } } @@ -235,6 +216,8 @@ public VariantAnnotationToPhoenixConverter(byte[] columnFamily, int annotationId geneSoFlag.remove(null); soFlag.remove(null); + Set xrefs = variantAnnotationModelUtils.extractXRefs(variantAnnotation); + map.put(CHROMOSOME, variantAnnotation.getChromosome()); map.put(POSITION, variantAnnotation.getStart()); map.put(REFERENCE, variantAnnotation.getReference()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexVariantQueryExecutor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexVariantQueryExecutor.java index 64cf8669153..2e1b74b9f88 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexVariantQueryExecutor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/SampleIndexVariantQueryExecutor.java @@ -131,7 +131,9 @@ protected Object getOrIterator(Query query, QueryOptions inputOptions, boolean i // Ensure results are sorted and it's not counting from variants dbAdaptor options.put(QueryOptions.SORT, true); options.put(QueryOptions.COUNT, false); - options.put(QueryOptions.LIMIT, tmpLimit); + if (limit > 0) { + options.put(QueryOptions.LIMIT, tmpLimit); + } MultiVariantDBIterator variantDBIterator = dbAdaptor.iterator( new org.opencb.opencga.storage.core.variant.adaptors.iterators.DelegatedVariantDBIterator(variants) { @@ -166,7 +168,7 @@ public void close() throws Exception { } // Ensure limit - if (result.getNumResults() > limit) { + if (limit > 0 && result.getNumResults() > limit) { result.setResults(result.getResults().subList(0, limit)); result.setNumResults(limit); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/query/executors/HadoopVariantQueryExecutorTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/query/executors/HadoopVariantQueryExecutorTest.java new file mode 100644 index 00000000000..7cc495bbe63 --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/query/executors/HadoopVariantQueryExecutorTest.java @@ -0,0 +1,30 @@ +package org.opencb.opencga.storage.hadoop.variant.query.executors; + +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExternalResource; +import org.opencb.opencga.core.testclassification.duration.LongTests; +import org.opencb.opencga.storage.core.variant.query.executors.VariantQueryExecutorTest; +import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest; +import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils; + +import static org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils.printVariants; + + +@Category(LongTests.class) +public class HadoopVariantQueryExecutorTest extends VariantQueryExecutorTest implements HadoopVariantStorageTest { + + @ClassRule + public static ExternalResource externalResource = new HadoopExternalResource(); + + @Override + @Before + public void setUp() throws Exception { + boolean firstRun = !super.fileIndexed; + super.setUp(); + if (firstRun) { + VariantHbaseTestUtils.printVariants(getVariantStorageEngine().getDBAdaptor(), newOutputUri()); + } + } +} From 9d315933453c6f9d392ab08b213da76d8c4fa4fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Wed, 10 Apr 2024 16:26:44 +0100 Subject: [PATCH 2/4] storage: Identify ProteinFeatureID as an xref #TASK-5878 --- .../variant/adaptors/VariantQueryParam.java | 2 +- .../VariantAnnotationModelUtils.java | 8 --- .../variant/query/VariantQueryParser.java | 6 +- .../core/variant/query/VariantQueryUtils.java | 15 +++++ .../executors/VariantQueryExecutorTest.java | 60 ++++++++++++------- .../solr/VariantSolrExternalResource.java | 1 + 6 files changed, 62 insertions(+), 30 deletions(-) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java index f2d03ba6ddb..c10fd5def09 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java @@ -41,7 +41,7 @@ public final class VariantQueryParam implements QueryParam { private static final String ACCEPTS_AND_OR = "Accepts AND (" + AND + ") and OR (" + OR + ") operators."; public static final String ID_DESCR - = "List of IDs, these can be rs IDs (dbSNP) or variants in the format chrom:start:ref:alt, e.g. rs116600158,19:7177679:C:T"; + = "List of variant IDs in the format chrom:start:ref:alt, e.g. 19:7177679:C:T"; public static final VariantQueryParam ID = new VariantQueryParam("id", TEXT_ARRAY, ID_DESCR); public static final String REGION_DESCR diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java index cb317120362..c31b7137236 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/annotation/converters/VariantAnnotationModelUtils.java @@ -14,7 +14,6 @@ public class VariantAnnotationModelUtils { * Includes: * - annotation.id * - annotation.xrefs.id - * - annotation.cytoband.chromosome + cytoband.name * - annotation.hgvs * - annotation.consequenceTypes.geneName * - annotation.consequenceTypes.geneId @@ -50,13 +49,6 @@ public Set extractXRefs(VariantAnnotation variantAnnotation) { } } - if (variantAnnotation.getCytoband() != null) { - for (Cytoband cytoband : variantAnnotation.getCytoband()) { - // TODO: Why do we need to add the chromosome name? - xrefs.add(cytoband.getChromosome() + cytoband.getName()); - } - } - if (variantAnnotation.getHgvs() != null) { xrefs.addAll(variantAnnotation.getHgvs()); } diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java index 21c1124c88e..c28e76eab24 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java @@ -851,7 +851,11 @@ public static ParsedVariantQuery.VariantQueryXref parseXrefs(Query query) { if (variant != null) { xrefs.getVariants().add(variant); } else { - if (isVariantAccession(value) || isClinicalAccession(value) || isGeneAccession(value) || isHGVS(value)) { + if (isVariantAccession(value) + || isClinicalAccession(value) + || isGeneAccession(value) + || isHGVS(value) + || isProteinFeatureId(value)) { xrefs.getOtherXrefs().add(value); } else { genes.add(value); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java index c11c3f64634..371b7b3239a 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java @@ -474,6 +474,9 @@ public static boolean isVariantAccession(String value) { */ public static boolean isHGVS(String value) { // Check regex ':[cnpg].' + // HGVC examples : + // - "1:g.65325832G>A" + // - "1:g.65325832_65325833insA" return value.contains(":c.") || value.contains(":n.") || value.contains(":p.") || value.contains(":g."); } @@ -505,6 +508,18 @@ public static boolean isGeneAccession(String value) { return isHpo(value) || value.startsWith("OMIM:") || value.startsWith("umls:"); } + /** + * Determines if the given value is a valid protein feature id. + *

+ * Protein feature id starts with 'PRO_', 'VAR_' or 'VSP_' + * + * @param value Value to check + * @return If is a known accession + */ + public static boolean isProteinFeatureId(String value) { + return value.startsWith("PRO_") | value.startsWith("VAR_") | value.startsWith("VSP_"); + } + /** * Determines if the given value is a HPO term or not. *

diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java index 9da19716cfd..b5fdde0fb85 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/query/executors/VariantQueryExecutorTest.java @@ -1,10 +1,7 @@ package org.opencb.opencga.storage.core.variant.query.executors; import org.hamcrest.Matcher; -import org.junit.Assert; -import org.junit.Before; -import org.junit.Ignore; -import org.junit.Test; +import org.junit.*; import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantFileMetadata; import org.opencb.biodata.models.variant.avro.*; @@ -25,6 +22,7 @@ import org.opencb.opencga.storage.core.variant.query.ParsedVariantQuery; import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils; import org.opencb.opencga.storage.core.variant.query.projection.VariantQueryProjection; +import org.opencb.opencga.storage.core.variant.solr.VariantSolrExternalResource; import org.opencb.opencga.storage.core.variant.stats.DefaultVariantStatisticsManager; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -49,11 +47,15 @@ public abstract class VariantQueryExecutorTest extends VariantStorageBaseTest { private DBAdaptorVariantQueryExecutor dbQueryExecutor; private List variantQueryExecutors; + @ClassRule + public static VariantSolrExternalResource solr = new VariantSolrExternalResource(); + @Before public void setUp() throws Exception { VariantDBAdaptor dbAdaptor = getVariantStorageEngine().getDBAdaptor(); VariantStorageMetadataManager metadataManager = dbAdaptor.getMetadataManager(); + solr.configure(variantStorageEngine); if (!fileIndexed) { studyMetadata = newStudyMetadata(); // variantSource = new VariantSource(smallInputUri.getPath(), "testAlias", "testStudy", "Study for testing purposes"); @@ -94,6 +96,9 @@ public void setUp() throws Exception { variantStorageEngine.calculateStats(studyMetadata.getName(), new ArrayList<>(cohorts.keySet()), options); + solr.configure(variantStorageEngine); + variantStorageEngine.secondaryIndex(); + Assert.assertTrue(variantStorageEngine.secondaryAnnotationIndexActiveAndAlive()); variantQueryExecutors = variantStorageEngine.getVariantQueryExecutors(); dbQueryExecutor = null; @@ -134,6 +139,21 @@ public void testXRefRs() throws StorageEngineException { with("Cosmic", EvidenceEntry::getId, is("COSV60260399")))))); matchers.put("ENST00000341832.11(ENSG00000248333):c.356-1170A>G", hasAnnotation(with("HGVS", VariantAnnotation::getHgvs, hasItem( is("ENST00000341832.11(ENSG00000248333):c.356-1170A>G"))))); + matchers.put("VSP_039324", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, + hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("Features", ProteinVariantAnnotation::getFeatures, + hasItem(with("id", ProteinFeature::getId, is("VSP_039324"))))))))); + matchers.put("VAR_081776", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, + hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("Features", ProteinVariantAnnotation::getFeatures, + hasItem(with("id", ProteinFeature::getId, is("VAR_081776"))))))))); + matchers.put("PRO_0000211180", hasAnnotation( + with("ConsequenceType", VariantAnnotation::getConsequenceTypes, + hasItem(with("ProteinVariantAnnotation", ConsequenceType::getProteinVariantAnnotation, + with("Features", ProteinVariantAnnotation::getFeatures, + hasItem(with("id", ProteinFeature::getId, is("PRO_0000211180"))))))))); for (Map.Entry> entry : matchers.entrySet()) { testQuery(new VariantQuery().xref(entry.getKey()), new QueryOptions(), entry.getValue()); @@ -159,17 +179,6 @@ public void testXRefRs() throws StorageEngineException { .sample(samples), new QueryOptions(), entry.getValue()); } -// Variant v = result.first(); -// -// for (SampleEntry sample : v.getStudies().get(0).getSamples()) { -// if (GenotypeClass.MAIN_ALT.test(sample.getData().get(0))) { -// testQuery(new VariantQuery().xref(entry.getKey()) -// .study(studyMetadata.getName()) -// .includeSampleId(true) -// .sample(sample.getSampleId()), -// new QueryOptions(), entry.getValue()); -// } -// } } } @@ -177,8 +186,15 @@ public VariantQueryResult testQuery(Query query, QueryOptions options, logger.info(""); logger.info(""); logger.info("####################################################"); - logger.info("########## Testing query " + query.toJson()); + logger.info("########## TEST QUERY :" + query.toJson()); logger.info("####################################################"); + logger.info("## Allowed VariantQueryExecutors:"); + for (VariantQueryExecutor variantQueryExecutor : variantQueryExecutors) { + if (variantQueryExecutor.canUseThisExecutor(query, options)) { + logger.info("## - " + variantQueryExecutor.getClass().getSimpleName()); + } + } + logger.info("## Using DBAdaptorVariantQueryExecutor for expected results"); Assert.assertTrue(dbQueryExecutor.canUseThisExecutor(query, options)); ParsedVariantQuery variantQuery = variantStorageEngine.parseQuery(query, options); @@ -187,6 +203,7 @@ public VariantQueryResult testQuery(Query query, QueryOptions options, VariantQueryResult unfilteredResult = null; VariantQueryResult result = null; if (matcher != null) { + logger.info("## Unfiltered query for comparison"); Query emptyQuery = new Query(); List fileNames = new LinkedList<>(); List sampleNames = new LinkedList<>(); @@ -225,11 +242,14 @@ public VariantQueryResult testQuery(Query query, QueryOptions options, for (VariantQueryExecutor variantQueryExecutor : variantQueryExecutors) { if (variantQueryExecutor.canUseThisExecutor(query, options)) { - logger.info("##########################"); - logger.info("########## Testing " + variantQueryExecutor.getClass().getSimpleName() + " with query " + query.toJson()); + logger.info(""); + logger.info("###################"); + logger.info("### Testing " + variantQueryExecutor.getClass().getSimpleName()); result = variantQueryExecutor.get(new Query(variantQuery.getQuery()), new QueryOptions(options)); - logger.info("########## Num results : " + result.getNumResults()); - logger.info("##########################"); + logger.info("### Num results : " + result.getNumResults()); + logger.info("###################"); + expected.getResults().sort(Comparator.comparing(Variant::toString)); + result.getResults().sort(Comparator.comparing(Variant::toString)); Assert.assertEquals(expected.getResults(), result.getResults()); assertThat(result, numResults(gt(0))); diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/solr/VariantSolrExternalResource.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/solr/VariantSolrExternalResource.java index 7ae1991221d..03548c65102 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/solr/VariantSolrExternalResource.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/solr/VariantSolrExternalResource.java @@ -120,6 +120,7 @@ public VariantSearchManager configure(VariantStorageEngine variantStorageEngine) VariantSearchManager variantSearchManager = variantStorageEngine.getVariantSearchManager(); variantSearchManager.setSolrManager(new SolrManager(solrClient, "localhost", "core", variantStorageEngine.getConfiguration().getSearch().getTimeout())); + variantSearchManager.setSolrClient(solrClient); return variantSearchManager; } From 4caa913179fbe37d6b97a2bca6a618e60f0b1112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 12 Apr 2024 14:15:31 +0100 Subject: [PATCH 3/4] storage: Fix rs-id filtering in sample/data endpoint #TASK-5878 --- .../storage/core/utils/CellBaseUtils.java | 1 + .../core/variant/VariantStorageEngine.java | 32 +++++++++++++++++-- .../sample/VariantSampleDataManager.java | 12 +++---- .../variant/adaptors/VariantMatchers.java | 14 ++++++++ .../annotation/DummyTestAnnotator.java | 7 ++++ .../VariantAnnotationManagerTest.java | 5 +++ .../variant/HadoopVariantStorageEngine.java | 6 ++-- .../sample/HBaseVariantSampleDataManager.java | 20 +++--------- 8 files changed, 70 insertions(+), 27 deletions(-) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/utils/CellBaseUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/utils/CellBaseUtils.java index ed0848a0e0b..f8dfc2d1f9f 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/utils/CellBaseUtils.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/utils/CellBaseUtils.java @@ -343,6 +343,7 @@ public List getVariants(List variantsStr) { List variants = new ArrayList<>(variantsStr.size()); List> response = null; try { + // FIXME: This method should call genomic/variant/snp/search response = checkNulls(cellBaseClient.getVariantClient().get(variantsStr, new QueryOptions(QueryOptions.INCLUDE, VariantField.CHROMOSOME.fieldName() + "," diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index a3ab7e539b5..018d09292f1 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -1192,8 +1192,36 @@ public VariantQueryResult getCompoundHeterozygous(String study, String return get(query, options); } - public DataResult getSampleData(String variant, String study, QueryOptions options) throws StorageEngineException { - return new VariantSampleDataManager(getDBAdaptor()).getSampleData(variant, study, options); + public DataResult getSampleData(String variantStr, String study, QueryOptions options) throws StorageEngineException { + final Variant variant = getVariant(variantStr); + return getVariantSampleDataManager().getSampleData(variant, study, options); + } + + public Variant getVariant(String variantStr) { + final Variant variant; + if (VariantQueryUtils.isVariantId(variantStr)) { + variant = new Variant(variantStr); + } else if (VariantQueryUtils.isVariantAccession(variantStr)) { + VariantQueryResult result = get(new Query(VariantQueryParam.ANNOT_XREF.key(), variantStr), + new QueryOptions(QueryOptions.INCLUDE, VariantField.ID).append(QueryOptions.LIMIT, 1).append(QueryOptions.COUNT, true)); + if (result.getNumMatches() > 1) { + throw new VariantQueryException("Not unique variant identifier '" + variantStr + "'." + + " Found " + result.getNumMatches() + " results"); + } else if (result.getNumResults() == 1) { + variant = result.first(); + } else { + throw VariantQueryException.variantNotFound(variantStr); + } + } else { + throw new VariantQueryException("Variant not valid. Variant = '" + variantStr + "'. Supported values:" + + " {chr}:{start}:{end}:{ref}:{alt}, rs{id}"); + } + variant.setId(variant.toString()); + return variant; + } + + protected VariantSampleDataManager getVariantSampleDataManager() throws StorageEngineException { + return new VariantSampleDataManager(getDBAdaptor()); } public VariantQueryResult get(Query query, QueryOptions options) { diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/sample/VariantSampleDataManager.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/sample/VariantSampleDataManager.java index d0e09c582e5..b575d1f63e5 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/sample/VariantSampleDataManager.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/sample/VariantSampleDataManager.java @@ -40,13 +40,13 @@ public VariantSampleDataManager(VariantDBAdaptor dbAdaptor) { } - public final DataResult getSampleData(String variant, String study, QueryOptions options) { + public final DataResult getSampleData(Variant variant, String study, QueryOptions options) { options = options == null ? new QueryOptions() : options; int sampleLimit = options.getInt(SAMPLE_BATCH_SIZE, SAMPLE_BATCH_SIZE_DEFAULT); return getSampleData(variant, study, options, sampleLimit); } - public final DataResult getSampleData(String variant, String study, QueryOptions options, int sampleLimit) { + public final DataResult getSampleData(Variant variant, String study, QueryOptions options, int sampleLimit) { options = options == null ? new QueryOptions() : options; @@ -77,7 +77,7 @@ public final DataResult getSampleData(String variant, String study, Que } protected DataResult getSampleData( - String variantStr, String study, QueryOptions options, List includeSamples, Set genotypes, + Variant variant, String study, QueryOptions options, List includeSamples, Set genotypes, int sampleLimit) { options = options == null ? new QueryOptions() : options; Set includeFields = VariantField.getIncludeFields(options); @@ -98,7 +98,7 @@ protected DataResult getSampleData( int queries = 0; while (true) { queries++; - Query query = new Query(VariantQueryParam.ID.key(), variantStr) + Query query = new Query(VariantQueryParam.ID.key(), variant.toString()) .append(VariantQueryParam.STUDY.key(), study) .append(VariantQueryParam.INCLUDE_GENOTYPE.key(), options.get(VariantQueryParam.INCLUDE_GENOTYPE.key())) .append(VariantQueryParam.INCLUDE_SAMPLE_DATA.key(), options.get(VariantQueryParam.INCLUDE_SAMPLE_DATA.key())) @@ -130,7 +130,7 @@ protected DataResult getSampleData( DataResult result = dbAdaptor.get(query, variantQueryOptions); if (result.getNumResults() == 0) { - throw VariantQueryException.variantNotFound(variantStr); + throw VariantQueryException.variantNotFound(variant.toString()); } dbTime += result.getTime(); Variant partialVariant = result.first(); @@ -199,7 +199,7 @@ protected DataResult getSampleData( } } - Variant variant = new Variant(variantStr); + variant = new Variant(variant.toString()); variant.setAnnotation(annotation); StudyEntry studyEntry = new StudyEntry(study); variant.addStudyEntry(studyEntry); diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantMatchers.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantMatchers.java index 7f790ffe609..11c9a6f8927 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantMatchers.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantMatchers.java @@ -139,6 +139,20 @@ public static Matcher overlaps(Variant variant) { return overlaps(new Region(variant.getChromosome(), variant.getStart(), variant.getEnd()), true); } + public static Matcher samePosition(Variant variant) { + return new TypeSafeDiagnosingMatcher() { + @Override + protected boolean matchesSafely(Variant item, Description mismatchDescription) { + return variant.sameGenomicVariant(item); + } + + @Override + public void describeTo(Description description) { + description.appendText("has same genomic region " + variant); + } + }; + } + public static Matcher overlaps(Region region) { return overlaps(region, true); } diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/DummyTestAnnotator.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/DummyTestAnnotator.java index b36e8fa90ad..8491d03b41e 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/DummyTestAnnotator.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/DummyTestAnnotator.java @@ -4,7 +4,9 @@ import org.opencb.biodata.models.variant.avro.AdditionalAttribute; import org.opencb.biodata.models.variant.avro.ConsequenceType; import org.opencb.biodata.models.variant.avro.VariantAnnotation; +import org.opencb.biodata.models.variant.avro.Xref; import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.commons.utils.CryptoUtils; import org.opencb.opencga.core.config.storage.StorageConfiguration; import org.opencb.opencga.storage.core.metadata.models.ProjectMetadata; import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator; @@ -29,6 +31,10 @@ public DummyTestAnnotator(StorageConfiguration configuration, ProjectMetadata pr fail = options.getBoolean(FAIL, false); } + static String getRs(Variant variant) { + return "rs" + variant.toString().hashCode(); + } + @Override public List annotate(List variants) throws VariantAnnotatorException { if (fail) { @@ -48,6 +54,7 @@ public List annotate(List variants) throws VariantAn ct.setExonOverlap(Collections.emptyList()); ct.setTranscriptFlags(Collections.emptyList()); a.setConsequenceTypes(Collections.singletonList(ct)); + a.setXrefs(Collections.singletonList(new Xref(getRs(v), "dbSNP"))); a.setAdditionalAttributes( Collections.singletonMap(GROUP_NAME.key(), new AdditionalAttribute(Collections.singletonMap(VARIANT_ID.key(), v.toString())))); diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/VariantAnnotationManagerTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/VariantAnnotationManagerTest.java index dc2721882fa..00544a3d715 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/VariantAnnotationManagerTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/annotation/VariantAnnotationManagerTest.java @@ -4,6 +4,7 @@ import org.apache.commons.lang.StringUtils; import org.junit.Assume; import org.junit.Test; +import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.VariantAnnotation; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -262,6 +263,10 @@ public void testQueries(VariantStorageEngine variantStorageEngine) throws Storag assertThat(annotation.getConsequenceTypes(), VariantMatchers.isEmpty()); } + for (Variant variant : variantStorageEngine) { + Variant thisVariant = variantStorageEngine.getVariant(DummyTestAnnotator.getRs(variant)); + assertThat(thisVariant, VariantMatchers.samePosition(variant)); + } // Get annotations from a deleted snapshot thrown.expectMessage("Variant Annotation snapshot \"v1\" not found!"); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index 917b679cf1c..2f0f6143b6b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -26,7 +26,6 @@ import org.apache.hadoop.hbase.client.HBaseAdmin; import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.util.StopWatch; -import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.VariantType; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.ObjectMap; @@ -55,6 +54,7 @@ import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryException; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator; +import org.opencb.opencga.storage.core.variant.adaptors.sample.VariantSampleDataManager; import org.opencb.opencga.storage.core.variant.annotation.VariantAnnotationManager; import org.opencb.opencga.storage.core.variant.annotation.annotators.VariantAnnotator; import org.opencb.opencga.storage.core.variant.io.VariantExporter; @@ -1164,8 +1164,8 @@ public VariantStorageMetadataManager getMetadataManager() throws StorageEngineEx } @Override - public DataResult getSampleData(String variant, String study, QueryOptions options) throws StorageEngineException { - return new HBaseVariantSampleDataManager(getDBAdaptor(), getCellBaseUtils()).getSampleData(variant, study, options); + protected VariantSampleDataManager getVariantSampleDataManager() throws StorageEngineException { + return new HBaseVariantSampleDataManager(getDBAdaptor()); } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java index 588833cfb6e..a96c57e9a45 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java @@ -15,7 +15,6 @@ import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; -import org.opencb.opencga.storage.core.utils.CellBaseUtils; import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; @@ -26,8 +25,8 @@ import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.PhoenixHelper; -import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixKeyFactory; +import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; import org.opencb.opencga.storage.hadoop.variant.converters.HBaseVariantConverterConfiguration; import org.opencb.opencga.storage.hadoop.variant.converters.VariantRow; import org.opencb.opencga.storage.hadoop.variant.converters.annotation.HBaseToVariantAnnotationConverter; @@ -45,31 +44,20 @@ public class HBaseVariantSampleDataManager extends VariantSampleDataManager { private final VariantHadoopDBAdaptor dbAdaptor; private final VariantStorageMetadataManager metadataManager; - private final CellBaseUtils cellBaseUtils; - - public HBaseVariantSampleDataManager(VariantHadoopDBAdaptor dbAdaptor, CellBaseUtils cellBaseUtils) { + public HBaseVariantSampleDataManager(VariantHadoopDBAdaptor dbAdaptor) { super(dbAdaptor); this.dbAdaptor = dbAdaptor; metadataManager = dbAdaptor.getMetadataManager(); - this.cellBaseUtils = cellBaseUtils; } @Override - protected DataResult getSampleData(String variantStr, String study, QueryOptions options, + protected DataResult getSampleData(Variant variant, String study, QueryOptions options, List includeSamples, Set genotypes, int sampleLimit) { StopWatch stopWatch = StopWatch.createStarted(); Set includeFields = VariantField.getIncludeFields(options); - final Variant variant; - if (VariantQueryUtils.isVariantId(variantStr)) { - variant = new Variant(variantStr); - } else { - variant = cellBaseUtils.getVariant(variantStr); - } - variant.setId(variant.toString()); - int studyId = metadataManager.getStudyId(study); boolean includeNoneSamples = VariantQueryUtils.isNoneOrEmpty(includeSamples); @@ -172,7 +160,7 @@ protected DataResult getSampleData(String variantStr, String study, Que Result result = table.get(get); if (result == null || result.isEmpty()) { - throw VariantQueryException.variantNotFound(variantStr); + throw VariantQueryException.variantNotFound(variant.toString()); } // Walk row VariantRow.walker(result) From 86f68e660014cda5f0c324462a0dfd37c3bd0040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Fri, 12 Apr 2024 15:05:42 +0100 Subject: [PATCH 4/4] storage: Fix VariantQueryParam type. #TASK-5878 --- .../storage/core/variant/adaptors/VariantQueryParam.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java index c10fd5def09..c801f4448ff 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/adaptors/VariantQueryParam.java @@ -121,7 +121,7 @@ public final class VariantQueryParam implements QueryParam { public static final String INCLUDE_SAMPLE_ID_DESCR = "Include sampleId on each result"; - public static final VariantQueryParam INCLUDE_SAMPLE_ID = new VariantQueryParam("includeSampleId", TEXT_ARRAY, INCLUDE_SAMPLE_ID_DESCR); + public static final VariantQueryParam INCLUDE_SAMPLE_ID = new VariantQueryParam("includeSampleId", BOOLEAN, INCLUDE_SAMPLE_ID_DESCR); public static final String SAMPLE_METADATA_DESCR = "Return the samples metadata group by study. Sample names will appear in the same order as their corresponding genotypes.";