From a37f1b42a3ea0539a60e4162ea9a8dac287b57b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jacobo=20Coll=20Morag=C3=B3n?= Date: Tue, 19 Mar 2024 17:16:04 +0000 Subject: [PATCH] storage: Accept non-normalized variants when filtering by ID or XREF. #TASK-5877 --- .../variant/query/VariantQueryParser.java | 26 +++++++++++++++++++ .../core/variant/query/VariantQueryUtils.java | 8 ++++++ .../VariantDBAdaptorMultiFileTest.java | 10 +++++++ .../adaptors/VariantDBAdaptorTest.java | 25 +++++++++++++----- .../sample/HBaseVariantSampleDataManager.java | 4 +-- 5 files changed, 65 insertions(+), 8 deletions(-) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java index 1564ad0fabf..6481d85c53c 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryParser.java @@ -8,7 +8,9 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.avro.ClinicalSignificance; import org.opencb.biodata.models.variant.avro.VariantType; +import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.models.variant.metadata.VariantFileHeaderComplexLine; +import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; import org.opencb.commons.datastore.core.QueryParam; @@ -858,8 +860,17 @@ public static ParsedVariantQuery.VariantQueryXref parseXrefs(Query query) { } } } + } + if (!xrefs.getVariants().isEmpty()) { + List normalizedVariants = normalizeVariants(xrefs.getVariants()); + for (Variant normalizedVariant : normalizedVariants) { + if (!xrefs.getVariants().contains(normalizedVariant)) { + xrefs.getVariants().add(normalizedVariant); + } + } } + if (isValidParam(query, ANNOT_GENE_ROLE_IN_CANER_GENES)) { List thisGenes = query.getAsStringList(ANNOT_GENE_ROLE_IN_CANER_GENES.key()); if (thisGenes.size() != 1 || !thisGenes.get(0).equals(NONE)) { @@ -886,6 +897,21 @@ public static ParsedVariantQuery.VariantQueryXref parseXrefs(Query query) { return xrefs; } + public static Variant normalizeVariant(Variant variant) { + return normalizeVariants(Collections.singletonList(variant)).get(0); + } + + public static List normalizeVariants(List variants) { + VariantNormalizer variantNormalizer = new VariantNormalizer(); + List normalizedVariants; + try { + normalizedVariants = variantNormalizer.normalize(variants, false); + } catch (NonStandardCompliantSampleField e) { + throw VariantQueryException.internalException(e); + } + return normalizedVariants; + } + public static ParsedQuery> parseFreqFilter(Query query, QueryParam queryParam) { return VariantQueryUtils.splitValue(query, queryParam) .map(VariantQueryUtils::parseKeyOpValue) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java index 114379c77a9..75703f083d3 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/query/VariantQueryUtils.java @@ -552,6 +552,14 @@ public static Variant toVariant(String value) { return variant; } + public static Variant toVariant(String variantStr, boolean normalize) { + Variant variant = toVariant(variantStr); + if (normalize && variant != null) { + return VariantQueryParser.normalizeVariant(variant); + } + return variant; + } + public static String[] splitStudyResource(String value) { int idx = value.lastIndexOf(STUDY_RESOURCE_SEPARATOR); if (idx <= 0 || idx == value.length() - 1) { diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorMultiFileTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorMultiFileTest.java index 9a95ed053e5..8ad47c67c2c 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorMultiFileTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorMultiFileTest.java @@ -1333,6 +1333,16 @@ public void testSampleData() throws Exception { assertEquals(0, variant.getStudies().get(0).getFiles().size()); } + @Test + public void testSampleDataUnnormalized() throws Exception { + // Check unnormalized queries + Variant variant = variantStorageEngine.getSampleData("1:10352:T:TA", study1, new QueryOptions()).first(); + assertEquals("1:10353:-:A", variant.toString()); + System.out.println("variant = " + variant.toJson()); + assertNotNull(variant.getStudies().get(0).getStats(DEFAULT_COHORT)); + assertEquals(4, variant.getStudies().get(0).getSamples().size()); + assertEquals(4, variant.getStudies().get(0).getFiles().size()); + } @Test public void testCount() throws StorageEngineException { diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java index 1f305574a64..b5bd4b841c3 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/adaptors/VariantDBAdaptorTest.java @@ -30,9 +30,7 @@ import org.opencb.biodata.models.variant.Variant; import org.opencb.biodata.models.variant.VariantFileMetadata; import org.opencb.biodata.models.variant.avro.*; -import org.opencb.biodata.models.variant.exceptions.NonStandardCompliantSampleField; import org.opencb.biodata.models.variant.stats.VariantStats; -import org.opencb.biodata.tools.variant.VariantNormalizer; import org.opencb.commons.datastore.core.DataResult; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; @@ -531,11 +529,14 @@ public long filterPopulation(DataResult queryResult, Predicate public void testGetAllVariants_variantId() { int i = 0; List variants = new ArrayList<>(); + Map normalizedVariants = new HashMap<>(); for (Variant variant : allVariants.getResults()) { - if (i++ % 10 == 0) { - if (!variant.isSymbolic()) { - variants.add(variant); - } + if ((i++ % 10) == 0) { + variants.add(variant); + } + OriginalCall call = variant.getStudies().get(0).getFiles().get(0).getCall(); + if (call != null) { + normalizedVariants.put(variant.toString(), call.getVariantId()); } } List result = query(new Query(ID.key(), variants), new QueryOptions()).getResults(); @@ -554,6 +555,18 @@ public void testGetAllVariants_variantId() { } } assertEquals(expectedList, actualList); + + normalizedVariants.forEach((key, value) -> { + System.out.println(key + " = " + value); + }); + List resultNormalized = query(new Query(ID.key(), normalizedVariants.values()).append(INCLUDE_FILE.key(), ALL), new QueryOptions()).getResults(); + assertEquals(normalizedVariants.size(), resultNormalized.size()); + assertTrue(!resultNormalized.isEmpty()); + for (Variant variant : resultNormalized) { + String expected = normalizedVariants.get(variant.toString()); + String actual = variant.getStudies().get(0).getFiles().get(0).getCall().getVariantId(); + assertEquals(expected, actual); + } } @Test diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java index 588833cfb6e..cbe7e0880c2 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/adaptors/sample/HBaseVariantSampleDataManager.java @@ -26,8 +26,8 @@ import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.PhoenixHelper; -import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixKeyFactory; +import org.opencb.opencga.storage.hadoop.variant.adaptors.phoenix.VariantPhoenixSchema; import org.opencb.opencga.storage.hadoop.variant.converters.HBaseVariantConverterConfiguration; import org.opencb.opencga.storage.hadoop.variant.converters.VariantRow; import org.opencb.opencga.storage.hadoop.variant.converters.annotation.HBaseToVariantAnnotationConverter; @@ -64,7 +64,7 @@ protected DataResult getSampleData(String variantStr, String study, Que final Variant variant; if (VariantQueryUtils.isVariantId(variantStr)) { - variant = new Variant(variantStr); + variant = VariantQueryUtils.toVariant(variantStr, true); } else { variant = cellBaseUtils.getVariant(variantStr); }