diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/knockout/KnockoutAnalysis.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/knockout/KnockoutAnalysis.java index 34b337731a5..069d7ae46a7 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/knockout/KnockoutAnalysis.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/knockout/KnockoutAnalysis.java @@ -202,12 +202,12 @@ protected void run() throws Exception { if (family == null || StringUtils.isEmpty(family.getId())) { continue; } - List> trios = variantStorageManager.getTriosFromFamily(getStudy(), family, true, getToken()); - for (List trio : trios) { - String child = trio.get(2); + List trios = variantStorageManager.getTriosFromFamily(getStudy(), family, true, getToken()); + for (Trio trio : trios) { + String child = trio.getChild(); if (analysisParams.getSample().contains(child)) { - String father = trio.get(0); - String mother = trio.get(1); + String father = trio.getFather(); + String mother = trio.getMother(); triosMap.put(child, new Trio(family.getId(), "-".equals(father) ? null : father, "-".equals(mother) ? null : mother, diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java index 2d08fb77566..7a588c900f5 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtils.java @@ -1299,10 +1299,10 @@ public Integer getReleaseFilter(Query query, String sessionId) throws CatalogExc return release; } - public List> getTriosFromFamily( + public List getTriosFromFamily( String studyFqn, Family family, VariantStorageMetadataManager metadataManager, boolean skipIncompleteFamily, String sessionId) throws StorageEngineException, CatalogException { - List> trios = getTrios(studyFqn, metadataManager, family.getMembers(), sessionId); + List trios = getTrios(studyFqn, metadataManager, family.getMembers(), sessionId); if (trios.size() == 0) { if (skipIncompleteFamily) { logger.debug("Skip family '" + family.getId() + "'. "); @@ -1313,7 +1313,7 @@ public List> getTriosFromFamily( return trios; } - public List> getTriosFromSamples( + public List getTriosFromSamples( String studyFqn, VariantStorageMetadataManager metadataManager, Collection sampleIds, String token) throws CatalogException { OpenCGAResult individualResult = catalogManager.getIndividualManager() @@ -1330,12 +1330,12 @@ public List> getTriosFromSamples( return getTrios(studyFqn, metadataManager, individualResult.getResults(), token); } - public List> getTrios( + public List getTrios( String studyFqn, VariantStorageMetadataManager metadataManager, List membersList, String sessionId) throws CatalogException { int studyId = metadataManager.getStudyId(studyFqn); Map membersMap = membersList.stream().collect(Collectors.toMap(Individual::getUid, i -> i)); - List> trios = new LinkedList<>(); + List trios = new LinkedList<>(); for (Individual individual : membersList) { String fatherSample = null; String motherSample = null; @@ -1402,10 +1402,7 @@ public List> getTrios( // Allow one missing parent if (childSample != null && (fatherSample != null || motherSample != null)) { - trios.add(Arrays.asList( - fatherSample == null ? "-" : fatherSample, - motherSample == null ? "-" : motherSample, - childSample)); + trios.add(new Trio(fatherSample, motherSample, childSample)); } } return trios; diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java index 48b5cf87114..076c950cdd7 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java @@ -76,10 +76,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.VariantMetadataFactory; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; -import org.opencb.opencga.storage.core.metadata.models.ProjectMetadata; -import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; -import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; -import org.opencb.opencga.storage.core.metadata.models.VariantScoreMetadata; +import org.opencb.opencga.storage.core.metadata.models.*; import org.opencb.opencga.storage.core.utils.CellBaseUtils; import org.opencb.opencga.storage.core.variant.BeaconResponse; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; @@ -397,7 +394,7 @@ public void sampleIndexAnnotate(String study, List samples, ObjectMap pa }); } - public DataResult> familyIndexUpdate(String study, + public DataResult familyIndexUpdate(String study, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperation(VariantFamilyIndexOperationTool.ID, study, params, token, engine -> { @@ -405,11 +402,11 @@ public DataResult> familyIndexUpdate(String study, }); } - public DataResult> familyIndex(String study, List familiesStr, boolean skipIncompleteFamilies, + public DataResult familyIndex(String study, List familiesStr, boolean skipIncompleteFamilies, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperation(VariantFamilyIndexOperationTool.ID, study, params, token, engine -> { - List> trios = new LinkedList<>(); + List trios = new LinkedList<>(); List events = new LinkedList<>(); VariantStorageMetadataManager metadataManager = engine.getMetadataManager(); VariantCatalogQueryUtils catalogUtils = new VariantCatalogQueryUtils(catalogManager); @@ -425,9 +422,9 @@ public DataResult> familyIndex(String study, List familiesS trios.addAll(catalogUtils.getTriosFromFamily(study, family, metadataManager, skipIncompleteFamilies, token)); } } - DataResult> dataResult = engine.familyIndex(study, trios, params); + DataResult dataResult = engine.familyIndex(study, trios, params); getSynchronizer(engine).synchronizeCatalogSamplesFromStorage(study, trios.stream() - .flatMap(Collection::stream) + .flatMap(t->t.toList().stream()) .collect(Collectors.toList()), token); return dataResult; }); @@ -439,7 +436,7 @@ private CatalogStorageMetadataSynchronizer getSynchronizer(VariantStorageEngine return synchronizer; } - public DataResult> familyIndexBySamples(String study, Collection samples, ObjectMap params, String token) + public DataResult familyIndexBySamples(String study, Collection samples, ObjectMap params, String token) throws CatalogException, StorageEngineException { return secureOperation(VariantFamilyIndexOperationTool.ID, study, params, token, engine -> { Collection thisSamples = samples; @@ -447,17 +444,16 @@ public DataResult> familyIndexBySamples(String study, Collection> trios = catalogUtils.getTriosFromSamples(study, engine.getMetadataManager(), thisSamples, token); - - DataResult> dataResult = engine.familyIndex(study, trios, params); + List trios = catalogUtils.getTriosFromSamples(study, engine.getMetadataManager(), thisSamples, token); + DataResult dataResult = engine.familyIndex(study, trios, params); getSynchronizer(engine).synchronizeCatalogSamplesFromStorage(study, trios.stream() - .flatMap(Collection::stream) + .flatMap(t -> t.toList().stream()) .collect(Collectors.toList()), token); return dataResult; }); } - public List> getTriosFromFamily(String study, Family family, boolean skipIncompleteFamilies, String token) + public List getTriosFromFamily(String study, Family family, boolean skipIncompleteFamilies, String token) throws CatalogException, StorageEngineException { VariantStorageEngine variantStorageEngine = getVariantStorageEngine(study, token); return catalogUtils.getTriosFromFamily(study, family, variantStorageEngine.getMetadataManager(), skipIncompleteFamilies, token); diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantFamilyIndexOperationTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantFamilyIndexOperationTool.java index ed774beca11..c7258c79f74 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantFamilyIndexOperationTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantFamilyIndexOperationTool.java @@ -24,6 +24,7 @@ import org.opencb.opencga.core.models.operations.variant.VariantFamilyIndexParams; import org.opencb.opencga.core.tools.annotations.Tool; import org.opencb.opencga.core.tools.annotations.ToolParams; +import org.opencb.opencga.storage.core.metadata.models.Trio; import java.util.Collections; import java.util.List; @@ -67,7 +68,7 @@ protected void check() throws Exception { @Override protected void run() throws Exception { step(() -> { - DataResult> trios; + DataResult trios; if (variantFamilyIndexParams.isUpdateIndex()) { trios = variantStorageManager.familyIndexUpdate(study, params, token); } else { diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantSecondarySampleIndexOperationTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantSecondarySampleIndexOperationTool.java index 1954d7eb790..d4dd7ab8a89 100644 --- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantSecondarySampleIndexOperationTool.java +++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantSecondarySampleIndexOperationTool.java @@ -24,6 +24,7 @@ import org.opencb.opencga.core.models.operations.variant.VariantSecondarySampleIndexParams; import org.opencb.opencga.core.tools.annotations.Tool; import org.opencb.opencga.core.tools.annotations.ToolParams; +import org.opencb.opencga.storage.core.metadata.models.Trio; import java.util.ArrayList; import java.util.List; @@ -80,7 +81,7 @@ protected void run() throws Exception { } if (sampleIndexParams.isFamilyIndex()) { step("familyIndex", () -> { - DataResult> result = variantStorageManager.familyIndexBySamples(study, sampleIndexParams.getSample(), params, + DataResult result = variantStorageManager.familyIndexBySamples(study, sampleIndexParams.getSample(), params, getToken()); if (result.getEvents() != null) { for (Event event : result.getEvents()) { diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtilsTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtilsTest.java index 58aa48f7828..b38101ef933 100644 --- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtilsTest.java +++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantCatalogQueryUtilsTest.java @@ -60,6 +60,7 @@ import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; import org.opencb.opencga.storage.core.metadata.models.TaskMetadata; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.utils.CellBaseUtils; import org.opencb.opencga.storage.core.variant.adaptors.VariantField; import org.opencb.opencga.storage.core.variant.adaptors.VariantQuery; @@ -868,9 +869,9 @@ public void getTriosFromFamily() throws Exception { sampleMetadata -> sampleMetadata.setIndexStatus(TaskMetadata.Status.READY)); } - List> trios = queryUtils.getTriosFromFamily("s1", f1, metadataManager, true, sessionId); + List trios = queryUtils.getTriosFromFamily("s1", f1, metadataManager, true, sessionId); // System.out.println("trios = " + trios); - assertEquals(Arrays.asList(Arrays.asList("sample1", "sample2", "sample3"), Arrays.asList("sample1", "sample2", "sample4")), trios); + assertEquals(Arrays.asList(new Trio("sample1", "sample2", "sample3"), new Trio("sample1", "sample2", "sample4")), trios); } diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java index 13f07647b84..bc82cf2e3d8 100644 --- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java +++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java @@ -44,9 +44,7 @@ import org.opencb.opencga.core.models.common.IndexStatus; import org.opencb.opencga.core.models.family.Family; import org.opencb.opencga.core.models.file.File; -import org.opencb.opencga.core.models.individual.Individual; -import org.opencb.opencga.core.models.individual.IndividualInternal; -import org.opencb.opencga.core.models.individual.Location; +import org.opencb.opencga.core.models.individual.*; import org.opencb.opencga.core.models.job.Job; import org.opencb.opencga.core.models.operations.variant.VariantAnnotationIndexParams; import org.opencb.opencga.core.models.operations.variant.VariantSecondaryAnnotationIndexParams; @@ -151,6 +149,16 @@ public void setUp() throws Throwable { @After public void tearDown() { if (hadoopExternalResource != null) { + + try { + VariantStorageEngine engine = opencga.getStorageEngineFactory().getVariantStorageEngine(storageEngine, DB_NAME); + if (storageEngine.equals(HadoopVariantStorageEngine.STORAGE_ENGINE_ID)) { + VariantHbaseTestUtils.printVariants(((VariantHadoopDBAdaptor) engine.getDBAdaptor()), Paths.get(opencga.createTmpOutdir("_hbase_print_variants_AFTER")).toUri()); + } + } catch (Exception ignore) { + ignore.printStackTrace(); + } + hadoopExternalResource.after(); hadoopExternalResource = null; } @@ -249,7 +257,7 @@ private void loadDataset() throws Throwable { Collections.emptyList(), false, 0, Collections.emptyList(), Collections.emptyList(), Collections.emptyList(), IndividualInternal.init(), Collections.emptyMap()).setFather(individuals.get(0)).setMother(individuals.get(1)), Collections.singletonList(daughter), new QueryOptions(ParamConstants.INCLUDE_RESULT_PARAM, true), token).first()); catalogManager.getFamilyManager().create( STUDY, - new Family("f1", "f1", Collections.singletonList(phenotype), Collections.singletonList(disorder), null, null, 3, null, null), + new Family("f1", "f1", Collections.singletonList(phenotype), Collections.singletonList(disorder), null, null, 4, null, null), individuals.stream().map(Individual::getId).collect(Collectors.toList()), new QueryOptions(), token); @@ -401,6 +409,50 @@ public void testVariantSecondarySampleIndex() throws Exception { } } + @Test + public void testVariantSecondarySampleIndexPartialFamily() throws Exception { + Assume.assumeThat(storageEngine, anyOf( +// is(DummyVariantStorageEngine.STORAGE_ENGINE_ID), + is(HadoopVariantStorageEngine.STORAGE_ENGINE_ID) + )); + for (String sample : samples) { + SampleInternalVariantSecondarySampleIndex sampleIndex = catalogManager.getSampleManager().get(STUDY, sample, new QueryOptions(), token).first().getInternal().getVariant().getSecondarySampleIndex(); + assertEquals(sample, IndexStatus.READY, sampleIndex.getStatus().getId()); + assertEquals(sample, IndexStatus.NONE, sampleIndex.getFamilyStatus().getId()); + assertEquals(sample, 1, sampleIndex.getVersion().intValue()); + } + + Phenotype phenotype = new Phenotype("phenotype", "phenotype", ""); + Disorder disorder = new Disorder("disorder", "disorder", "", "", Collections.singletonList(phenotype), Collections.emptyMap()); + + catalogManager.getFamilyManager().delete(STUDY, Collections.singletonList("f1"), null, token); + catalogManager.getIndividualManager().update(STUDY, daughter, new IndividualUpdateParams() + .setMother(new IndividualReferenceParam(null, null)), null, token); + catalogManager.getFamilyManager().create( + STUDY, + new Family("f2", "f2", Collections.singletonList(phenotype), Collections.singletonList(disorder), null, null, 2, null, null), + Arrays.asList(father, daughter), new QueryOptions(), + token); + + // Run family index. The family index status should be READY on offspring + toolRunner.execute(VariantSecondarySampleIndexOperationTool.class, STUDY, + new VariantSecondarySampleIndexParams() + .setFamilyIndex(true) + .setSample(Arrays.asList(daughter)), + Paths.get(opencga.createTmpOutdir()), "index", token); + + for (String sample : samples) { + SampleInternalVariantSecondarySampleIndex sampleIndex = catalogManager.getSampleManager().get(STUDY, sample, new QueryOptions(), token).first().getInternal().getVariant().getSecondarySampleIndex(); + assertEquals(sample, IndexStatus.READY, sampleIndex.getStatus().getId()); + if (sample.equals(daughter)) { + assertEquals(sample, IndexStatus.READY, sampleIndex.getFamilyStatus().getId()); + } else { + assertEquals(sample, IndexStatus.NONE, sampleIndex.getFamilyStatus().getId()); + } + assertEquals(sample, 1, sampleIndex.getVersion().intValue()); + } + } + @Test public void testGwasIndex() throws Exception { // Variant scores can not be loaded in mongodb nor dummy diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java index 2c12a0021e0..82c85135b74 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/metadata/models/Trio.java @@ -6,21 +6,42 @@ import java.util.List; import java.util.Objects; +/** + * This class represents a family trio. + * + * - Father and mother can be null + * - Child cannot be null + * - All samples must be unique + * - All samples must be different to NA (dash) + * + */ public class Trio { + public static final String NA = "-"; private final String id; private final String father; private final String mother; private final String child; + public Trio(String trio) { + String[] split = trio.split(","); + if (split.length != 3) { + throw new IllegalArgumentException("Expected three samples in trio '" + trio + "'"); + } + this.id = null; + this.child = split[0]; + this.father = NA.equals(split[1]) ? null : split[1]; + this.mother = NA.equals(split[2]) ? null : split[2]; + } + public Trio(List trio) { this(null, trio); } public Trio(String id, List trio) { this.id = id; - this.father = trio.get(1); - this.mother = trio.get(2); this.child = trio.get(0); + this.father = NA.equals(trio.get(1)) ? null : trio.get(1); + this.mother = NA.equals(trio.get(2)) ? null : trio.get(2); } public Trio(String father, String mother, String child) { @@ -29,8 +50,8 @@ public Trio(String father, String mother, String child) { public Trio(String id, String father, String mother, String child) { this.id = id; - this.father = father; - this.mother = mother; + this.father = NA.equals(father) ? null : father; + this.mother = NA.equals(mother) ? null : mother; this.child = child; } @@ -50,6 +71,11 @@ public String getChild() { return child; } + /** + * Returns a list with the non-null samples in the trio. + * + * @return List of samples + */ public List toList() { ArrayList list = new ArrayList<>(3); list.add(getChild()); @@ -82,6 +108,31 @@ public int hashCode() { return Objects.hash(id, father, mother, child); } + /** + * Serialize the trio into a string contain the three samples separated by commas. + * order: child, father, mother. + * If the father or mother are null, they will be replaced by {@link #NA}. + * + * Can be deserialized using {@link #Trio(String)}. + * + * @return String + */ + public String serialize() { + ArrayList list = new ArrayList<>(3); + list.add(getChild()); + if (getFather() == null) { + list.add(NA); + } else { + list.add(getFather()); + } + if (getMother() == null) { + list.add(NA); + } else { + list.add(getMother()); + } + return Strings.join(list, ','); + } + @Override public String toString() { return Strings.join(toList(), ','); diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index cd6202ebf35..8779643964b 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -582,45 +582,52 @@ public void sampleIndexAnnotate(String study, List samples, ObjectMap op * The Family Index is used alongside with the SampleIndex to speed up queries involving children and parents. * * @param study Study - * @param trios List of trios "father, mother, child". - * Missing parents in trios are specified with "-", - * If a family has two children, two trios should be defined. + * @param trios Trios to index. If a family has two children, two trios should be defined. * @param options Other options * @throws StorageEngineException in an error occurs * @return List of trios used to index. Empty if there was nothing to do. */ - public DataResult> familyIndex(String study, List> trios, ObjectMap options) throws StorageEngineException { + public DataResult familyIndex(String study, List trios, ObjectMap options) throws StorageEngineException { throw new UnsupportedOperationException("Unsupported familyIndex"); } - public DataResult> familyIndexUpdate(String study, ObjectMap options) throws StorageEngineException { + /** + * Update the family index. + * The Family Index is used alongside with the SampleIndex to speed up queries involving children and parents. + * + * @param study Study + * @param options Other options + * @throws StorageEngineException in an error occurs + * @return List of trios used to index. Empty if there was nothing to do. + */ + public DataResult familyIndexUpdate(String study, ObjectMap options) throws StorageEngineException { StudyMetadata studyMetadata = getMetadataManager().getStudyMetadata(study); int studyId = studyMetadata.getId(); int version = studyMetadata.getSampleIndexConfigurationLatest().getVersion(); - List> trios = new LinkedList<>(); + List trios = new LinkedList<>(); for (SampleMetadata sampleMetadata : getMetadataManager().sampleMetadataIterable(studyId)) { if (sampleMetadata.isFamilyIndexDefined()) { if (sampleMetadata.getFamilyIndexStatus(version) != TaskMetadata.Status.READY) { // This sample's family index needs to be updated String father; if (sampleMetadata.getFather() == null) { - father = "-"; + father = null; } else { father = getMetadataManager().getSampleName(studyId, sampleMetadata.getFather()); } String mother; if (sampleMetadata.getMother() == null) { - mother = "-"; + mother = null; } else { mother = getMetadataManager().getSampleName(studyId, sampleMetadata.getMother()); } - trios.add(Arrays.asList(father, mother, sampleMetadata.getName())); + trios.add(new Trio(father, mother, sampleMetadata.getName())); } } } if (trios.isEmpty()) { logger.info("Nothing to do!"); - return new DataResult>().setEvents(Collections.singletonList(new Event(Event.Type.INFO, "Nothing to do"))); + return new DataResult().setEvents(Collections.singletonList(new Event(Event.Type.INFO, "Nothing to do"))); } else { return familyIndex(study, trios, options); } diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java index c81c7d21831..55866e24160 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java @@ -28,6 +28,7 @@ import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; import org.opencb.opencga.storage.core.metadata.models.TaskMetadata; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.adaptors.VariantDBAdaptor; import org.opencb.opencga.storage.core.variant.io.VariantImporter; @@ -102,7 +103,7 @@ public DummyVariantStoragePipeline newStoragePipeline(boolean connected) throws } @Override - public DataResult> familyIndex(String study, List> trios, ObjectMap options) throws StorageEngineException { + public DataResult familyIndex(String study, List trios, ObjectMap options) throws StorageEngineException { logger.info("Running family index!"); VariantStorageMetadataManager metadataManager = getMetadataManager(); StudyMetadata studyMetadata = metadataManager.getStudyMetadata(study); @@ -121,7 +122,7 @@ public DataResult> familyIndex(String study, List> tri } }); } - return new DataResult>().setResults(trios); + return new DataResult().setResults(trios); } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index c3afc1f5040..917b679cf1c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -46,10 +46,7 @@ import org.opencb.opencga.storage.core.io.managers.IOConnectorProvider; import org.opencb.opencga.storage.core.metadata.VariantMetadataFactory; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; -import org.opencb.opencga.storage.core.metadata.models.FileMetadata; -import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; -import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; -import org.opencb.opencga.storage.core.metadata.models.TaskMetadata; +import org.opencb.opencga.storage.core.metadata.models.*; import org.opencb.opencga.storage.core.utils.CellBaseUtils; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.VariantStorageOptions; @@ -400,7 +397,7 @@ public void sampleIndexAnnotate(String study, List samples, ObjectMap op @Override - public DataResult> familyIndex(String study, List> trios, ObjectMap options) throws StorageEngineException { + public DataResult familyIndex(String study, List trios, ObjectMap options) throws StorageEngineException { options = getMergedOptions(options); return new FamilyIndexLoader(getSampleIndexDBAdaptor(), getDBAdaptor(), getMRExecutor()) .load(study, trios, options); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexDriver.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexDriver.java index 36f17f5f3a1..ada52270419 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexDriver.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexDriver.java @@ -17,6 +17,7 @@ import org.opencb.opencga.storage.core.metadata.models.CohortMetadata; import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; import org.opencb.opencga.storage.core.metadata.models.TaskMetadata; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.variant.adaptors.VariantQueryParam; import org.opencb.opencga.storage.hadoop.variant.AbstractVariantsTableDriver; import org.opencb.opencga.storage.hadoop.variant.GenomeHelper; @@ -98,28 +99,28 @@ protected void parseAndValidateParameters() throws IOException { if (StringUtils.isNotEmpty(triosStr)) { String[] trios = triosStr.split(";"); List trioList = new ArrayList<>(3); - for (String trio : trios) { - for (String sample : trio.split(",")) { - Integer sampleId; - if (sample.equals("-")) { - sampleId = MISSING_SAMPLE; - } else { - sampleId = metadataManager.getSampleId(getStudyId(), sample); - if (sampleId == null) { - throw new IllegalArgumentException("Sample '" + sample + "' not found."); - } - } - trioList.add(sampleId); + for (String trioString : trios) { + Trio trio = new Trio(Arrays.asList(trioString.split(","))); + + if (trio.getFather() == null) { + trioList.add(MISSING_SAMPLE); + } else { + trioList.add(metadataManager.getSampleIdOrFail(getStudyId(), trio.getFather())); } - if (trioList.size() != 3) { - throw new IllegalArgumentException("Found trio with " + trioList.size() + " members, instead of 3: " + trioList); + if (trio.getMother() == null) { + trioList.add(MISSING_SAMPLE); + } else { + trioList.add(metadataManager.getSampleIdOrFail(getStudyId(), trio.getMother())); } - SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(getStudyId(), trioList.get(2)); + int childId = metadataManager.getSampleIdOrFail(getStudyId(), trio.getChild()); + trioList.add(childId); + + SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(getStudyId(), childId); if (!overwrite && sampleMetadata.getFamilyIndexStatus(sampleIndexVersion) == TaskMetadata.Status.READY) { LOGGER.info("Skip sample " + sampleMetadata.getName() + ". Already precomputed!"); } else { sampleIds.addAll(trioList); - LOGGER.info("Trio: " + trio + " -> " + trioList); + LOGGER.info("Trio: " + trioString + " -> " + trioList); } trioList.clear(); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexLoader.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexLoader.java index 244945ce71e..17083348c45 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexLoader.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexLoader.java @@ -9,6 +9,7 @@ import org.opencb.opencga.storage.core.metadata.models.CohortMetadata; import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; import org.opencb.opencga.storage.core.metadata.models.TaskMetadata; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageOptions; import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor; @@ -17,7 +18,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.util.ArrayList; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -38,9 +38,9 @@ public FamilyIndexLoader(SampleIndexDBAdaptor sampleIndexDBAdaptor, VariantHadoo this.mrExecutor = mrExecutor; } - public DataResult> load(String study, List> trios, ObjectMap options) throws StorageEngineException { + public DataResult load(String study, List trios, ObjectMap options) throws StorageEngineException { trios = new LinkedList<>(trios); - DataResult> dr = new DataResult<>(); + DataResult dr = new DataResult<>(); dr.setResults(trios); dr.setEvents(new LinkedList<>()); @@ -55,35 +55,34 @@ public DataResult> load(String study, List> trios, Obj options.put(FamilyIndexDriver.SAMPLE_INDEX_VERSION, version); options.put(FamilyIndexDriver.OUTPUT, sampleIndexDBAdaptor.getSampleIndexTableName(studyId, version)); - Iterator> iterator = trios.iterator(); + Iterator iterator = trios.iterator(); while (iterator.hasNext()) { - List trioIds = new ArrayList<>(3); - List trio = iterator.next(); - for (String sample : trio) { - Integer sampleId; - if (sample.equals("-")) { - sampleId = -1; - } else { - sampleId = metadataManager.getSampleId(studyId, sample); - if (sampleId == null) { - throw new IllegalArgumentException("Sample '" + sample + "' not found."); - } - } - trioIds.add(sampleId); + Trio trio = iterator.next(); + + final Integer fatherId; + final Integer motherId; + final Integer childId; + + childId = metadataManager.getSampleId(studyId, trio.getChild()); + if (trio.getFather() == null) { + fatherId = -1; + } else { + fatherId = metadataManager.getSampleIdOrFail(studyId, trio.getFather()); } - if (trioIds.size() != 3) { - throw new IllegalArgumentException("Found trio with " + trioIds.size() + " members, instead of 3: " + trioIds); + if (trio.getMother() == null) { + motherId = -1; + } else { + motherId = metadataManager.getSampleIdOrFail(studyId, trio.getMother()); } - SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(studyId, trioIds.get(2)); + + SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(studyId, childId); if (!overwrite && sampleMetadata.getFamilyIndexStatus(version) == TaskMetadata.Status.READY) { String msg = "Skip sample " + sampleMetadata.getName() + ". Already precomputed!"; logger.info(msg); dr.getEvents().add(new Event(Event.Type.INFO, msg)); iterator.remove(); } else { - Integer fatherId = trioIds.get(0); boolean fatherDefined = fatherId != -1; - Integer motherId = trioIds.get(1); boolean motherDefined = motherId != -1; if (fatherDefined && !fatherId.equals(sampleMetadata.getFather()) || motherDefined && !motherId.equals(sampleMetadata.getMother())) { @@ -106,14 +105,14 @@ public DataResult> load(String study, List> trios, Obj int batchSize = options.getInt(HadoopVariantStorageOptions.SAMPLE_INDEX_FAMILY_MAX_TRIOS_PER_MR.key(), HadoopVariantStorageOptions.SAMPLE_INDEX_FAMILY_MAX_TRIOS_PER_MR.defaultValue()); - List>> batches = BatchUtils.splitBatches(trios, batchSize); + List> batches = BatchUtils.splitBatches(trios, batchSize); if (batches.size() == 1) { runBatch(study, trios, options, studyId); } else { logger.warn("Unable to run family index in one single MapReduce operation."); logger.info("Split in {} jobs of {} samples each.", batches, batches.get(0).size()); for (int i = 0; i < batches.size(); i++) { - List> batch = batches.get(i); + List batch = batches.get(i); logger.info("Running MapReduce {}/{} over {} trios", i + 1, batches, batch.size()); runBatch(study, batch, options, studyId); } @@ -122,12 +121,14 @@ public DataResult> load(String study, List> trios, Obj return dr; } - private void runBatch(String study, List> trios, ObjectMap options, int studyId) throws StorageEngineException { + private void runBatch(String study, List trios, ObjectMap options, int studyId) throws StorageEngineException { if (trios.size() < 500) { - options.put(FamilyIndexDriver.TRIOS, trios.stream().map(trio -> String.join(",", trio)).collect(Collectors.joining(";"))); + options.put(FamilyIndexDriver.TRIOS, trios.stream() + .map(Trio::serialize) + .collect(Collectors.joining(";"))); } else { CohortMetadata cohortMetadata = metadataManager.registerTemporaryCohort(study, "pendingFamilyIndexSamples", - trios.stream().map(t -> t.get(2)).collect(Collectors.toList())); + trios.stream().map(Trio::getChild).collect(Collectors.toList())); options.put(FamilyIndexDriver.TRIOS_COHORT, cohortMetadata.getName()); options.put(FamilyIndexDriver.TRIOS_COHORT_DELETE, true); @@ -137,7 +138,7 @@ private void runBatch(String study, List> trios, ObjectMap options, tableNameGenerator.getArchiveTableName(studyId), tableNameGenerator.getVariantTableName(), studyId, null, options), - "Precompute mendelian errors for " + (trios.size() == 1 ? "trio " + trios.get(0) : trios.size() + " trios")); + "Precompute mendelian errors for " + (trios.size() == 1 ? "trio " + trios.get(0).serialize() : trios.size() + " trios")); } public void postIndex(int studyId, int version) diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexTest.java index 630ddb4960d..034e676c1b4 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/family/FamilyIndexTest.java @@ -18,6 +18,7 @@ import org.opencb.opencga.core.models.operations.variant.VariantAggregateFamilyParams; import org.opencb.opencga.core.response.VariantQueryResult; import org.opencb.opencga.core.testclassification.duration.MediumTests; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; import org.opencb.opencga.storage.core.variant.VariantStorageOptions; import org.opencb.opencga.storage.core.variant.adaptors.GenotypeClass; @@ -80,12 +81,13 @@ public void before() throws Exception { runETL(variantStorageEngine, getResourceUri("variant-test-me.vcf"), outputUri, params, true, true, true); - List family = Arrays.asList(father, mother, child); + Trio family = new Trio(father, mother, child); + Trio family2 = new Trio("FATHER", "MOTHER", "PROBAND"); - variantStorageEngine.aggregateFamily(study, new VariantAggregateFamilyParams(family, false), new ObjectMap()); + variantStorageEngine.aggregateFamily(study, new VariantAggregateFamilyParams(family.toList(), false), new ObjectMap()); variantStorageEngine.familyIndex(study, Collections.singletonList(family), new ObjectMap()); - variantStorageEngine.familyIndex(study, Collections.singletonList(Arrays.asList("FATHER", "MOTHER", "PROBAND")), new ObjectMap()); + variantStorageEngine.familyIndex(study, Collections.singletonList(family2), new ObjectMap()); variantStorageEngine.annotate(outputUri, new ObjectMap()); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java index 0319c2bc43b..3e6ab96edcd 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexTest.java @@ -31,6 +31,7 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; +import org.opencb.opencga.storage.core.metadata.models.Trio; import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; import org.opencb.opencga.storage.core.variant.VariantStorageOptions; @@ -111,12 +112,12 @@ public class SampleIndexTest extends VariantStorageBaseTest implements HadoopVar // Arrays.asList("NA19661", "NA19685", "NA19600"), // Arrays.asList("NA19685", "NA19600", "NA19660") // ); - private static List> trios = Arrays.asList( - Arrays.asList("NA19660", "NA19661", "NA19685"), - Arrays.asList("NA19660", "NA19661", "NA19600") + private static List trios = Arrays.asList( + new Trio("NA19660", "NA19661", "NA19685"), + new Trio("NA19660", "NA19661", "NA19600") ); - private static List> triosPlatinum = Arrays.asList( - Arrays.asList("NA12877", "-", "NA12878") + private static List triosPlatinum = Arrays.asList( + new Trio("NA12877", null, "NA12878") ); @Before @@ -341,8 +342,8 @@ public void regenerateSampleIndex() throws Exception { studyId, Collections.emptySet(), options), ""); - if (sampleNames.get(study).containsAll(trios.get(0))) { - options.put(FamilyIndexDriver.TRIOS, trios.stream().map(trio -> String.join(",", trio)).collect(Collectors.joining(";"))); + if (sampleNames.get(study).containsAll(trios.get(0).toList())) { + options.put(FamilyIndexDriver.TRIOS, trios.stream().map(Trio::serialize).collect(Collectors.joining(";"))); options.put(FamilyIndexDriver.OVERWRITE, true); new TestMRExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( dbAdaptor.getArchiveTableName(studyId), @@ -350,7 +351,7 @@ public void regenerateSampleIndex() throws Exception { studyId, Collections.emptySet(), options), ""); } else if (study.equals(STUDY_NAME_3)) { - options.put(FamilyIndexDriver.TRIOS, triosPlatinum.stream().map(trio -> String.join(",", trio)).collect(Collectors.joining(";"))); + options.put(FamilyIndexDriver.TRIOS, triosPlatinum.stream().map(Trio::serialize).collect(Collectors.joining(";"))); options.put(FamilyIndexDriver.OVERWRITE, true); new TestMRExecutor().run(FamilyIndexDriver.class, FamilyIndexDriver.buildArgs( dbAdaptor.getArchiveTableName(studyId), @@ -1036,8 +1037,8 @@ public void testApproximateCount() { @Test public void testFamilyIndexQueryCount() { - List trio = trios.get(0); - String proband = trio.get(2); + Trio trio = trios.get(0); + String proband = trio.getChild(); VariantQueryResult result = variantStorageEngine.get( new Query() .append(STUDY.key(), STUDY_NAME)