diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java
index 6fb44855fb7..bd71414a4e8 100644
--- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java
+++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManager.java
@@ -71,8 +71,9 @@
 import org.opencb.opencga.core.models.sample.SamplePermissions;
 import org.opencb.opencga.core.models.study.Study;
 import org.opencb.opencga.core.models.study.StudyPermissions;
+import org.opencb.opencga.core.models.study.VariantSetupResult;
+import org.opencb.opencga.core.models.variant.VariantSetupParams;
 import org.opencb.opencga.core.response.OpenCGAResult;
-import org.opencb.opencga.storage.core.variant.query.VariantQueryResult;
 import org.opencb.opencga.core.tools.ToolParams;
 import org.opencb.opencga.storage.core.StorageEngineFactory;
 import org.opencb.opencga.storage.core.StoragePipelineResult;
@@ -88,6 +89,7 @@
 import org.opencb.opencga.storage.core.variant.adaptors.iterators.VariantDBIterator;
 import org.opencb.opencga.storage.core.variant.io.VariantWriterFactory.VariantOutputFormat;
 import org.opencb.opencga.storage.core.variant.query.ParsedQuery;
+import org.opencb.opencga.storage.core.variant.query.VariantQueryResult;
 import org.opencb.opencga.storage.core.variant.query.VariantQueryUtils;
 import org.opencb.opencga.storage.core.variant.query.projection.VariantQueryProjectionParser;
 import org.opencb.opencga.storage.core.variant.score.VariantScoreFormatDescriptor;
@@ -490,6 +492,18 @@ public void aggregate(String studyStr, VariantAggregateParams params, String tok
         });
     }
 
+    public VariantSetupResult variantSetup(String studyStr, VariantSetupParams params, String token)
+            throws CatalogException, StorageEngineException {
+        return secureOperation(VariantSetupOperationManager.ID, studyStr, params.toObjectMap(), token,
+                engine -> new VariantSetupOperationManager(this, engine).setup(getStudyFqn(studyStr, token), params, token));
+    }
+
+    public boolean hasVariantSetup(String studyStr, String token) throws CatalogException {
+        Study study = catalogManager.getStudyManager().get(studyStr,
+                new QueryOptions(INCLUDE, StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE.key()), token).first();
+        return VariantSetupOperationManager.hasVariantSetup(study);
+    }
+
     public ObjectMap configureProject(String projectStr, ObjectMap params, String token) throws CatalogException, StorageEngineException {
         return secureOperationByProject("configure", projectStr, params, token, engine -> {
             DataStore dataStore = getDataStoreByProjectId(projectStr, token);
@@ -1181,7 +1195,7 @@ private interface VariantOperationFunction<R> {
     private <R> R secureOperationByProject(String operationName, String project, ObjectMap params, String token,
                                            VariantOperationFunction<R> operation) throws CatalogException, StorageEngineException {
         try (VariantStorageEngine variantStorageEngine = getVariantStorageEngineByProject(project, params, token)) {
-            return secureTool(operationName, true, params, token, variantStorageEngine, operation);
+            return secureTool(operationName, true, null, params, token, variantStorageEngine, operation);
         } catch (IOException e) {
             throw new StorageEngineException("Error closing the VariantStorageEngine", e);
         }
@@ -1190,7 +1204,7 @@ private <R> R secureOperationByProject(String operationName, String project, Obj
     private <R> R secureOperation(String operationName, String study, ObjectMap params, String token,
                                   VariantOperationFunction<R> operation) throws CatalogException, StorageEngineException {
         try (VariantStorageEngine variantStorageEngine = getVariantStorageEngineForStudyOperation(study, params, token)) {
-            return secureTool(operationName, true, params, token, variantStorageEngine, operation);
+            return secureTool(operationName, true, study, params, token, variantStorageEngine, operation);
         } catch (IOException e) {
             throw new StorageEngineException("Error closing the VariantStorageEngine", e);
         }
@@ -1199,7 +1213,7 @@ private <R> R secureOperation(String operationName, String study, ObjectMap para
     private <R> R secureAnalysis(String operationName, String study, ObjectMap params, String token,
                                  VariantOperationFunction<R> operation) throws CatalogException, StorageEngineException {
         try (VariantStorageEngine variantStorageEngine = getVariantStorageEngineForStudyOperation(study, params, token)) {
-            return secureTool(operationName, false, params, token, variantStorageEngine, operation);
+            return secureTool(operationName, false, study, params, token, variantStorageEngine, operation);
         } catch (IOException e) {
             throw new StorageEngineException("Error closing the VariantStorageEngine", e);
         }
@@ -1221,7 +1235,7 @@ private <R> R secureOperationByProject(String operationName, String projectStr,
         return secureOperationByProject(operationName, projectStr, params, token, operation);
     }
 
-    private <R> R secureTool(String toolId, boolean isOperation, ObjectMap params, String token,
+    private <R> R secureTool(String toolId, boolean isOperation, String study, ObjectMap params, String token,
                              VariantStorageEngine variantStorageEngine, VariantOperationFunction<R> operation)
             throws CatalogException, StorageEngineException {
@@ -1241,6 +1255,15 @@
             throw new StorageEngineException("Unable to execute operation '" + toolId + "'. "
                     + "The storage engine is in mode=" + storageConfiguration.getMode());
         }
+        if (isOperation && study != null && !VariantSetupOperationManager.ID.equals(toolId)) {
+            // Ensure that the variant setup has been executed.
+            // Do not check the setup operation itself.
+            // Project-level operations cannot be checked for setup.
+            if (!hasVariantSetup(study, token)) {
+                throw new StorageEngineException("Unable to execute operation '" + toolId + "'. "
+                        + "The variant storage has not been setup for study '" + study + "'");
+            }
+        }
         result = operation.apply(variantStorageEngine);
         return result;
     } catch (CatalogException | StorageEngineException e) {
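
With this change, every study-level storage operation is rejected until "variant-setup" has been run once for the study. A minimal sketch of the intended call order, assuming an already-configured VariantStorageManager, an existing study and a valid token (class name and all values below are illustrative, not part of the patch):

    import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
    import org.opencb.opencga.core.models.study.VariantSetupResult;
    import org.opencb.opencga.core.models.variant.VariantSetupParams;

    public class VariantSetupFirstSketch {
        public static void indexSafely(VariantStorageManager manager, String study, String token) throws Exception {
            if (!manager.hasVariantSetup(study, token)) {
                // Without this call, secureTool(...) above would throw:
                // "The variant storage has not been setup for study '<study>'"
                VariantSetupResult result = manager.variantSetup(study, new VariantSetupParams()
                        .setExpectedFiles(20)      // hint, not a hard limit
                        .setExpectedSamples(100)   // hint, not a hard limit
                        .setFileType(VariantSetupParams.FileType.GENOME_VCF), token);
                assert result.getStatus() == VariantSetupResult.Status.READY;
            }
            // Study-level operations (index, annotate, stats, ...) are allowed from here on.
        }
    }
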
diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantSetupOperationManager.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantSetupOperationManager.java
new file mode 100644
index 00000000000..0003ca105bf
--- /dev/null
+++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/manager/operations/VariantSetupOperationManager.java
@@ -0,0 +1,166 @@
+package org.opencb.opencga.analysis.variant.manager.operations;
+
+import org.opencb.commons.datastore.core.ObjectMap;
+import org.opencb.commons.datastore.core.QueryOptions;
+import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
+import org.opencb.opencga.catalog.db.api.StudyDBAdaptor;
+import org.opencb.opencga.catalog.exceptions.CatalogException;
+import org.opencb.opencga.core.common.TimeUtils;
+import org.opencb.opencga.core.models.study.Study;
+import org.opencb.opencga.core.models.study.VariantSetupResult;
+import org.opencb.opencga.core.models.variant.VariantSetupParams;
+import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
+import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager;
+import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class VariantSetupOperationManager extends OperationManager {
+
+    public static final String ID = "variant-setup";
+    private static Logger logger = LoggerFactory.getLogger(VariantSetupOperationManager.class);
+
+    public VariantSetupOperationManager(VariantStorageManager variantStorageManager, VariantStorageEngine variantStorageEngine) {
+        super(variantStorageManager, variantStorageEngine);
+    }
+
+    public VariantSetupResult setup(String studyFqn, VariantSetupParams params, String token)
+            throws CatalogException, StorageEngineException {
+        // Copy params to avoid modifying the input object
+        params = new VariantSetupParams(params);
+        check(studyFqn, params, token);
+
+        VariantSetupResult result = new VariantSetupResult();
+        result.setDate(TimeUtils.getTime());
+        result.setUserId(catalogManager.getUserManager().getUserIdContextStudy(studyFqn, token));
+        result.setParams(params.toObjectMap());
+        result.setStatus(VariantSetupResult.Status.READY);
+
+        inferParams(params);
+
+        ObjectMap options = variantStorageEngine.inferConfigurationParams(params);
+        result.setOptions(options);
+
+        catalogManager.getStudyManager().setVariantEngineSetupOptions(studyFqn, result, token);
+
+        return result;
+    }
+
+    /**
+     * Infer some parameters from others:
+     *  - averageFileSize inferred from fileType
+     *  - averageSamplesPerFile inferred from dataDistribution, or from expectedSamples and expectedFiles
+     *  - variantsPerSample inferred from fileType
+     * @param params params to infer
+     */
+    private void inferParams(VariantSetupParams params) {
+        if (params.getFileType() != null) {
+            switch (params.getFileType()) {
+                case GENOME_gVCF:
+                    if (params.getAverageFileSize() == null) {
+                        params.setAverageFileSize("1GiB");
+                    }
+                    if (params.getVariantsPerSample() == null) {
+                        params.setVariantsPerSample(5000000);
+                    }
+                    break;
+                case GENOME_VCF:
+                    if (params.getAverageFileSize() == null) {
+                        params.setAverageFileSize("500MiB");
+                    }
+                    if (params.getVariantsPerSample() == null) {
+                        params.setVariantsPerSample(5000000);
+                    }
+                    break;
+                case EXOME:
+                    if (params.getAverageFileSize() == null) {
+                        params.setAverageFileSize("100MiB");
+                    }
+                    if (params.getVariantsPerSample() == null) {
+                        params.setVariantsPerSample(100000);
+                    }
+                    break;
+                default:
+                    throw new IllegalArgumentException("Unknown fileType " + params.getFileType());
+            }
+        }
+        // Unable to tell. Use a default value for variantsPerSample
+        if (params.getVariantsPerSample() == null) {
+            params.setVariantsPerSample(5000000);
+        }
+
+        if (params.getAverageSamplesPerFile() == null) {
+            if (params.getDataDistribution() == null) {
+                params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue() / params.getExpectedFiles().floatValue());
+            } else {
+                switch (params.getDataDistribution()) {
+                    case SINGLE_SAMPLE_PER_FILE:
+                        params.setAverageSamplesPerFile(1f);
+                        break;
+                    case MULTIPLE_SAMPLES_PER_FILE:
+                        params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue() / params.getExpectedFiles().floatValue());
+                        break;
+                    case MULTIPLE_FILES_PER_SAMPLE:
+                        // Hard to tell. Let's assume 2 samples per file
+                        params.setAverageSamplesPerFile(2f);
+                        break;
+                    case FILES_SPLIT_BY_CHROMOSOME:
+                    case FILES_SPLIT_BY_REGION:
+                        params.setAverageSamplesPerFile(params.getExpectedSamples().floatValue());
+                        break;
+                    default:
+                        throw new IllegalArgumentException("Unknown dataDistribution " + params.getDataDistribution());
+                }
+            }
+        }
+    }
+
+    private void check(String studyStr, VariantSetupParams params, String token) throws CatalogException, StorageEngineException {
+        Study study = catalogManager.getStudyManager().get(studyStr,
+                new QueryOptions(QueryOptions.INCLUDE, StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE.key()), token)
+                .first();
+
+        VariantStorageMetadataManager metadataManager = variantStorageEngine.getMetadataManager();
+        if (metadataManager.studyExists(studyStr)) {
+            int studyId = metadataManager.getStudyId(studyStr);
+            if (!metadataManager.getIndexedFiles(studyId).isEmpty()) {
+                throw new IllegalArgumentException("Unable to execute variant-setup on study '" + studyStr + "'. "
+                        + "It already has indexed files.");
+            }
+        }
+        if (hasVariantSetup(study)) {
+            logger.info("Study {} was already setup. Re-executing variant-setup", studyStr);
+        }
+
+        if (params.getExpectedFiles() == null || params.getExpectedFiles() <= 0) {
+            throw new IllegalArgumentException("Missing expectedFiles");
+        }
+        if (params.getExpectedSamples() == null || params.getExpectedSamples() <= 0) {
+            throw new IllegalArgumentException("Missing expectedSamples");
+        }
+
+        if (params.getAverageFileSize() == null && params.getFileType() == null) {
+            throw new IllegalArgumentException("Missing averageFileSize or fileType");
+        }
+    }
+
+    public static boolean hasVariantSetup(Study study) {
+        boolean hasSetup = false;
+        VariantSetupResult setup = getVariantSetupResult(study);
+        if (setup != null && setup.getStatus() == VariantSetupResult.Status.READY) {
+            hasSetup = true;
+        }
+        return hasSetup;
+    }
+
+    private static VariantSetupResult getVariantSetupResult(Study study) {
+        if (study.getInternal() != null
+                && study.getInternal().getConfiguration() != null
+                && study.getInternal().getConfiguration().getVariantEngine() != null) {
+            return study.getInternal().getConfiguration().getVariantEngine().getSetup();
+        }
+        return null;
+    }
+
+}
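
To make the inference above concrete, a worked example (input values hypothetical): with fileType = GENOME_VCF the defaults averageFileSize = "500MiB" and variantsPerSample = 5000000 are filled in, and with dataDistribution = MULTIPLE_SAMPLES_PER_FILE the average samples per file comes from the ratio of the two expected counts:

    import org.opencb.opencga.core.models.variant.VariantSetupParams;

    class SetupInferenceExample {
        static VariantSetupParams example() {
            // 100 expected samples spread over 20 multi-sample genome VCFs
            return new VariantSetupParams()
                    .setFileType(VariantSetupParams.FileType.GENOME_VCF)
                    .setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_SAMPLES_PER_FILE)
                    .setExpectedFiles(20)
                    .setExpectedSamples(100);
            // After inferParams(...) runs on a copy of these params:
            //   averageFileSize       == "500MiB"            (from GENOME_VCF)
            //   variantsPerSample     == 5000000             (from GENOME_VCF)
            //   averageSamplesPerFile == 100f / 20f == 5.0f  (from MULTIPLE_SAMPLES_PER_FILE)
        }
    }
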
diff --git a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantIndexOperationTool.java b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantIndexOperationTool.java
index d4e7270d2e6..766962db671 100644
--- a/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantIndexOperationTool.java
+++ b/opencga-analysis/src/main/java/org/opencb/opencga/analysis/variant/operations/VariantIndexOperationTool.java
@@ -77,7 +77,9 @@ protected void check() throws Exception {
         params.putIfNotEmpty(VariantStorageOptions.INCLUDE_GENOTYPE.key(), indexParams.getIncludeGenotypes());
         params.put(VariantStorageOptions.STATS_AGGREGATION.key(), indexParams.getAggregated());
         params.putIfNotEmpty(VariantStorageOptions.STATS_AGGREGATION_MAPPING_FILE.key(), indexParams.getAggregationMappingFile());
-        params.put(VariantStorageOptions.GVCF.key(), indexParams.isGvcf());
+        if (indexParams.isGvcf()) {
+            params.put(VariantStorageOptions.GVCF.key(), indexParams.isGvcf());
+        }
 
 //        queryOptions.putIfNotNull(VariantFileIndexerStorageOperation.TRANSFORMED_FILES, indexParams.transformedPaths);
 
@@ -92,7 +94,9 @@ protected void check() throws Exception {
         params.put(VariantStorageOptions.FAMILY.key(), indexParams.isFamily());
         params.put(VariantStorageOptions.SOMATIC.key(), indexParams.isSomatic());
         params.putIfNotEmpty(VariantStorageOptions.LOAD_SPLIT_DATA.key(), indexParams.getLoadSplitData());
-        params.put(VariantStorageOptions.LOAD_MULTI_FILE_DATA.key(), indexParams.isLoadMultiFileData());
+        if (indexParams.isLoadMultiFileData()) {
+            params.put(VariantStorageOptions.LOAD_MULTI_FILE_DATA.key(), indexParams.isLoadMultiFileData());
+        }
         params.putIfNotEmpty(VariantStorageOptions.LOAD_SAMPLE_INDEX.key(), indexParams.getLoadSampleIndex());
         params.putIfNotEmpty(VariantStorageOptions.LOAD_ARCHIVE.key(), indexParams.getLoadArchive());
         params.putIfNotEmpty(VariantStorageOptions.LOAD_HOM_REF.key(), indexParams.getLoadHomRef());
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/clinical/ClinicalAnalysisUtilsTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/clinical/ClinicalAnalysisUtilsTest.java
index 5e6033750b6..87a8d1d2a26 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/clinical/ClinicalAnalysisUtilsTest.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/clinical/ClinicalAnalysisUtilsTest.java
@@ -24,6 +24,7 @@
 import org.opencb.biodata.models.variant.avro.SequenceOntologyTerm;
 import org.opencb.commons.datastore.core.ObjectMap;
 import org.opencb.opencga.analysis.variant.OpenCGATestExternalResource;
+import org.opencb.opencga.analysis.variant.manager.VariantOperationsTest;
 import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
 import org.opencb.opencga.catalog.exceptions.CatalogException;
 import org.opencb.opencga.catalog.managers.AbstractClinicalManagerTest;
@@ -56,11 +57,12 @@ public static AbstractClinicalManagerTest getClinicalTest(OpenCGATestExternalRes
                 .append(VariantStorageOptions.ANNOTATE.key(), true)
                 .append(VariantStorageOptions.STATS_CALCULATE.key(), false);
 
-        VariantStorageManager variantStorageManager = new VariantStorageManager(opencga.getCatalogManager(), opencga.getStorageEngineFactory());
+        VariantStorageManager variantStorageManager = opencga.getVariantStorageManager();
 
         Path outDir = Paths.get("target/test-data").resolve("junit_clinical_analysis_" + RandomStringUtils.randomAlphabetic(10));
         Files.createDirectories(outDir);
 
+        VariantOperationsTest.dummyVariantSetup(variantStorageManager, clinicalTest.studyFqn, clinicalTest.token);
         variantStorageManager.index(clinicalTest.studyFqn, "family.vcf", outDir.toString(), storageOptions, clinicalTest.token);
         variantStorageManager.index(clinicalTest.studyFqn, "exomiser.vcf.gz", outDir.toString(), storageOptions, clinicalTest.token);
         variantStorageManager.index(clinicalTest.studyFqn, "HG004.1k.vcf.gz", outDir.toString(), storageOptions, clinicalTest.token);
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/OpenCGATestExternalResource.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/OpenCGATestExternalResource.java
index b333a01b625..6fca2e7c2aa 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/OpenCGATestExternalResource.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/OpenCGATestExternalResource.java
@@ -25,12 +25,14 @@
 import org.opencb.opencga.analysis.clinical.exomiser.ExomiserInterpretationAnalysisTest;
 import org.opencb.opencga.analysis.tools.ToolRunner;
 import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
+import org.opencb.opencga.catalog.db.mongodb.MongoBackupUtils;
 import org.opencb.opencga.catalog.exceptions.CatalogException;
 import org.opencb.opencga.catalog.managers.CatalogManager;
 import org.opencb.opencga.catalog.managers.CatalogManagerExternalResource;
 import org.opencb.opencga.core.config.Configuration;
 import org.opencb.opencga.core.config.storage.StorageConfiguration;
 import org.opencb.opencga.core.models.file.File;
+import org.opencb.opencga.core.models.project.DataStore;
 import org.opencb.opencga.storage.core.StorageEngineFactory;
 import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
 import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
@@ -40,20 +42,20 @@
 import org.opencb.opencga.storage.core.variant.solr.VariantSolrExternalResource;
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine;
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest;
+import org.reflections.Reflections;
+import org.reflections.scanners.ResourcesScanner;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 import java.io.*;
 import java.net.URI;
+import java.net.URL;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
 import java.text.SimpleDateFormat;
-import java.util.Arrays;
-import java.util.Date;
-import java.util.List;
-import java.util.Map;
+import java.util.*;
 
 /**
  * Created on 26/08/15
@@ -73,7 +75,8 @@ public class OpenCGATestExternalResource extends ExternalResource {
 
     private ToolRunner toolRunner;
 
-    public static HadoopVariantStorageTest.HadoopExternalResource hadoopExternalResource = new HadoopVariantStorageTest.HadoopExternalResource();
+    public static HadoopVariantStorageTest.HadoopExternalResource hadoopExternalResource
+            = new HadoopVariantStorageTest.HadoopExternalResource();
 
     public OpenCGATestExternalResource() {
         this(false);
     }
@@ -235,6 +238,9 @@ public Path isolateOpenCGA() throws IOException {
 
         StorageEngineFactory.configure(storageConfiguration);
         storageEngineFactory = StorageEngineFactory.get(storageConfiguration);
+        if (storageEngine.equals(DummyVariantStorageEngine.STORAGE_ENGINE_ID)) {
+            DummyVariantStorageEngine.configure(getStorageEngineFactory(), true);
+        }
 
 //        inputStream = StorageEngine.class.getClassLoader().getResourceAsStream("client-configuration-test.yml");
 //        Files.copy(inputStream, conf.resolve("client-configuration.yml"), StandardCopyOption.REPLACE_EXISTING);
@@ -362,6 +368,34 @@ public String createTmpOutdir(String suffix) throws IOException {
 //        return getCatalogManager().getJobManager().createJobOutDir(studyId, "I_tmp_" + date + sufix, sessionId).toString();
     }
 
+    public void restore(URL resource) throws Exception {
+        if (resource.getProtocol().equals("jar")) {
+            Reflections reflections = new Reflections(resource.getPath().replace('/', '.'), new ResourcesScanner());
+            Set<String> resources = reflections.getResources(x -> true);
+            for (String file : resources) {
+                catalogManagerExternalResource.getResourceUri(file.replace('.', '/'));
+            }
+            MongoBackupUtils.restore(getCatalogManager(), opencgaHome, opencgaHome
+                    .resolve("resources")
+                    .resolve(resource.getPath())
+                    .resolve("mongodb"));
+        } else {
+            MongoBackupUtils.restore(getCatalogManager(), opencgaHome, Paths.get(resource.toURI()).resolve("mongodb"));
+        }
+        catalogManagerExternalResource.resetCatalogManager();
+    }
+
+    public final VariantStorageEngine getVariantStorageEngineByProject(String projectFqn) throws Exception {
+        DataStore dataStore = getVariantStorageManager().getDataStoreByProjectId(projectFqn, getAdminToken());
+        VariantStorageEngine variantStorageEngine = storageEngineFactory
+                .getVariantStorageEngine(dataStore.getStorageEngine(), dataStore.getDbName());
+        if (dataStore.getOptions() != null) {
+            variantStorageEngine.getOptions().putAll(dataStore.getOptions());
+        }
+        return variantStorageEngine;
+    }
+
 //    private class StorageLocalExecutorManager extends LocalExecutorManager {
 //
 //        public StorageLocalExecutorManager(String sessionId) {
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java
index c91909d3abb..f9e9392be80 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/VariantAnalysisTest.java
@@ -44,6 +44,7 @@
 import org.opencb.opencga.analysis.variant.gwas.GwasAnalysis;
 import org.opencb.opencga.analysis.variant.hrdetect.HRDetectAnalysis;
 import org.opencb.opencga.analysis.variant.knockout.KnockoutAnalysis;
+import org.opencb.opencga.analysis.variant.manager.VariantOperationsTest;
 import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
 import org.opencb.opencga.analysis.variant.mutationalSignature.MutationalSignatureAnalysis;
 import org.opencb.opencga.analysis.variant.operations.VariantIndexOperationTool;
@@ -179,6 +180,9 @@ public void setUp() throws Throwable {
 
         setUpCatalogManager();
 
+        VariantOperationsTest.dummyVariantSetup(variantStorageManager, STUDY, token);
+        VariantOperationsTest.dummyVariantSetup(variantStorageManager, CANCER_STUDY, token);
+
         file = opencga.createFile(STUDY, "variant-test-file.vcf.gz", token);
         variantStorageManager.index(STUDY, file.getId(), opencga.createTmpOutdir("_index"),
                 new ObjectMap(VariantStorageOptions.ANNOTATE.key(), true), token);
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java
index 8ad7f3c3479..ef6b79bd153 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantOperationsTest.java
@@ -17,6 +17,7 @@
 package org.opencb.opencga.analysis.variant.manager;
 
 import org.hamcrest.CoreMatchers;
+import org.hamcrest.MatcherAssert;
 import org.junit.*;
 import org.junit.experimental.categories.Category;
 import org.junit.runner.RunWith;
@@ -32,6 +33,7 @@
 import org.opencb.opencga.analysis.variant.OpenCGATestExternalResource;
 import org.opencb.opencga.analysis.variant.gwas.GwasAnalysis;
 import org.opencb.opencga.analysis.variant.operations.*;
+import org.opencb.opencga.catalog.exceptions.CatalogException;
 import org.opencb.opencga.catalog.managers.CatalogManager;
 import org.opencb.opencga.core.api.ParamConstants;
 import org.opencb.opencga.core.common.JacksonUtils;
@@ -53,10 +55,14 @@
 import org.opencb.opencga.core.models.project.ProjectCreateParams;
 import org.opencb.opencga.core.models.project.ProjectOrganism;
 import org.opencb.opencga.core.models.sample.*;
+import org.opencb.opencga.core.models.study.VariantSetupResult;
+import org.opencb.opencga.core.models.variant.VariantSetupParams;
 import org.opencb.opencga.core.response.OpenCGAResult;
 import org.opencb.opencga.core.testclassification.duration.LongTests;
 import org.opencb.opencga.core.tools.result.ExecutionResult;
 import org.opencb.opencga.storage.core.exceptions.StorageEngineException;
+import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager;
+import org.opencb.opencga.storage.core.metadata.models.SampleMetadata;
 import org.opencb.opencga.storage.core.metadata.models.VariantScoreMetadata;
 import org.opencb.opencga.storage.core.utils.CellBaseUtils;
 import org.opencb.opencga.storage.core.variant.VariantStorageEngine;
@@ -67,6 +73,7 @@
 import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageTest;
 import org.opencb.opencga.storage.hadoop.variant.VariantHbaseTestUtils;
 import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
+import org.slf4j.LoggerFactory;
 
 import java.nio.file.Path;
 import java.nio.file.Paths;
@@ -85,8 +92,9 @@ public class VariantOperationsTest {
     public static final String USER = "user";
     public static final String PASSWORD = TestParamConstants.PASSWORD;
     public static final String PROJECT = "project";
+    public static final String PROJECT_FQN = ORGANIZATION + '@' + PROJECT;
     public static final String STUDY = "study";
-    public static final String STUDY_FQN = ORGANIZATION + '@' + PROJECT + ':' + STUDY;
+    public static final String STUDY_FQN = PROJECT_FQN + ':' + STUDY;
     public static final String PHENOTYPE_NAME = "myPhenotype";
     public static final Phenotype PHENOTYPE = new Phenotype(PHENOTYPE_NAME, PHENOTYPE_NAME, "mySource")
             .setStatus(Phenotype.Status.OBSERVED);
@@ -156,8 +164,8 @@ public void tearDown() {
             if (storageEngine.equals(HadoopVariantStorageEngine.STORAGE_ENGINE_ID)) {
                 VariantHbaseTestUtils.printVariants(((VariantHadoopDBAdaptor) engine.getDBAdaptor()), Paths.get(opencga.createTmpOutdir("_hbase_print_variants_AFTER")).toUri());
             }
-        } catch (Exception ignore) {
-            ignore.printStackTrace();
+        } catch (Exception e) {
+            LoggerFactory.getLogger(getClass()).error("Ignoring exception printing variants", e);
         }
 
         hadoopExternalResource.after();
@@ -224,6 +232,8 @@ private void loadDataset() throws Throwable {
             solrExternalResource.configure(variantStorageManager.getVariantStorageEngineForStudyOperation(STUDY, new ObjectMap(), token));
         }
 
+        dummyVariantSetup(variantStorageManager, STUDY, token);
+
         file = opencga.createFile(STUDY, "variant-test-file.vcf.gz", token);
 //            variantStorageManager.index(STUDY, file.getId(), opencga.createTmpOutdir("_index"), new ObjectMap(VariantStorageOptions.ANNOTATE.key(), true), token);
         toolRunner.execute(VariantIndexOperationTool.class, STUDY,
@@ -285,6 +295,15 @@ private void loadDataset() throws Throwable {
         }
     }
 
+    public static void dummyVariantSetup(VariantStorageManager variantStorageManager, String study, String token)
+            throws CatalogException, StorageEngineException {
+        variantStorageManager.variantSetup(study, new VariantSetupParams()
+                .setAverageFileSize("100B")
+                .setExpectedFiles(5)
+                .setExpectedSamples(5)
+                .setVariantsPerSample(1000), token);
+    }
+
     public void setUpCatalogManager() throws Exception {
         catalogManager.getOrganizationManager().create(new OrganizationCreateParams().setId(ORGANIZATION), QueryOptions.empty(), opencga.getAdminToken());
@@ -309,6 +328,84 @@ public void setUpCatalogManager() throws Exception {
     }
 
+    @Test
+    public void testSetup() throws Exception {
+        String study2 = "study2";
+        String study2fqn = catalogManager.getStudyManager()
+                .create(PROJECT, study2, null, "Phase 1", "Done", null, null, null, null, null, token)
+                .first().getFqn();
+        File file = opencga.createFile(study2, "variant-test-file.vcf.gz", token);
+
+        try {
+            toolRunner.execute(VariantIndexOperationTool.class, study2,
+                    new VariantIndexParams()
+                            .setFile(file.getId())
+                            .setAnnotate(false)
+                            .setLoadHomRef(YesNoAuto.YES.name()),
+                    Paths.get(opencga.createTmpOutdir("_index")), "index", false, token);
+            fail("Should have thrown an exception");
+        } catch (ToolException e) {
+            MatcherAssert.assertThat(e.getCause().getMessage(), CoreMatchers.containsString("The variant storage has not been setup for study"));
+        }
+
+        try {
+            VariantSetupParams setupParams = new VariantSetupParams()
+                    .setFileType(VariantSetupParams.FileType.GENOME_VCF)
+                    .setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_SAMPLES_PER_FILE)
+                    .setExpectedFiles(20)
+                    .setExpectedSamples(100)
+                    .setNormalizeExtensions(Arrays.asList("VS", "SV"));
+            variantStorageManager.variantSetup(study2, setupParams, token);
+            fail("should have failed");
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+            MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("Unsupported normalize extensions"));
+        }
+
+        try {
+            VariantSetupParams setupParams = new VariantSetupParams()
+                    .setFileType(VariantSetupParams.FileType.GENOME_VCF)
+                    .setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_SAMPLES_PER_FILE)
+                    .setExpectedSamples(100)
+                    .setNormalizeExtensions(Arrays.asList("VS", "SV"));
+            variantStorageManager.variantSetup(study2, setupParams, token);
+            fail("should have failed");
+        } catch (Exception e) {
+            System.err.println(e.getMessage());
+            MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("Missing expectedFiles"));
+        }
+
+        VariantSetupParams setupParams = new VariantSetupParams()
+                .setFileType(VariantSetupParams.FileType.GENOME_VCF)
+                .setDataDistribution(VariantSetupParams.DataDistribution.MULTIPLE_FILES_PER_SAMPLE)
+                .setExpectedFiles(20)
+                .setAverageSamplesPerFile(2.5f)
+                .setExpectedSamples(10)
+                .setNormalizeExtensions(Arrays.asList("SV", "VAF"));
+        VariantSetupResult result = variantStorageManager.variantSetup(study2, setupParams, token);
+        assertEquals(VariantSetupResult.Status.READY, result.getStatus());
+
+        toolRunner.execute(VariantIndexOperationTool.class, study2,
+                new VariantIndexParams()
+                        .setFile(file.getId())
+                        .setLoadHomRef(YesNoAuto.YES.name()),
+                Paths.get(opencga.createTmpOutdir("_index")), "index", false, token);
+
+        VariantStorageMetadataManager metadataManager = opencga.getVariantStorageEngineByProject(PROJECT_FQN).getMetadataManager();
+        int studyId = metadataManager.getStudyId(study2fqn);
+        int sampleId = metadataManager.getSampleId(studyId, "NA19600");
+        SampleMetadata sampleMetadata = metadataManager.getSampleMetadata(studyId, sampleId);
+        assertEquals(VariantStorageEngine.SplitData.MULTI, sampleMetadata.getSplitData());
+
+        try {
+            variantStorageManager.variantSetup(STUDY, setupParams, token);
+            fail("Should fail");
+        } catch (Exception e) {
+            MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("Unable to execute variant-setup on study"));
+            MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("It already has indexed files."));
+        }
+    }
+
     @Test
     public void testVariantFileReload() throws Exception {
         try {
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java
index 0a5ac63293d..4aeedde871f 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/VariantStorageManagerTest.java
@@ -17,12 +17,15 @@
 package org.opencb.opencga.analysis.variant.manager;
 
 import org.apache.commons.lang3.RandomStringUtils;
+import org.hamcrest.CoreMatchers;
+import org.hamcrest.MatcherAssert;
 import org.junit.Test;
 import org.junit.experimental.categories.Category;
 import org.opencb.biodata.models.variant.metadata.Aggregation;
 import org.opencb.commons.datastore.core.ObjectMap;
 import org.opencb.commons.datastore.core.QueryOptions;
 import org.opencb.opencga.analysis.variant.manager.operations.AbstractVariantOperationManagerTest;
+import org.opencb.opencga.analysis.variant.stats.VariantStatsAnalysis;
 import org.opencb.opencga.catalog.exceptions.CatalogException;
 import org.opencb.opencga.core.config.storage.SampleIndexConfiguration;
 import org.opencb.opencga.core.models.file.File;
@@ -64,6 +67,18 @@ public void testConfigure() throws CatalogException, StorageEngineException {
 
         variantManager.configureStudy(studyFqn, expectedStudyConfiguration1, sessionId);
         variantManager.configureStudy(studyId2, expectedStudyConfiguration2, sessionId);
+
+        try {
+            Study study = catalogManager.getStudyManager().create(projectId, "s_no_setup", "s_no_setup", "s_no_setup",
+                    "Study 1", null, null, null, Collections.singletonMap(VariantStatsAnalysis.STATS_AGGREGATION_CATALOG, getAggregation()), null, sessionId)
+                    .first();
+            // Variant setup mandatory for configuring study
+            variantManager.configureStudy(study.getFqn(), expectedStudyConfiguration1, sessionId);
+            fail("Expect exception. Study not setup");
+        } catch (Exception e) {
+            MatcherAssert.assertThat(e.getMessage(), CoreMatchers.containsString("The variant storage has not been setup for study"));
+        }
+
         ObjectMap configuration = variantManager.getDataStoreByProjectId(projectId, sessionId).getOptions();
         assertEquals(expectedConfiguration, configuration);
diff --git a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/operations/AbstractVariantOperationManagerTest.java b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/operations/AbstractVariantOperationManagerTest.java
index 63b6ab090a3..73838816fb3 100644
--- a/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/operations/AbstractVariantOperationManagerTest.java
+++ b/opencga-analysis/src/test/java/org/opencb/opencga/analysis/variant/manager/operations/AbstractVariantOperationManagerTest.java
@@ -27,6 +27,7 @@
 import org.opencb.commons.test.GenericTest;
 import org.opencb.opencga.TestParamConstants;
 import org.opencb.opencga.analysis.variant.OpenCGATestExternalResource;
+import org.opencb.opencga.analysis.variant.manager.VariantOperationsTest;
 import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
 import org.opencb.opencga.analysis.variant.operations.OperationTool;
 import org.opencb.opencga.analysis.variant.stats.VariantStatsAnalysis;
@@ -169,6 +170,10 @@ public final void setUpAbstract() throws Exception {
                 true, null, QueryOptions.empty(), sessionId).first().getId();
 
         files = Arrays.asList(new File[5]);
+
+        VariantOperationsTest.dummyVariantSetup(variantManager, studyFqn, sessionId);
+        VariantOperationsTest.dummyVariantSetup(variantManager, studyId2, sessionId);
     }
 
     @After
diff --git a/opencga-app/pom.xml b/opencga-app/pom.xml
index cac11d63828..b738ad32c0e 100644
--- a/opencga-app/pom.xml
+++ b/opencga-app/pom.xml
@@ -239,12 +239,43 @@
             <artifactId>jline-terminal-jna</artifactId>
             <scope>runtime</scope>
         </dependency>
 
         <dependency>
             <groupId>org.opencb.opencga</groupId>
             <artifactId>opencga-storage-core</artifactId>
             <type>test-jar</type>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>org.opencb.opencga</groupId>
+            <artifactId>opencga-storage-hadoop-core</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.opencb.opencga</groupId>
+            <artifactId>opencga-storage-hadoop-core</artifactId>
+            <scope>test</scope>
+            <type>test-jar</type>
+        </dependency>
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>org.opencb.opencga.hadoop.thirdparty</groupId>
+            <artifactId>${opencga-hadoop-shaded.artifactId}</artifactId>
+            <version>${opencga.hadoop.thirdparty.version}</version>
+            <scope>test</scope>
+            <type>test-jar</type>
+            <exclusions>
+                <exclusion>
+                    <groupId>org.yaml</groupId>
+                    <artifactId>snakeyaml</artifactId>
+                </exclusion>
+            </exclusions>
+        </dependency>
@@ -726,10 +757,15 @@
+                Ensure that only one opencga-hadoop-shaded*.jar and opencga-storage-hadoop-compat*.jar libs are included
+
+
+
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java
index 4d1ed716a48..fa5b3482284 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpenCgaCompleter.java
@@ -104,7 +104,7 @@ public abstract class OpenCgaCompleter implements Completer {
             .map(Candidate::new)
             .collect(toList());
 
-    private List<String> operationsList = asList( "cellbase-configure","variant-aggregate","variant-annotation-delete","variant-annotation-index","variant-annotation-save","variant-configure","variant-delete","variant-family-aggregate","variant-family-index","variant-index","variant-index-launcher","variant-julie-run","variant-metadata-repair","variant-metadata-synchronize","variant-prune","variant-sample-delete","variant-sample-index","variant-sample-index-configure","variant-score-delete","variant-score-index","variant-secondary-annotation-index","variant-secondary-sample-index","configure-variant-secondary-sample-index","variant-secondary-index","variant-secondary-index-delete","variant-stats-delete","variant-stats-index","variant-study-delete")
+    private List<String> operationsList = asList( "cellbase-configure","variant-aggregate","variant-annotation-delete","variant-annotation-index","variant-annotation-save","variant-configure","variant-delete","variant-family-aggregate","variant-family-index","variant-index","variant-index-launcher","variant-julie-run","variant-metadata-repair","variant-metadata-synchronize","variant-prune","variant-sample-delete","variant-sample-index","variant-sample-index-configure","variant-score-delete","variant-score-index","variant-secondary-annotation-index","variant-secondary-sample-index","configure-variant-secondary-sample-index","variant-secondary-index","variant-secondary-index-delete","variant-setup","variant-stats-delete","variant-stats-index","variant-study-delete")
             .stream()
             .map(Candidate::new)
             .collect(toList());
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java
index d99dbba3488..cfd4cee9919 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/OpencgaCliOptionsParser.java
@@ -364,6 +364,7 @@ public OpencgaCliOptionsParser() {
         operationsVariantStorageSubCommands.addCommand("configure-variant-secondary-sample-index", operationsVariantStorageCommandOptions.configureVariantSecondarySampleIndexCommandOptions);
         operationsVariantStorageSubCommands.addCommand("variant-secondary-index", operationsVariantStorageCommandOptions.secondaryIndexVariantCommandOptions);
         operationsVariantStorageSubCommands.addCommand("variant-secondary-index-delete", operationsVariantStorageCommandOptions.deleteVariantSecondaryIndexCommandOptions);
+        operationsVariantStorageSubCommands.addCommand("variant-setup", operationsVariantStorageCommandOptions.setupVariantCommandOptions);
         operationsVariantStorageSubCommands.addCommand("variant-stats-delete", operationsVariantStorageCommandOptions.deleteVariantStatsCommandOptions);
         operationsVariantStorageSubCommands.addCommand("variant-stats-index", operationsVariantStorageCommandOptions.indexVariantStatsCommandOptions);
         operationsVariantStorageSubCommands.addCommand("variant-study-delete", operationsVariantStorageCommandOptions.deleteVariantStudyCommandOptions);
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/OperationsVariantStorageCommandExecutor.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/OperationsVariantStorageCommandExecutor.java
index 6f351a6b38a..35f0c49c312 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/OperationsVariantStorageCommandExecutor.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/executors/OperationsVariantStorageCommandExecutor.java
@@ -37,6 +37,8 @@
 import org.opencb.opencga.core.models.operations.variant.VariantStorageMetadataRepairToolParams;
 import org.opencb.opencga.core.models.operations.variant.VariantStorageMetadataSynchronizeParams;
 import org.opencb.opencga.core.models.operations.variant.VariantStudyDeleteParams;
+import org.opencb.opencga.core.models.study.VariantSetupResult;
+import org.opencb.opencga.core.models.variant.VariantSetupParams;
 import org.opencb.opencga.core.response.QueryType;
 import org.opencb.opencga.core.response.RestResponse;
@@ -149,6 +151,9 @@ public void execute() throws Exception {
             case "variant-secondary-index-delete":
                 queryResponse = deleteVariantSecondaryIndex();
                 break;
+            case "variant-setup":
+                queryResponse = setupVariant();
+                break;
             case "variant-stats-delete":
                 queryResponse = deleteVariantStats();
                 break;
@@ -1146,6 +1151,45 @@ private RestResponse deleteVariantSecondaryIndex() throws Exception {
         return openCGAClient.getVariantOperationClient().deleteVariantSecondaryIndex(queryParams);
     }
 
+    private RestResponse<VariantSetupResult> setupVariant() throws Exception {
+        logger.debug("Executing setupVariant in Operations - Variant Storage command line");
+
+        OperationsVariantStorageCommandOptions.SetupVariantCommandOptions commandOptions = operationsVariantStorageCommandOptions.setupVariantCommandOptions;
+
+        ObjectMap queryParams = new ObjectMap();
+        queryParams.putIfNotEmpty("study", commandOptions.study);
+        if (queryParams.get("study") == null && OpencgaMain.isShellMode()) {
+            queryParams.putIfNotEmpty("study", sessionManager.getSession().getCurrentStudy());
+        }
+
+        VariantSetupParams variantSetupParams = null;
+        if (commandOptions.jsonDataModel) {
+            RestResponse<VariantSetupResult> res = new RestResponse<>();
+            res.setType(QueryType.VOID);
+            PrintUtils.println(getObjectAsJSON(categoryName, "/{apiVersion}/operation/variant/setup"));
+            return res;
+        } else if (commandOptions.jsonFile != null) {
+            variantSetupParams = JacksonUtils.getDefaultObjectMapper()
+                    .readValue(new java.io.File(commandOptions.jsonFile), VariantSetupParams.class);
+        } else {
+            ObjectMap beanParams = new ObjectMap();
+            putNestedIfNotNull(beanParams, "expectedSamples", commandOptions.expectedSamples, true);
+            putNestedIfNotNull(beanParams, "expectedFiles", commandOptions.expectedFiles, true);
+            putNestedIfNotNull(beanParams, "fileType", commandOptions.fileType, true);
+            putNestedIfNotEmpty(beanParams, "averageFileSize", commandOptions.averageFileSize, true);
+            putNestedIfNotNull(beanParams, "variantsPerSample", commandOptions.variantsPerSample, true);
+            putNestedIfNotNull(beanParams, "averageSamplesPerFile", commandOptions.averageSamplesPerFile, true);
+            putNestedIfNotNull(beanParams, "dataDistribution", commandOptions.dataDistribution, true);
+            putNestedIfNotNull(beanParams, "normalizeExtensions", commandOptions.normalizeExtensions, true);
+
+            variantSetupParams = JacksonUtils.getDefaultObjectMapper().copy()
+                    .configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, true)
+                    .readValue(beanParams.toJson(), VariantSetupParams.class);
+        }
+        return openCGAClient.getVariantOperationClient().setupVariant(variantSetupParams, queryParams);
+    }
+
     private RestResponse deleteVariantStats() throws Exception {
         logger.debug("Executing deleteVariantStats in Operations - Variant Storage command line");
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/OperationsVariantStorageCommandOptions.java b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/OperationsVariantStorageCommandOptions.java
index ac7b0b968fe..6f932901d33 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/OperationsVariantStorageCommandOptions.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/cli/main/options/OperationsVariantStorageCommandOptions.java
@@ -58,6 +58,7 @@ public class OperationsVariantStorageCommandOptions {
     public ConfigureVariantSecondarySampleIndexCommandOptions configureVariantSecondarySampleIndexCommandOptions;
     public SecondaryIndexVariantCommandOptions secondaryIndexVariantCommandOptions;
     public DeleteVariantSecondaryIndexCommandOptions deleteVariantSecondaryIndexCommandOptions;
+    public SetupVariantCommandOptions setupVariantCommandOptions;
     public DeleteVariantStatsCommandOptions deleteVariantStatsCommandOptions;
     public IndexVariantStatsCommandOptions indexVariantStatsCommandOptions;
     public DeleteVariantStudyCommandOptions deleteVariantStudyCommandOptions;
@@ -92,6 +93,7 @@ public OperationsVariantStorageCommandOptions(CommonCommandOptions commonCommand
         this.configureVariantSecondarySampleIndexCommandOptions = new ConfigureVariantSecondarySampleIndexCommandOptions();
         this.secondaryIndexVariantCommandOptions = new SecondaryIndexVariantCommandOptions();
         this.deleteVariantSecondaryIndexCommandOptions = new DeleteVariantSecondaryIndexCommandOptions();
+        this.setupVariantCommandOptions = new SetupVariantCommandOptions();
         this.deleteVariantStatsCommandOptions = new DeleteVariantStatsCommandOptions();
         this.indexVariantStatsCommandOptions = new IndexVariantStatsCommandOptions();
         this.deleteVariantStudyCommandOptions = new DeleteVariantStudyCommandOptions();
@@ -1360,6 +1362,47 @@ public class DeleteVariantSecondaryIndexCommandOptions {
 
     }
 
+    @Parameters(commandNames = {"variant-setup"}, commandDescription = "Execute Variant Setup to allow using the variant engine. This setup is necessary before starting any variant operation.")
+    public class SetupVariantCommandOptions {
+
+        @ParametersDelegate
+        public CommonCommandOptions commonOptions = commonCommandOptions;
+
+        @Parameter(names = {"--json-file"}, description = "File with the body data in JSON format. Note, that using this parameter will ignore all the other parameters.", required = false, arity = 1)
+        public String jsonFile;
+
+        @Parameter(names = {"--json-data-model"}, description = "Show example of file structure for body data.", help = true, arity = 0)
+        public Boolean jsonDataModel = false;
+
+        @Parameter(names = {"--study", "-s"}, description = "Study [[organization@]project:]study where study and project can be either the ID or UUID", required = false, arity = 1)
+        public String study;
+
+        @Parameter(names = {"--expected-samples"}, description = "Expected number of samples that will be loaded. Used to infer some parameters. This number is only used as a hint; if the real number of samples is different, or grows beyond expectation, the loader should be able to handle it.", required = false, arity = 1)
+        public Integer expectedSamples;
+
+        @Parameter(names = {"--expected-files"}, description = "Expected number of files that will be loaded. Used to infer some parameters. This number is only used as a hint. If the real number of files is different, the loader should be able to handle it.", required = false, arity = 1)
+        public Integer expectedFiles;
+
+        @Parameter(names = {"--file-type"}, description = "Main type of the files that will be loaded. If the dataset contains multiple types of files, provide the one that matches most of the files.", required = false, arity = 1)
+        public String fileType;
+
+        @Parameter(names = {"--average-file-size"}, description = "Average size of the files that will be loaded. This number is only used as a hint. If the real size of the files is different, the loader should be able to handle it. Accepts units. e.g. 435MB, 2GB, 86KB. If not provided, the value will be inferred from the file type.", required = false, arity = 1)
+        public String averageFileSize;
+
+        @Parameter(names = {"--variants-per-sample"}, description = "Number of variants per sample (non hom_ref variants). This number is only used as a hint. If the real number of variants per sample is different, the loader should be able to handle it. If not provided, the value will be inferred from the file type.", required = false, arity = 1)
+        public Integer variantsPerSample;
+
+        @Parameter(names = {"--average-samples-per-file"}, description = "Average number of samples per file. This number is only used as a hint. If the real number of samples per file is different, the loader should be able to handle it. If not provided, the value will be inferred from the expectedSamples, expectedFiles and dataDistribution.", required = false, arity = 1)
+        public Float averageSamplesPerFile;
+
+        @Parameter(names = {"--data-distribution"}, description = "Data distribution of the files. This parameter is used to infer the number of samples per file.", required = false, arity = 1)
+        public String dataDistribution;
+
+        @Parameter(names = {"--normalize-extensions"}, description = "List of normalization extensions that will be used to normalize the files.", required = false, arity = 1)
+        public String normalizeExtensions;
+
+    }
+
     @Parameters(commandNames = {"variant-stats-delete"}, commandDescription = "Deletes the VariantStats of a cohort/s from the database")
     public class DeleteVariantStatsCommandOptions {
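
Put together, the new subcommand can be exercised from the command line or through the Java client. The sketch below is illustrative only (study FQN and sizes are made up; the OpenCGAClient import path is assumed); the Java call mirrors what setupVariant() in the executor above ends up invoking:

    import org.opencb.commons.datastore.core.ObjectMap;
    import org.opencb.opencga.client.rest.OpenCGAClient;   // assumed client entry point
    import org.opencb.opencga.core.models.variant.VariantSetupParams;

    class SetupVariantClientSketch {
        // CLI equivalent (illustrative): opencga.sh operations variant-setup \
        //     --study org@project:study --expected-files 20 --expected-samples 100 --file-type GENOME_VCF
        static void run(OpenCGAClient openCGAClient) throws Exception {
            ObjectMap queryParams = new ObjectMap("study", "org@project:study");
            VariantSetupParams body = new VariantSetupParams()
                    .setExpectedFiles(20)
                    .setExpectedSamples(100)
                    .setFileType(VariantSetupParams.FileType.GENOME_VCF);
            openCGAClient.getVariantOperationClient().setupVariant(body, queryParams);
        }
    }
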
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/StorageMigrationTool.java b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/StorageMigrationTool.java
index c9aeee6c7e2..4952f9d9812 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/StorageMigrationTool.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/StorageMigrationTool.java
@@ -46,10 +46,10 @@ protected final VariantStorageEngine getVariantStorageEngineByProject(String pro
      * @return List of projects
      * @throws Exception on error
      */
-    protected final List<String> getVariantStorageProjects(String organizationId) throws Exception {
+    protected final List<String> getVariantStorageProjects() throws Exception {
         Set<String> projects = new LinkedHashSet<>();
 
-        for (String studyFqn : getVariantStorageStudies(organizationId)) {
+        for (String studyFqn : getVariantStorageStudies()) {
             projects.add(catalogManager.getStudyManager().getProjectFqn(studyFqn));
         }
 
@@ -61,7 +61,7 @@
      * @return List of projects
      * @throws Exception on error
      */
-    protected final List<String> getVariantStorageStudies(String organizationId) throws Exception {
+    protected final List<String> getVariantStorageStudies() throws Exception {
         Set<String> studies = new LinkedHashSet<>();
         VariantStorageManager variantStorageManager = getVariantStorageManager();
         for (Study study : catalogManager.getStudyManager().searchInOrganization(organizationId, new Query(), new QueryOptions(QueryOptions.INCLUDE,
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/AddAllelesColumnToPhoenix.java b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/AddAllelesColumnToPhoenix.java
index 93104000f20..d641207a04e 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/AddAllelesColumnToPhoenix.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/AddAllelesColumnToPhoenix.java
@@ -11,7 +11,7 @@ public class AddAllelesColumnToPhoenix extends StorageMigrationTool {
 
     @Override
     protected void run() throws Exception {
-        for (String project : getVariantStorageProjects(organizationId)) {
+        for (String project : getVariantStorageProjects()) {
             VariantStorageEngine engine = getVariantStorageEngineByProject(project);
             if (engine.getStorageEngineId().equals("hadoop")) {
                 logger.info("Adding missing columns (if any) for project " + project);
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/DetectIllegalConcurrentFileLoadingsMigration.java b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/DetectIllegalConcurrentFileLoadingsMigration.java
index 6d30f1a4de7..aba2f366b77 100644
--- a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/DetectIllegalConcurrentFileLoadingsMigration.java
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v2/v2_12_5/storage/DetectIllegalConcurrentFileLoadingsMigration.java
@@ -42,7 +42,7 @@ public class DetectIllegalConcurrentFileLoadingsMigration extends StorageMigrati
 
     @Override
     protected void run() throws Exception {
-        for (String project : getVariantStorageProjects(organizationId)) {
+        for (String project : getVariantStorageProjects()) {
             VariantStorageEngine engine = getVariantStorageEngineByProject(project);
             if (!engine.getStorageEngineId().equals("hadoop")) {
                 continue;
diff --git a/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v3_2_0/VariantSetupMigration.java b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v3_2_0/VariantSetupMigration.java
new file mode 100644
index 00000000000..9d02073d099
--- /dev/null
+++ b/opencga-app/src/main/java/org/opencb/opencga/app/migrations/v3_2_0/VariantSetupMigration.java
@@ -0,0 +1,47 @@
+package org.opencb.opencga.app.migrations.v3_2_0;
+
+import org.opencb.commons.datastore.core.ObjectMap;
+import org.opencb.opencga.analysis.variant.manager.VariantStorageManager;
+import org.opencb.opencga.app.migrations.StorageMigrationTool;
+import org.opencb.opencga.catalog.migration.Migration;
+import org.opencb.opencga.core.common.TimeUtils;
+import org.opencb.opencga.core.models.study.VariantSetupResult;
+import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager;
+
+import java.util.LinkedHashSet;
+
+@Migration(id = "variant_setup", description = "Add a dummy variant setup for studies with data", version = "3.2.0",
+        domain = Migration.MigrationDomain.STORAGE, date = 20240516)
+public class VariantSetupMigration extends StorageMigrationTool {
+
+    @Override
+    protected void run() throws Exception {
+        VariantStorageManager variantStorageManager = getVariantStorageManager();
+        for (String study : getVariantStorageStudies()) {
+            logger.info("--- Checking study '{}'", study);
+            if (variantStorageManager.hasVariantSetup(study, token)) {
+                logger.info("Study '{}' already has a variant setup", study);
+                continue;
+            }
+
+            String projectFqn = catalogManager.getStudyManager().getProjectFqn(study);
+            VariantStorageMetadataManager metadataManager = getVariantStorageEngineByProject(projectFqn).getMetadataManager();
+            int studyId = metadataManager.getStudyId(study);
+            LinkedHashSet<Integer> indexedFiles = metadataManager.getIndexedFiles(studyId);
+            if (indexedFiles.isEmpty()) {
+                logger.info("Study '{}' does not have any indexed files. Skipping variant setup", study);
+                continue;
+            }
+            logger.info("Study '{}' doesn't have a variant setup, but it has {} indexed files. Creating a dummy variant setup",
+                    study, indexedFiles.size());
+            VariantSetupResult dummy = new VariantSetupResult();
+            dummy.setDate(TimeUtils.getTime());
+            dummy.setUserId(catalogManager.getUserManager().getUserId(token));
+            dummy.setParams(new ObjectMap("executed_from_migration", getId()));
+            dummy.setStatus(VariantSetupResult.Status.READY);
+            dummy.setOptions(new ObjectMap());
+            catalogManager.getStudyManager().setVariantEngineSetupOptions(study, dummy, token);
+        }
+    }
+}
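
For studies migrated this way, downstream checks only look at the stored status, so the dummy record is enough to unblock operations. A sketch of what consumers effectively test (field access mirrors getVariantSetupResult(...) and hasVariantSetup(...) in the operation manager above; the wrapper class is hypothetical):

    import org.opencb.opencga.core.models.study.Study;
    import org.opencb.opencga.core.models.study.VariantSetupResult;

    class SetupStatusCheckSketch {
        static boolean hasReadySetup(Study study) {
            // The migration's dummy record (params = {"executed_from_migration": <id>}, empty options)
            // satisfies this check without recording any real sizing hints.
            if (study.getInternal() == null
                    || study.getInternal().getConfiguration() == null
                    || study.getInternal().getConfiguration().getVariantEngine() == null) {
                return false;
            }
            VariantSetupResult setup = study.getInternal().getConfiguration().getVariantEngine().getSetup();
            return setup != null && setup.getStatus() == VariantSetupResult.Status.READY;
        }
    }
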
opencga.getAdminToken()).getResults()) { + if (study.getFqn().equals(studyName)) { + Assert.assertNotNull(study.getInternal().getConfiguration().getVariantEngine().getSetup()); + } else { + Assert.assertNull(study.getInternal().getConfiguration().getVariantEngine().getSetup()); + } + } + } + + @After + public void tearDown() throws Exception { + if (opencga != null) { + opencga.after(); + opencga = null; + } + } + + protected void testMigration(Class<? extends MigrationTool> migration, String dataset) throws Exception { + setup(dataset); + runMigration(migration); + } + + private void setup(String dataset) throws Exception { + setup(dataset, false); + } + + private void setup(String dataset, boolean storageHadoop) throws Exception { + if (opencga != null) { + opencga.after(); + opencga = null; + } + opencga = new OpenCGATestExternalResource(storageHadoop); + opencga.before(); + URL resource = getClass().getResource("/datasets/opencga/" + dataset + "/"); + opencga.restore(resource); + } + + private void runMigration(Class<? extends MigrationTool> migration) throws CatalogException { + Migration annotation = migration.getAnnotation(Migration.class); + opencga.getCatalogManager().getMigrationManager() + .runManualMigration(annotation.version(), annotation.id(), opencga.getOpencgaHome(), new ObjectMap(), opencga.getAdminToken()); + } + +} \ No newline at end of file diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/audit.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/audit.json.gz new file mode 100644 index 00000000000..2b44b35ae32 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/audit.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/file.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/file.json.gz new file mode 100644 index 00000000000..cb73bece530 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/file.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/migration.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/migration.json.gz new file mode 100644 index 00000000000..58bb941a512 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/migration.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/note.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/note.json.gz new file mode 100644 index 00000000000..7500b56e11e Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/note.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/organization.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/organization.json.gz new file mode 100644 index 00000000000..57cafc8910a Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/organization.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/project.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/project.json.gz new file mode 100644 index 00000000000..ef669b9c164 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/project.json.gz differ diff --git 
a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/study.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/study.json.gz new file mode 100644 index 00000000000..e5aad572d01 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/study.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/user.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/user.json.gz new file mode 100644 index 00000000000..3425717560d Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/opencga/user.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/audit.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/audit.json.gz new file mode 100644 index 00000000000..9e783d0c02b Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/audit.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/cohort.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/cohort.json.gz new file mode 100644 index 00000000000..fff41de3126 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/cohort.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/file.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/file.json.gz new file mode 100644 index 00000000000..72820a3d31c Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/file.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/individual.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/individual.json.gz new file mode 100644 index 00000000000..ed972760e0b Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/individual.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/migration.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/migration.json.gz new file mode 100644 index 00000000000..432b4097641 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/migration.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/organization.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/organization.json.gz new file mode 100644 index 00000000000..9290eaeeff1 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/organization.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/project.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/project.json.gz new file mode 100644 index 00000000000..41e2ae77fa8 Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/project.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/sample.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/sample.json.gz new file mode 100644 index 00000000000..f8a2ba4f5fe Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/sample.json.gz differ 
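Each of these .json.gz fixtures stores one MongoDB document per line, produced by the export script documented inside MongoBackupUtils further below. A minimal reading sketch, assuming only that a fixture is available on the local filesystem (the path below is hypothetical; the tests resolve the dataset folder from the classpath instead):

import org.bson.Document;

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.GZIPInputStream;

public class FixtureReadSketch {
    public static void main(String[] args) throws Exception {
        // Hypothetical local copy of one of the fixtures added by this patch
        Path fixture = Paths.get("datasets/opencga/v3.0.0/mongodb/test/sample.json.gz");
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(
                new GZIPInputStream(Files.newInputStream(fixture))))) {
            reader.lines()
                    .filter(line -> !line.isEmpty())
                    // One JSON object per line, as emitted by tojsononeline(d) in the export script
                    .map(Document::parse)
                    .forEach(document -> System.out.println(document.get("_id")));
        }
    }
}
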
diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/study.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/study.json.gz new file mode 100644 index 00000000000..0ea0384acdc Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/study.json.gz differ diff --git a/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/user.json.gz b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/user.json.gz new file mode 100644 index 00000000000..aefbad4adcc Binary files /dev/null and b/opencga-app/src/test/resources/datasets/opencga/v3.0.0/mongodb/test/user.json.gz differ diff --git a/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/StudyManager.java b/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/StudyManager.java index 848cb8e5e2d..70f933f6430 100644 --- a/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/StudyManager.java +++ b/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/StudyManager.java @@ -1820,6 +1820,40 @@ public void setVariantEngineConfigurationOptions(String studyStr, ObjectMap opti getStudyDBAdaptor(organizationId).update(study.getUid(), parameters, QueryOptions.empty()); } + public void setVariantEngineSetupOptions(String studyStr, VariantSetupResult variantSetupResult, String token) throws CatalogException { + JwtPayload tokenPayload = catalogManager.getUserManager().validateToken(token); + CatalogFqn studyFqn = CatalogFqn.extractFqnFromStudy(studyStr, tokenPayload); + String organizationId = studyFqn.getOrganizationId(); + String userId = tokenPayload.getUserId(organizationId); + Study study = resolveId(studyFqn, + new QueryOptions(QueryOptions.INCLUDE, Arrays.asList(StudyDBAdaptor.QueryParams.UID.key(), + StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE.key())), + tokenPayload); + + authorizationManager.checkIsAtLeastStudyAdministrator(organizationId, study.getUid(), userId); + StudyVariantEngineConfiguration configuration = study.getInternal().getConfiguration().getVariantEngine(); + if (configuration == null) { + configuration = new StudyVariantEngineConfiguration(); + } + if (configuration.getOptions() == null) { + configuration.setOptions(new ObjectMap()); + } + VariantSetupResult prevSetupValue = configuration.getSetup(); + if (prevSetupValue != null && prevSetupValue.getOptions() != null) { + // Variant setup was already executed. 
+ // Remove the options from the previous execution before adding the new ones + // Check that both key and value match, to avoid removing options that might have been modified manually + for (Map.Entry<String, Object> entry : prevSetupValue.getOptions().entrySet()) { + configuration.getOptions().remove(entry.getKey(), entry.getValue()); + } + } + configuration.getOptions().putAll(variantSetupResult.getOptions()); + configuration.setSetup(variantSetupResult); + + ObjectMap parameters = new ObjectMap(StudyDBAdaptor.QueryParams.INTERNAL_CONFIGURATION_VARIANT_ENGINE.key(), configuration); + getStudyDBAdaptor(organizationId).update(study.getUid(), parameters, QueryOptions.empty()); + } + public void setVariantEngineConfigurationSampleIndex(String studyStr, SampleIndexConfiguration sampleIndexConfiguration, String token) throws CatalogException { JwtPayload tokenPayload = catalogManager.getUserManager().validateToken(token); diff --git a/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/UserManager.java b/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/UserManager.java index 8070d177233..7e8e8cdffeb 100644 --- a/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/UserManager.java +++ b/opencga-catalog/src/main/java/org/opencb/opencga/catalog/managers/UserManager.java @@ -31,6 +31,7 @@ import org.opencb.opencga.catalog.db.api.UserDBAdaptor; import org.opencb.opencga.catalog.exceptions.*; import org.opencb.opencga.catalog.io.CatalogIOManager; +import org.opencb.opencga.catalog.utils.CatalogFqn; import org.opencb.opencga.catalog.utils.ParamUtils; import org.opencb.opencga.catalog.utils.UuidUtils; import org.opencb.opencga.core.api.ParamConstants; @@ -1572,4 +1573,22 @@ private String getValidUserId(String userId, JwtPayload payload) throws CatalogE } } + public String getUserId(String token) throws CatalogException { + JwtPayload tokenPayload = catalogManager.getUserManager().validateToken(token); + return tokenPayload.getUserId(); + } + + public String getUserIdContextProject(String projectStr, String token) throws CatalogException { + JwtPayload tokenPayload = catalogManager.getUserManager().validateToken(token); + CatalogFqn catalogFqn = CatalogFqn.extractFqnFromProject(projectStr, tokenPayload); + String organizationId = catalogFqn.getOrganizationId(); + return tokenPayload.getUserId(organizationId); + } + + public String getUserIdContextStudy(String studyStr, String token) throws CatalogException { + JwtPayload tokenPayload = catalogManager.getUserManager().validateToken(token); + CatalogFqn catalogFqn = CatalogFqn.extractFqnFromStudy(studyStr, tokenPayload); + String organizationId = catalogFqn.getOrganizationId(); + return tokenPayload.getUserId(organizationId); + } } diff --git a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/db/mongodb/MongoBackupUtils.java b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/db/mongodb/MongoBackupUtils.java index 48ee2b0cfc0..252085c353b 100644 --- a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/db/mongodb/MongoBackupUtils.java +++ b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/db/mongodb/MongoBackupUtils.java @@ -19,18 +19,17 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.DataOutputStream; -import java.io.FileOutputStream; -import java.io.IOException; +import java.io.*; import java.net.URI; import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.HashMap; 
-import java.util.LinkedList; -import java.util.List; -import java.util.Map; +import java.util.*; import java.util.concurrent.TimeUnit; +import java.util.stream.Stream; +import java.util.zip.GZIPInputStream; /** * Contains two methods mainly for testing purposes. One to create a dump of the current testing OpenCGA installation and a second one to @@ -38,6 +37,7 @@ */ public class MongoBackupUtils { + private static final String TEMPORAL_FOLDER_HERE = "TEMPORAL_FOLDER_HERE"; private static Logger logger = LoggerFactory.getLogger(MongoBackupUtils.class); public static void dump(CatalogManager catalogManager, Path opencgaHome) throws CatalogDBException { @@ -89,12 +89,12 @@ public static void dump(CatalogManager catalogManager, Path opencgaHome) throws } public static void restore(CatalogManager catalogManager, Path opencgaHome) - throws CatalogDBException, IOException, CatalogIOException, URISyntaxException { - StopWatch stopWatch = StopWatch.createStarted(); + throws Exception { try (MongoDBAdaptorFactory dbAdaptorFactory = new MongoDBAdaptorFactory(catalogManager.getConfiguration(), catalogManager.getIoManagerFactory())) { MongoClient mongoClient = dbAdaptorFactory.getOrganizationMongoDBAdaptorFactory(ParamConstants.ADMIN_ORGANIZATION) .getMongoDataStore().getMongoClient(); + MongoDatabase dumpDatabase = mongoClient.getDatabase("test_dump"); Map<String, String> databaseNames = new HashMap<>(); try (MongoCursor<Document> mongoIterator = dumpDatabase.getCollection("summary") @@ -107,25 +107,67 @@ public static void restore(CatalogManager catalogManager, Path opencgaHome) } } - List<String> organizationIds = dbAdaptorFactory.getOrganizationIds(); - for (String organizationId : organizationIds) { + restore(catalogManager, opencgaHome, dumpDatabase, databaseNames.keySet()); + } + } + + public static void restore(CatalogManager catalogManager, Path opencgaHome, URL restoreFolder) + throws Exception { + /* + dataset=v3.0.0 + mongo test_dump --eval 'db.getCollectionNames()' --quiet | jq .[] -r | grep -v summary | while read i ; do + org=$(echo $i | cut -d "_" -f 1) ; + collection=$(echo $i | cut -d "_" -f 3) ; + mkdir -p opencga-app/src/test/resources/datasets/opencga/$dataset/mongodb/$org ; + mongo test_dump --quiet --eval "db.getCollection(\"$i\").find().forEach(function(d){ print(tojsononeline(d)); })" | gzip > opencga-app/src/test/resources/datasets/opencga/$dataset/mongodb/$org/$collection.json.gz ; + done + */ + if (restoreFolder.getProtocol().equals("file")) { + restore(catalogManager, opencgaHome, Paths.get(restoreFolder.toURI())); + } else if (restoreFolder.getProtocol().equals("jar")) { + throw new UnsupportedOperationException("Cannot restore from a jar file"); + } + } + + public static void restore(CatalogManager catalogManager, Path opencgaHome, Path restoreFolder) + throws Exception { + + List<String> organizationIds = new ArrayList<>(); + try (Stream<Path> stream = Files.list(restoreFolder)) { + stream.forEach(file -> organizationIds.add(file.getFileName().toString())); + } + if (organizationIds.isEmpty()) { + throw new CatalogDBException("No organization found in the restore folder '" + restoreFolder + "'"); + } + restore(catalogManager, opencgaHome, restoreFolder, organizationIds); + } + + private static void restore(CatalogManager catalogManager, Path opencgaHome, Object source, Collection<String> organizationIds) + throws Exception { + logger.info("Restore opencga from source " + source + " for organizations " + organizationIds); + StopWatch stopWatch = StopWatch.createStarted(); + try (MongoDBAdaptorFactory dbAdaptorFactory = new 
MongoDBAdaptorFactory(catalogManager.getConfiguration(), + catalogManager.getIoManagerFactory())) { + + for (String existingOrganizationId : dbAdaptorFactory.getOrganizationIds()) { // We need to completely remove databases that were not backed up so tests that attempt to create them again don't fail - if (!databaseNames.containsKey(organizationId)) { - logger.info("Completely removing database for organization '{}'", organizationId); - dbAdaptorFactory.getOrganizationMongoDBAdaptorFactory(organizationId).deleteCatalogDB(); + if (!organizationIds.contains(existingOrganizationId)) { + logger.info("Completely removing database for organization '{}'", existingOrganizationId); + dbAdaptorFactory.getOrganizationMongoDBAdaptorFactory(existingOrganizationId).deleteCatalogDB(); } } // First restore the main admin database String adminDBName = dbAdaptorFactory.getOrganizationMongoDBAdaptorFactory(ParamConstants.ADMIN_ORGANIZATION) .getMongoDataStore().getDatabaseName(); - restoreDatabase(catalogManager, opencgaHome, ParamConstants.ADMIN_ORGANIZATION, adminDBName, dbAdaptorFactory); + logger.info("Restoring database for organization '{}'", ParamConstants.ADMIN_ORGANIZATION); + restoreDatabase(catalogManager, opencgaHome, ParamConstants.ADMIN_ORGANIZATION, adminDBName, dbAdaptorFactory, source); - for (Map.Entry<String, String> entry : databaseNames.entrySet()) { - String organizationId = entry.getKey(); + for (String organizationId : organizationIds) { if (!ParamConstants.ADMIN_ORGANIZATION.equals(organizationId)) { - String databaseName = entry.getValue(); - restoreDatabase(catalogManager, opencgaHome, organizationId, databaseName, dbAdaptorFactory); + logger.info("Restoring database for organization '{}'", organizationId); + String databaseName = catalogManager.getCatalogDatabase(organizationId); + restoreDatabase(catalogManager, opencgaHome, organizationId, databaseName, dbAdaptorFactory, source); } } } @@ -133,10 +175,9 @@ } private static void restoreDatabase(CatalogManager catalogManager, Path opencgaHome, String organizationId, String databaseName, - MongoDBAdaptorFactory dbAdaptorFactory) - throws IOException, CatalogIOException, CatalogDBException, URISyntaxException { + MongoDBAdaptorFactory dbAdaptorFactory, Object source) + throws Exception { MongoClient mongoClient = dbAdaptorFactory.getOrganizationMongoDBAdaptorFactory(ParamConstants.ADMIN_ORGANIZATION).getMongoDataStore().getMongoClient(); - MongoDatabase dumpDatabase = mongoClient.getDatabase("test_dump"); Bson emptyBsonQuery = new Document(); MongoDatabase database = mongoClient.getDatabase(databaseName); @@ -152,10 +193,9 @@ for (String collection : OrganizationMongoDBAdaptorFactory.COLLECTIONS_LIST) { MongoCollection<Document> dbCollection = database.getCollection(collection); - MongoCollection<Document> dumpCollection = dumpDatabase.getCollection(organizationId + "__" + collection); dbCollection.deleteMany(emptyBsonQuery); - try (MongoCursor<Document> iterator = dumpCollection.find(emptyBsonQuery).noCursorTimeout(true).iterator()) { + try (CloseableIterator<Document> iterator = documentIterator(source, organizationId, collection)) { List<Document> documentList = new LinkedList<>(); while (iterator.hasNext()) { Document document = iterator.next(); @@ -171,8 +211,8 @@ // Write actual temporal folder in database String uri = document.getString("uri"); - String 
temporalFolder = opencgaHome.getFileName().toString(); - String replacedUri = uri.replace("TEMPORAL_FOLDER_HERE", temporalFolder); + String uriPath = uri.substring(uri.indexOf(TEMPORAL_FOLDER_HERE) + TEMPORAL_FOLDER_HERE.length() + 1); + String replacedUri = opencgaHome.resolve(uriPath).toUri().toString(); document.put("uri", replacedUri); if (OrganizationMongoDBAdaptorFactory.FILE_COLLECTION.equals(collection)) { @@ -198,6 +238,72 @@ private static void restoreDatabase(CatalogManager catalogManager, Path opencgaH } } + private static CloseableIterator<Document> documentIterator(Object source, String organizationId, String collection) + throws IOException { + if (source instanceof MongoDatabase) { + MongoDatabase dumpDatabase = (MongoDatabase) source; + + MongoCollection<Document> dumpCollection = dumpDatabase.getCollection(organizationId + "__" + collection); + logger.info("Restoring {}:{} from database - {}", organizationId, collection, dumpCollection.getNamespace().getFullName()); + return new CloseableIterator<>(dumpCollection.find(new Document()).noCursorTimeout(true).iterator()); + } else if (source instanceof Path) { + Path dir = (Path) source; + java.io.File file = dir.resolve(organizationId).resolve(collection + ".json.gz").toFile(); + if (!file.exists()) { +// logger.info("File {} not found", file); + return new CloseableIterator<>(null, Collections.emptyIterator()); + } + logger.info("Restoring {}:{} from file - {}", organizationId, collection, file); + // Read file lines + Stream<String> stream = new BufferedReader( + new InputStreamReader( + new GZIPInputStream( + Files.newInputStream(file.toPath())))).lines(); + + Iterator<Document> iterator = stream + .filter(s -> !s.isEmpty()) +// .peek(System.out::println) + .map(Document::parse) + .iterator(); + return new CloseableIterator<>(stream, iterator); + } else { + throw new IllegalArgumentException("Unknown restore source type " + source.getClass()); + } + } + + private static class CloseableIterator<T> implements Iterator<T>, AutoCloseable { + + private final AutoCloseable closeable; + private final Iterator<T> iterator; + + public CloseableIterator(AutoCloseable closeable, Iterator<T> iterator) { + this.closeable = closeable; + this.iterator = iterator; + } + + public <CI extends Iterator<T> & Closeable> CloseableIterator(CI c) { + this.closeable = c; + this.iterator = c; + } + + @Override + public void close() throws Exception { + if (closeable != null) { + closeable.close(); + } + } + + @Override + public boolean hasNext() { + return iterator.hasNext(); + } + + @Override + public T next() { + return iterator.next(); + } + } + private static void createFile(IOManager ioManager, Document document) throws IOException, CatalogIOException { String type = document.getString("type"); if (File.Type.FILE.name().equals(type)) { diff --git a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/AbstractManagerTest.java b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/AbstractManagerTest.java index 4bafb0cc4e0..dc857e15965 100644 --- a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/AbstractManagerTest.java +++ b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/AbstractManagerTest.java @@ -54,8 +54,6 @@ import org.opencb.opencga.core.response.OpenCGAResult; import org.opencb.opencga.core.testclassification.duration.MediumTests; -import java.io.IOException; -import java.net.URISyntaxException; import java.nio.file.Path; import java.nio.file.Paths; import java.util.*; @@ -186,7 +184,7 @@ public void setUp() throws Exception { 
setUpCatalogManager(catalogManager); } - public void setUpCatalogManager(CatalogManager catalogManager) throws IOException, CatalogException, URISyntaxException { + public void setUpCatalogManager(CatalogManager catalogManager) throws Exception { if (!firstExecutionFinished) { createDummyData(catalogManager); MongoBackupUtils.dump(catalogManager, catalogManagerResource.getOpencgaHome()); diff --git a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/CatalogManagerExternalResource.java b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/CatalogManagerExternalResource.java index e8259db2c4b..09c94ee33d3 100644 --- a/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/CatalogManagerExternalResource.java +++ b/opencga-catalog/src/test/java/org/opencb/opencga/catalog/managers/CatalogManagerExternalResource.java @@ -124,6 +124,7 @@ public CatalogManager getCatalogManager() { public CatalogManager resetCatalogManager() throws CatalogException { catalogManager.close(); catalogManager = new CatalogManager(configuration); + adminToken = catalogManager.getUserManager().loginAsAdmin(TestParamConstants.ADMIN_PASSWORD).getToken(); return catalogManager; } diff --git a/opencga-client/src/main/R/R/Operation-methods.R b/opencga-client/src/main/R/R/Operation-methods.R index bd581f1ba5e..1c3fc8d7072 100644 --- a/opencga-client/src/main/R/R/Operation-methods.R +++ b/opencga-client/src/main/R/R/Operation-methods.R @@ -44,6 +44,7 @@ #' | configureVariantSecondarySampleIndex | /{apiVersion}/operation/variant/secondary/sample/index/configure | study, skipRebuild, body | #' | secondaryIndexVariant | /{apiVersion}/operation/variant/secondaryIndex | jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, project, study, body | #' | deleteVariantSecondaryIndex | /{apiVersion}/operation/variant/secondaryIndex/delete | jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, study, samples | +#' | setupVariant | /{apiVersion}/operation/variant/setup | study, body | #' | deleteVariantStats | /{apiVersion}/operation/variant/stats/delete | study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | #' | indexVariantStats | /{apiVersion}/operation/variant/stats/index | study, jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, body[*] | #' | deleteVariantStudy | /{apiVersion}/operation/variant/study/delete | jobId, jobDescription, jobDependsOn, jobTags, jobScheduledStartTime, jobPriority, jobDryRun, study, body | @@ -406,6 +407,13 @@ setMethod("operationClient", "OpencgaR", function(OpencgaR, endpointName, params subcategory="variant/secondaryIndex", subcategoryId=NULL, action="delete", params=params, httpMethod="DELETE", as.queryParam=NULL, ...), + #' @section Endpoint /{apiVersion}/operation/variant/setup: + #' Execute Variant Setup to allow using the variant engine. This setup is necessary before starting any variant operation. + #' @param study Study [[organization@]project:]study where study and project can be either the ID or UUID. + #' @param data Variant setup params. + setupVariant=fetchOpenCGA(object=OpencgaR, category="operation", categoryId=NULL, subcategory="variant", + subcategoryId=NULL, action="setup", params=params, httpMethod="POST", as.queryParam=NULL, ...), + #' @section Endpoint /{apiVersion}/operation/variant/stats/delete: #' Deletes the VariantStats of a cohort/s from the database. 
#' @param study Study [[organization@]project:]study where study and project can be either the ID or UUID. diff --git a/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantOperationClient.java b/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantOperationClient.java index e3f43735a7d..97c999d192d 100644 --- a/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantOperationClient.java +++ b/opencga-client/src/main/java/org/opencb/opencga/client/rest/clients/VariantOperationClient.java @@ -43,6 +43,8 @@ import org.opencb.opencga.core.models.operations.variant.VariantStorageMetadataRepairToolParams; import org.opencb.opencga.core.models.operations.variant.VariantStorageMetadataSynchronizeParams; import org.opencb.opencga.core.models.operations.variant.VariantStudyDeleteParams; +import org.opencb.opencga.core.models.study.VariantSetupResult; +import org.opencb.opencga.core.models.variant.VariantSetupParams; import org.opencb.opencga.core.response.RestResponse; @@ -598,6 +600,20 @@ public RestResponse<Job> deleteVariantSecondaryIndex(ObjectMap params) throws Cl return execute("operation", null, "variant/secondaryIndex", null, "delete", params, DELETE, Job.class); } + /** + * Execute Variant Setup to allow using the variant engine. This setup is necessary before starting any variant operation. + * @param data Variant setup params. + * @param params Map containing any of the following optional parameters. + * study: Study [[organization@]project:]study where study and project can be either the ID or UUID. + * @return a RestResponse object. + * @throws ClientException ClientException if there is any server error. + */ + public RestResponse<VariantSetupResult> setupVariant(VariantSetupParams data, ObjectMap params) throws ClientException { + params = params != null ? params : new ObjectMap(); + params.put("body", data); + return execute("operation", null, "variant", null, "setup", params, POST, VariantSetupResult.class); + } + /** * Deletes the VariantStats of a cohort/s from the database. * @param data Variant stats delete params. diff --git a/opencga-client/src/main/javascript/VariantOperation.js b/opencga-client/src/main/javascript/VariantOperation.js index 0d79024c496..feb3959ec70 100644 --- a/opencga-client/src/main/javascript/VariantOperation.js +++ b/opencga-client/src/main/javascript/VariantOperation.js @@ -488,6 +488,16 @@ export default class VariantOperation extends OpenCGAParentClass { return this._delete("operation", null, "variant/secondaryIndex", null, "delete", params); } + /** Execute Variant Setup to allow using the variant engine. This setup is necessary before starting any variant operation. + * @param {Object} [data] - Variant setup params. + * @param {Object} [params] - The Object containing the following optional parameters: + * @param {String} [params.study] - Study [[organization@]project:]study where study and project can be either the ID or UUID. + * @returns {Promise} Promise object in the form of RestResponse instance. + */ + setupVariant(data, params) { + return this._post("operation", null, "variant", null, "setup", data, params); + } + /** Deletes the VariantStats of a cohort/s from the database * @param {Object} data - Variant stats delete params. 
* @param {Object} [params] - The Object containing the following optional parameters: diff --git a/opencga-client/src/main/python/pyopencga/rest_clients/variant_operation_client.py b/opencga-client/src/main/python/pyopencga/rest_clients/variant_operation_client.py index 98dfc7d4c25..f764a18198d 100644 --- a/opencga-client/src/main/python/pyopencga/rest_clients/variant_operation_client.py +++ b/opencga-client/src/main/python/pyopencga/rest_clients/variant_operation_client.py @@ -628,6 +628,19 @@ def delete_variant_secondary_index(self, **options): return self._delete(category='operation', resource='delete', subcategory='variant/secondaryIndex', **options) + def setup_variant(self, data=None, **options): + """ + Execute Variant Setup to allow using the variant engine. This setup is + necessary before starting any variant operation. + PATH: /{apiVersion}/operation/variant/setup + + :param str study: Study [[organization@]project:]study where study and + project can be either the ID or UUID. + :param dict data: Variant setup params. + """ + + return self._post(category='operation', resource='setup', subcategory='variant', data=data, **options) + def delete_variant_stats(self, data=None, **options): """ Deletes the VariantStats of a cohort/s from the database. diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java index a14652c67cd..e37374e76ea 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/common/IOUtils.java @@ -375,6 +375,17 @@ public static String humanReadableByteCount(long bytes, boolean si) { * @return number of bytes */ public static long fromHumanReadableToByte(String value) { + return fromHumanReadableToByte(value, false); + } + + /** + * Get the number of bytes from a human-readable string + * + * @param value Human-readable value + * @param assumeBinary Use binary units (powers of 2) + * @return number of bytes + */ + public static long fromHumanReadableToByte(String value, boolean assumeBinary) { if (value.endsWith("B")) { value = value.substring(0, value.length() - 1); } @@ -385,8 +396,11 @@ public static long fromHumanReadableToByte(String value) { } else { si = true; } + if (assumeBinary) { + si = false; + } int unit = si ? 1000 : 1024; - int exp = (si ? 
"kMGTPE" : "KMGTPE").indexOf(value.charAt(value.length() - 1)) + 1; + int exp = "KMGTPE".indexOf(value.toUpperCase().charAt(value.length() - 1)) + 1; if (exp > 0) { value = value.substring(0, value.length() - 1); } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/study/StudyVariantEngineConfiguration.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/study/StudyVariantEngineConfiguration.java index 3bd986b2fb4..a0a47596cd1 100644 --- a/opencga-core/src/main/java/org/opencb/opencga/core/models/study/StudyVariantEngineConfiguration.java +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/study/StudyVariantEngineConfiguration.java @@ -1,5 +1,6 @@ package org.opencb.opencga.core.models.study; +import org.opencb.commons.annotations.DataField; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.core.config.storage.SampleIndexConfiguration; @@ -8,6 +9,9 @@ public class StudyVariantEngineConfiguration { private ObjectMap options; private SampleIndexConfiguration sampleIndex; + @DataField(description = "Variant setup run", since = "3.2.0") + private VariantSetupResult setup; + public StudyVariantEngineConfiguration() { } @@ -34,11 +38,21 @@ public StudyVariantEngineConfiguration setSampleIndex(SampleIndexConfiguration s return this; } + public VariantSetupResult getSetup() { + return setup; + } + + public StudyVariantEngineConfiguration setSetup(VariantSetupResult setup) { + this.setup = setup; + return this; + } + @Override public String toString() { final StringBuilder sb = new StringBuilder("StudyVariantEngineConfiguration{"); - // sb.append("options=").append(options != null ? options.toJson() : ""); + sb.append("options=").append(options != null ? options.toJson() : ""); sb.append(", sampleIndex=").append(sampleIndex); + sb.append(", setup=").append(setup); sb.append('}'); return sb.toString(); } diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/study/VariantSetupResult.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/study/VariantSetupResult.java new file mode 100644 index 00000000000..a1127f873f8 --- /dev/null +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/study/VariantSetupResult.java @@ -0,0 +1,74 @@ +package org.opencb.opencga.core.models.study; + +import org.opencb.commons.annotations.DataField; +import org.opencb.commons.datastore.core.ObjectMap; + +public class VariantSetupResult { + + @DataField(description = "User ID that started the setup run") + private String userId; + @DataField(description = "Date when the variant setup was executed") + private String date; + @DataField(description = "Variant setup status") + private Status status; + @DataField(description = "Input params for the variant setup") + private ObjectMap params; + + @DataField(description = "Generated variant storage configuration options given the input params.") + private ObjectMap options; + + public enum Status { + READY, + NOT_READY + } + + public VariantSetupResult() { + } + + public String getUserId() { + return userId; + } + + public VariantSetupResult setUserId(String userId) { + this.userId = userId; + return this; + } + + public String getDate() { + return date; + } + + public VariantSetupResult setDate(String date) { + this.date = date; + return this; + } + + public Status getStatus() { + return status; + } + + public VariantSetupResult setStatus(Status status) { + this.status = status; + return this; + } + + public ObjectMap getParams() { + return params; + } + + public 
VariantSetupResult setParams(ObjectMap params) { + this.params = params; + return this; + } + + public ObjectMap getOptions() { + return options; + } + + public VariantSetupResult setOptions(ObjectMap options) { + this.options = options; + return this; + } + + +} diff --git a/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantSetupParams.java b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantSetupParams.java new file mode 100644 index 00000000000..1a97c0de539 --- /dev/null +++ b/opencga-core/src/main/java/org/opencb/opencga/core/models/variant/VariantSetupParams.java @@ -0,0 +1,173 @@ +package org.opencb.opencga.core.models.variant; + +import org.opencb.commons.annotations.DataField; +import org.opencb.opencga.core.tools.ToolParams; + +import java.util.List; + +public class VariantSetupParams extends ToolParams { + + @DataField(description = "Expected number of samples that will be loaded. Used to infer some parameters. " + + "This number is only used as a hint. " + + "If the real number of samples is different, or if it grows beyond expectation, the loader should be able to handle it.", + required = true) + private Integer expectedSamples; + + @DataField(description = "Expected number of files that will be loaded. Used to infer some parameters. " + + "This number is only used as a hint. " + + "If the real number of files is different, the loader should be able to handle it.", required = true) + private Integer expectedFiles; + + @DataField(description = "Main type of the files that will be loaded. If the dataset contains multiple types of files," + + " provide the one that matches most of the files.") + private FileType fileType; + + @DataField(description = "Average size of the files that will be loaded. This number is only used as a hint. " + + "If the real size of the files is different, the loader should be able to handle it. Accepts units. e.g. 435MB, 2GB, 86KB. " + + "If not provided, the value will be inferred from the file type.") + private String averageFileSize; + + @DataField(description = "Number of variants per sample (non hom_ref variants). This number is only used as a hint. " + + "If the real number of variants per sample is different, the loader should be able to handle it. " + + "If not provided, the value will be inferred from the file type.") + private Integer variantsPerSample; + + @DataField(description = "Average number of samples per file. This number is only used as a hint. " + + "If the real number of samples per file is different, the loader should be able to handle it. " + + "If not provided, the value will be inferred from the expectedSamples and expectedFiles and dataDistribution.") + private Float averageSamplesPerFile; + + @DataField(description = "Data distribution of the files. 
This parameter is used to infer the number of samples per file.") + private DataDistribution dataDistribution; + + @DataField(description = "List of normalization extensions that will be used to normalize the files.") + private List<String> normalizeExtensions; + + public VariantSetupParams(VariantSetupParams params) { + this.expectedSamples = params.expectedSamples; + this.expectedFiles = params.expectedFiles; + this.fileType = params.fileType; + this.averageFileSize = params.averageFileSize; + this.variantsPerSample = params.variantsPerSample; + this.averageSamplesPerFile = params.averageSamplesPerFile; + this.dataDistribution = params.dataDistribution; + this.normalizeExtensions = params.normalizeExtensions; + } + + public VariantSetupParams() { + } + + public enum DataDistribution { + // Single sample VCF files. One file per sample. + // e.g. + // - Platinum gVCF + // - Cancer germline + // - RD germline without family calling + @DataField(description = "Single sample VCF files. One file per sample. e.g. Platinum gVCF, Cancer germline, RD germline without family calling") + SINGLE_SAMPLE_PER_FILE, + + // Multi samples VCF files. One file with multiple samples. + // e.g. + // - Corpasome + // - RD germline with family calling + @DataField(description = "Multi samples VCF files. One file with multiple samples. e.g. Corpasome, RD germline with family calling") + MULTIPLE_SAMPLES_PER_FILE, + + // Multiple files per sample. Each file might have multiple samples. + // e.g. + // - Somatic study with multiple callers + @DataField(description = "Multiple files per sample. Each file might have multiple samples. e.g. Somatic study with multiple callers") + MULTIPLE_FILES_PER_SAMPLE, + + // Large aggregated/joined/merged files. Each file has all samples. Each file contains a specific set of chromosomes. + // e.g. + // - 1000 genomes + @DataField(description = "Large aggregated/joined/merged files. Each file has all samples. Each file contains a specific set of chromosomes. e.g. 1000 genomes") + FILES_SPLIT_BY_CHROMOSOME, + + // Large aggregated/joined/merged files. Each file has all samples. Each file contains an arbitrary region. + @DataField(description = "Large aggregated/joined/merged files. Each file has all samples. 
Each file contains an arbitrary region.") + FILES_SPLIT_BY_REGION, + } + + public enum FileType { + @DataField(description = "Whole genome VCF file.") + GENOME_VCF, + @DataField(description = "Whole genome gVCF file.") + GENOME_gVCF, + @DataField(description = "Exome VCF file.") + EXOME + } + + public Integer getExpectedSamples() { + return expectedSamples; + } + + public VariantSetupParams setExpectedSamples(Integer expectedSamples) { + this.expectedSamples = expectedSamples; + return this; + } + + public Integer getExpectedFiles() { + return expectedFiles; + } + + public VariantSetupParams setExpectedFiles(Integer expectedFiles) { + this.expectedFiles = expectedFiles; + return this; + } + + public FileType getFileType() { + return fileType; + } + + public VariantSetupParams setFileType(FileType fileType) { + this.fileType = fileType; + return this; + } + + public String getAverageFileSize() { + return averageFileSize; + } + + public VariantSetupParams setAverageFileSize(String averageFileSize) { + this.averageFileSize = averageFileSize; + return this; + } + + public Integer getVariantsPerSample() { + return variantsPerSample; + } + + public VariantSetupParams setVariantsPerSample(Integer variantsPerSample) { + this.variantsPerSample = variantsPerSample; + return this; + } + + public Float getAverageSamplesPerFile() { + return averageSamplesPerFile; + } + + public VariantSetupParams setAverageSamplesPerFile(Float averageSamplesPerFile) { + this.averageSamplesPerFile = averageSamplesPerFile; + return this; + } + + public DataDistribution getDataDistribution() { + return dataDistribution; + } + + public VariantSetupParams setDataDistribution(DataDistribution dataDistribution) { + this.dataDistribution = dataDistribution; + return this; + } + + public List<String> getNormalizeExtensions() { + return normalizeExtensions; + } + + public VariantSetupParams setNormalizeExtensions(List<String> normalizeExtensions) { + this.normalizeExtensions = normalizeExtensions; + return this; + } +} diff --git a/opencga-server/src/main/java/org/opencb/opencga/server/rest/operations/VariantOperationWebService.java b/opencga-server/src/main/java/org/opencb/opencga/server/rest/operations/VariantOperationWebService.java index a2143f45090..14fb051c85c 100644 --- a/opencga-server/src/main/java/org/opencb/opencga/server/rest/operations/VariantOperationWebService.java +++ b/opencga-server/src/main/java/org/opencb/opencga/server/rest/operations/VariantOperationWebService.java @@ -29,6 +29,7 @@ import org.opencb.opencga.core.exceptions.VersionException; import org.opencb.opencga.core.models.job.Job; import org.opencb.opencga.core.models.operations.variant.*; +import org.opencb.opencga.core.models.study.VariantSetupResult; import org.opencb.opencga.core.models.variant.*; import org.opencb.opencga.core.tools.ToolParams; import org.opencb.opencga.core.tools.annotations.Api; @@ -101,6 +102,23 @@ public Response variantConfigure( }); } + @POST + @Path("/variant/setup") + @ApiOperation(value = "Execute Variant Setup to allow using the variant engine. 
This setup is necessary before starting any variant operation.", + response = VariantSetupResult.class) + public Response variantSetup( + @ApiParam(value = ParamConstants.STUDY_DESCRIPTION) @QueryParam(ParamConstants.STUDY_PARAM) String study, + @ApiParam(value = "Variant setup params") VariantSetupParams params) { + return run(() -> { + StopWatch stopWatch = StopWatch.createStarted(); + VariantSetupResult result = variantManager.variantSetup(study, params, token); + return new DataResult<>() + .setResults(Collections.singletonList(result)) + .setNumResults(1) + .setTime(((int) stopWatch.getTime(TimeUnit.MILLISECONDS))); + }); + } + @POST + @Path("/variant/index") + @ApiOperation(value = VariantIndexOperationTool.DESCRIPTION, response = Job.class) diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java index b541b2f4ae4..77327d9d76a 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageEngine.java @@ -24,13 +24,16 @@ import org.opencb.biodata.models.variant.avro.VariantAnnotation; import org.opencb.biodata.models.variant.metadata.SampleVariantStats; import org.opencb.biodata.models.variant.metadata.VariantMetadata; +import org.opencb.biodata.tools.variant.normalizer.extensions.VariantNormalizerExtensionFactory; import org.opencb.cellbase.client.config.ClientConfiguration; import org.opencb.cellbase.client.rest.CellBaseClient; import org.opencb.commons.datastore.core.*; +import org.opencb.opencga.core.api.ParamConstants; import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.core.config.storage.StorageConfiguration; import org.opencb.opencga.core.models.operations.variant.VariantAggregateFamilyParams; import org.opencb.opencga.core.models.operations.variant.VariantAggregateParams; +import org.opencb.opencga.core.models.variant.VariantSetupParams; import org.opencb.opencga.storage.core.variant.query.VariantQueryResult; import org.opencb.opencga.storage.core.StorageEngine; import org.opencb.opencga.storage.core.StoragePipelineResult; @@ -153,7 +156,12 @@ public static SplitData from(ObjectMap options) { String loadSplitDataStr = options.getString(LOAD_SPLIT_DATA.key()); boolean multiFile = options.getBoolean(LOAD_MULTI_FILE_DATA.key()); if (StringUtils.isNotEmpty(loadSplitDataStr) && multiFile) { - throw new IllegalArgumentException("Unable to mix loadSplitFile and loadMultiFile"); + if (loadSplitDataStr.equalsIgnoreCase("multi")) { + return MULTI; + } else { + throw new IllegalArgumentException("Unable to mix " + LOAD_MULTI_FILE_DATA.key() + "=true and " + + LOAD_SPLIT_DATA.key() + "='" + loadSplitDataStr + "'"); + } } if (StringUtils.isEmpty(loadSplitDataStr) && !multiFile) { return null; @@ -1498,6 +1506,45 @@ public VariantAggregationExecutor getVariantAggregationExecutor(Query query, Que throw new VariantQueryException("No VariantAggregationExecutor found to run the query. 
" + messages).setQuery(query); } + public ObjectMap inferConfigurationParams(VariantSetupParams params) { + ObjectMap options = new ObjectMap(); + + List<String> normalizeExtensions = params.getNormalizeExtensions(); + if (normalizeExtensions != null && !normalizeExtensions.isEmpty()) { + if (!normalizeExtensions.equals(Collections.singletonList(ParamConstants.ALL))) { + List<String> unsupportedExtensions = new ArrayList<>(); + for (String normalizeExtension : normalizeExtensions) { + if (!VariantNormalizerExtensionFactory.ALL_EXTENSIONS.contains(normalizeExtension)) { + unsupportedExtensions.add(normalizeExtension); + } + } + if (!unsupportedExtensions.isEmpty()) { + throw new IllegalArgumentException("Unsupported normalize extensions: " + unsupportedExtensions + ". Supported " + + "extensions are: " + VariantNormalizerExtensionFactory.ALL_EXTENSIONS); + } + } + options.put(NORMALIZATION_EXTENSIONS.key(), normalizeExtensions); + } + if (params.getDataDistribution() != null) { + switch (params.getDataDistribution()) { + case FILES_SPLIT_BY_CHROMOSOME: + options.put(LOAD_SPLIT_DATA.key(), SplitData.CHROMOSOME); + break; + case FILES_SPLIT_BY_REGION: + options.put(LOAD_SPLIT_DATA.key(), SplitData.REGION); + break; + case MULTIPLE_FILES_PER_SAMPLE: + options.put(LOAD_MULTI_FILE_DATA.key(), true); + options.put(LOAD_SPLIT_DATA.key(), SplitData.MULTI); + break; + default: + break; + } + } + + return options; + } + @Override + public void close() throws IOException { + cellBaseUtils = null; diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java index bc1be055466..847ae860d70 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java +++ b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStorageOptions.java @@ -1,10 +1,11 @@ package org.opencb.opencga.storage.core.variant; import org.opencb.biodata.models.variant.metadata.Aggregation; -import org.opencb.opencga.core.api.ParamConstants; import org.opencb.opencga.core.common.YesNoAuto; import org.opencb.opencga.core.config.ConfigurationOption; +import java.util.Arrays; + public enum VariantStorageOptions implements ConfigurationOption { STUDY("study"), @@ -26,7 +27,7 @@ public enum VariantStorageOptions implements ConfigurationOption { TRANSFORM_ISOLATE("transform.isolate", false), // Do not store file in metadata NORMALIZATION_SKIP("normalization.skip", false), // Do not run normalization NORMALIZATION_REFERENCE_GENOME("normalization.referenceGenome"), - NORMALIZATION_EXTENSIONS("normalization.extensions", ParamConstants.NONE), + NORMALIZATION_EXTENSIONS("normalization.extensions", Arrays.asList("VAF", "SV", "CUSTOM")), DEDUPLICATION_POLICY("deduplication.policy", "maxQual"), DEDUPLICATION_BUFFER_SIZE("deduplication.bufferSize", 100), diff --git a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStoragePipeline.java b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStoragePipeline.java index 722d79e59fd..34bbd5cfe5a 100644 --- a/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStoragePipeline.java +++ 
b/opencga-storage/opencga-storage-core/src/main/java/org/opencb/opencga/storage/core/variant/VariantStoragePipeline.java @@ -75,6 +75,7 @@ import java.io.UncheckedIOException; import java.net.URI; import java.nio.ByteBuffer; +import java.nio.file.Paths; import java.util.*; import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicInteger; @@ -421,15 +422,22 @@ protected Task<Variant, Variant> initNormalizer(VariantFileMetadata metadata) th .then(new VariantSorterTask(100)) // Sort before generating reference blocks .then(new VariantReferenceBlockCreatorTask(metadata.getHeader())); } - if (CollectionUtils.isNotEmpty(enabledExtensions)) { + if (CollectionUtils.isEmpty(enabledExtensions)) { + enabledExtensions = NORMALIZATION_EXTENSIONS.defaultValue(); + } + if ((enabledExtensions.size() == 1 && enabledExtensions.contains(ParamConstants.NONE))) { + logger.info("Skip normalization extensions"); + } else { + logger.info("Enable normalization extensions: {}", enabledExtensions); VariantNormalizerExtensionFactory extensionFactory; if (enabledExtensions.size() == 1 && enabledExtensions.contains(ParamConstants.ALL)) { - extensionFactory = new VariantNormalizerExtensionFactory(); - } else { - extensionFactory = new VariantNormalizerExtensionFactory(new HashSet<>(enabledExtensions)); + enabledExtensions = NORMALIZATION_EXTENSIONS.defaultValue(); } + extensionFactory = new VariantNormalizerExtensionFactory(new HashSet<>(enabledExtensions)); Task<Variant, Variant> extension = extensionFactory.buildExtensions(metadata); - if (extension != null) { + if (extension == null) { + logger.info("No normalization extensions can be used."); + } else { normalizer = normalizer.then(extension); } } @@ -558,17 +566,19 @@ public URI preLoad(URI input, URI output) throws StorageEngineException { return input; } - protected void preLoadRegisterAndValidateFile(int studyId, VariantFileMetadata fileMetadata) throws StorageEngineException { + protected void preLoadRegisterAndValidateFile(int studyId, VariantFileMetadata variantFileMetadata) throws StorageEngineException { final int fileId; String virtualFile = options.getString(LOAD_VIRTUAL_FILE.key()); + boolean loadSampleIndex = YesNoAuto.parse(options, LOAD_SAMPLE_INDEX.key()).orYes().booleanValue(); + VariantStorageEngine.SplitData splitData = VariantStorageEngine.SplitData.from(options); - if (VariantStorageEngine.SplitData.isPartialSplit(options)) { + if (VariantStorageEngine.SplitData.isPartialSplit(splitData)) { if (StringUtils.isEmpty(virtualFile)) { - fileId = getMetadataManager().registerFile(studyId, fileMetadata); + fileId = getMetadataManager().registerFile(studyId, variantFileMetadata); // throw new StorageEngineException("Unable to load file with 'split-data'. Missing virtual file belonging! 
" // + "Please, define " + LOAD_VIRTUAL_FILE.key()); } else { - fileId = getMetadataManager().registerPartialFile(studyId, virtualFile, fileMetadata); + fileId = getMetadataManager().registerPartialFile(studyId, virtualFile, variantFileMetadata); } } else { if (StringUtils.isNotEmpty(virtualFile)) { @@ -577,10 +587,85 @@ protected void preLoadRegisterAndValidateFile(int studyId, VariantFileMetadata f + " to " + VariantStorageEngine.SplitData.REGION + " or " + VariantStorageEngine.SplitData.CHROMOSOME); } else { - fileId = getMetadataManager().registerFile(studyId, fileMetadata); + fileId = getMetadataManager().registerFile(studyId, variantFileMetadata); } } setFileId(fileId); + FileMetadata fileMetadata = getMetadataManager().getFileMetadata(studyId, getFileId()); + + int version = getMetadataManager().getStudyMetadata(studyId).getSampleIndexConfigurationLatest().getVersion(); + Set<String> alreadyIndexedSamples = new LinkedHashSet<>(); + Set<Integer> processedSamples = new LinkedHashSet<>(); + Set<Integer> samplesWithoutSplitData = new LinkedHashSet<>(); + for (String sample : variantFileMetadata.getSampleIds()) { + Integer sampleId = getMetadataManager().getSampleId(studyId, sample); + SampleMetadata sampleMetadata = getMetadataManager().getSampleMetadata(studyId, sampleId); + if (splitData != null && sampleMetadata.getSplitData() != null) { + if (splitData != sampleMetadata.getSplitData()) { + throw new StorageEngineException("Incompatible split data methods. " + + "Unable to mix requested " + splitData + + " with existing " + sampleMetadata.getSplitData()); + } + } + if (sampleMetadata.isIndexed()) { + if (sampleMetadata.getFiles().size() == 1 && sampleMetadata.getFiles().contains(fileMetadata.getId())) { + // It might happen that the sample is marked as INDEXED, but not the file. + // If the sample only belongs to this file (i.e. 
its only file is this one), then overwrite + // the current sample metadata index status with the file's index status + sampleMetadata = getMetadataManager().updateSampleMetadata(studyId, sampleId, + sm -> sm.setIndexStatus(fileMetadata.getIndexStatus())); + } + } + if (sampleMetadata.isIndexed()) { + alreadyIndexedSamples.add(sample); + if (sampleMetadata.isAnnotated() + || !loadSampleIndex && sampleMetadata.getSampleIndexStatus(version) == TaskMetadata.Status.READY + || sampleMetadata.getSampleIndexAnnotationStatus(version) == TaskMetadata.Status.READY + || sampleMetadata.getFamilyIndexStatus(version) == TaskMetadata.Status.READY + || sampleMetadata.isFamilyIndexDefined()) { + processedSamples.add(sampleMetadata.getId()); + } + } + + if (splitData != null && splitData != sampleMetadata.getSplitData()) { + samplesWithoutSplitData.add(sampleId); + } + } + + if (!alreadyIndexedSamples.isEmpty()) { + if (splitData != null) { + logger.info("Loading split data"); + } else { + String fileName = Paths.get(variantFileMetadata.getPath()).getFileName().toString(); + throw StorageEngineException.alreadyLoadedSamples(fileName, new ArrayList<>(alreadyIndexedSamples)); + } + for (Integer sampleId : processedSamples) { + getMetadataManager().updateSampleMetadata(studyId, sampleId, sampleMetadata -> { + if (!loadSampleIndex) { + for (Integer v : sampleMetadata.getSampleIndexVersions()) { + sampleMetadata.setSampleIndexStatus(TaskMetadata.Status.NONE, v); + } + } + for (Integer v : sampleMetadata.getSampleIndexAnnotationVersions()) { + sampleMetadata.setSampleIndexAnnotationStatus(TaskMetadata.Status.NONE, v); + } + for (Integer v : sampleMetadata.getFamilyIndexVersions()) { + sampleMetadata.setFamilyIndexStatus(TaskMetadata.Status.NONE, v); + } + sampleMetadata.setAnnotationStatus(TaskMetadata.Status.NONE); + sampleMetadata.setMendelianErrorStatus(TaskMetadata.Status.NONE); + }); + } + } + + if (splitData != null) { + // Register loadSplitData + for (Integer sampleId : samplesWithoutSplitData) { + getMetadataManager().updateSampleMetadata(studyId, sampleId, sampleMetadata -> { + sampleMetadata.setSplitData(splitData); + }); + } + } } /** diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/VariantStorageBaseTest.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/VariantStorageBaseTest.java index fe5313bdc80..23ece394c79 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/VariantStorageBaseTest.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/VariantStorageBaseTest.java @@ -374,6 +374,11 @@ public static StoragePipelineResult runDefaultETL(URI inputUri, VariantStorageEn return storagePipelineResult; } + public static StoragePipelineResult runETL(VariantStorageEngine variantStorageManager, URI inputUri, URI outputUri, + ObjectMap params) throws StorageEngineException, FileFormatException, IOException { + return runETL(variantStorageManager, inputUri, outputUri, params, true, true, true); + } + public static StoragePipelineResult runETL(VariantStorageEngine variantStorageManager, URI inputUri, URI outputUri, ObjectMap params, boolean doExtract, diff --git a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java index 
55866e24160..55903d221f2 100644 --- a/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-core/src/test/java/org/opencb/opencga/storage/core/variant/dummy/DummyVariantStorageEngine.java @@ -84,6 +84,10 @@ public static void configure(StorageEngineFactory factory, boolean clear) { } } + public static VariantStorageMetadataManager getVariantMetadataManager() { + return new VariantStorageMetadataManager(new DummyVariantStorageMetadataDBAdaptorFactory()); + } + @Override public String getStorageEngineId() { return STORAGE_ENGINE_ID; diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/pom.xml index 740b4bb3101..ca935dae3fa 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/pom.xml @@ -43,6 +43,12 @@ ${hbase.version} provided + + org.apache.hbase + hbase-common + ${hbase.version} + provided + org.apache.hadoop hadoop-common diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompatApi.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompatApi.java index 4bfb6cd45fd..48ab8846f65 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompatApi.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-api/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompatApi.java @@ -1,9 +1,13 @@ package org.opencb.opencga.storage.hadoop; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.Table; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompatApi; import java.io.IOException; +import java.util.List; public abstract class HBaseCompatApi { @@ -28,4 +32,8 @@ public static HBaseCompatApi getInstance() { public abstract void available(Configuration configuration) throws IOException; public abstract boolean isSolrTestingAvailable(); + + public abstract List getServerList(Admin admin) throws IOException; + + public abstract byte[][] getTableStartKeys(Admin admin, Table table) throws IOException; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/pom.xml index d8dd4f61336..241ecd7a87b 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/pom.xml @@ -49,6 +49,12 @@ ${hbase.version} provided + + org.apache.hbase + hbase-common + ${hbase.version} + provided + org.apache.hadoop hadoop-common diff --git 
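For context, both new compat methods are reached through HBaseCompatApi.getInstance(), which keeps callers independent of the concrete HBase client version. A minimal usage sketch, not part of the patch: it assumes an already-open HBase Connection, and the table name is illustrative.

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Table;
import org.opencb.opencga.storage.hadoop.HBaseCompatApi;

// Sketch only: count the regions of a table through the version-agnostic compat layer.
public class HBaseCompatUsageSketch {
    public static int countRegions(Connection connection, String tableNameStr) throws IOException {
        HBaseCompatApi compat = HBaseCompatApi.getInstance();
        try (Admin admin = connection.getAdmin();
             Table table = connection.getTable(TableName.valueOf(tableNameStr))) {
            // One start key per region, so the array length equals the region count.
            return compat.getTableStartKeys(admin, table).length;
        }
    }
}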
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java index f3f9170f6f7..bb74dabf4c6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.0/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java @@ -1,11 +1,18 @@ package org.opencb.opencga.storage.hadoop; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.util.Bytes; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompat; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompatApi; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; public class HBaseCompat extends HBaseCompatApi { @@ -24,4 +31,18 @@ public PhoenixCompatApi getPhoenixCompat() { return new PhoenixCompat(); } + @Override + public List getServerList(Admin admin) throws IOException { + return new ArrayList<>(admin.getClusterStatus().getServers()); + } + + public byte[][] getTableStartKeys(Admin admin, Table table) throws IOException { + List regions = admin.getRegions(table.getName()); + regions.sort((o1, o2) -> Bytes.compareTo(o1.getStartKey(), o2.getStartKey())); + byte[][] startKeys = new byte[regions.size()][]; + for (int i = 0; i < regions.size(); i++) { + startKeys[i] = regions.get(i).getStartKey(); + } + return startKeys; + } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/pom.xml index c24d2e7072b..3fdd2a30431 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/pom.xml @@ -49,6 +49,12 @@ ${hbase.version} provided + + org.apache.hbase + hbase-common + ${hbase.version} + provided + org.apache.hadoop hadoop-common diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java index 451238f6da5..f7cf534508a 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.2/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java @@ -1,11 +1,18 @@ package org.opencb.opencga.storage.hadoop; 
import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.Table; +import org.apache.hadoop.hbase.util.Bytes; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompat; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompatApi; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; public class HBaseCompat extends HBaseCompatApi { @@ -22,4 +29,20 @@ public boolean isSolrTestingAvailable() { public PhoenixCompatApi getPhoenixCompat() { return new PhoenixCompat(); } + + @Override + public List getServerList(Admin admin) throws IOException { + return new ArrayList<>(admin.getClusterMetrics().getServersName()); + } + + @Override + public byte[][] getTableStartKeys(Admin admin, Table table) throws IOException { + List regions = admin.getRegions(table.getName()); + regions.sort((o1, o2) -> Bytes.compareTo(o1.getStartKey(), o2.getStartKey())); + byte[][] startKeys = new byte[regions.size()][]; + for (int i = 0; i < regions.size(); i++) { + startKeys[i] = regions.get(i).getStartKey(); + } + return startKeys; + } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/pom.xml index 2bf4db839f0..112516d4f72 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/pom.xml @@ -49,6 +49,12 @@ ${hbase.version} provided + + org.apache.hbase + hbase-common + ${hbase.version} + provided + org.apache.hadoop hadoop-common diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java index 93f61922b9f..194b47b7794 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/opencga-storage-hadoop-compat-hbase2.4/src/main/java/org/opencb/opencga/storage/hadoop/HBaseCompat.java @@ -1,11 +1,16 @@ package org.opencb.opencga.storage.hadoop; import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.client.Admin; import org.apache.hadoop.hbase.client.HBaseAdmin; +import org.apache.hadoop.hbase.client.Table; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompat; import org.opencb.opencga.storage.hadoop.variant.annotation.phoenix.PhoenixCompatApi; import java.io.IOException; +import java.util.ArrayList; +import java.util.List; public class HBaseCompat extends HBaseCompatApi { @@ -24,4 +29,13 @@ public PhoenixCompatApi getPhoenixCompat() { return new PhoenixCompat(); } + @Override + public List getServerList(Admin admin) throws IOException { + return new 
ArrayList<>(admin.getClusterMetrics().getServersName()); + } + + @Override + public byte[][] getTableStartKeys(Admin admin, Table table) throws IOException { + return table.getRegionLocator().getStartKeys(); + } } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/pom.xml b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/pom.xml index 0dd166e460a..1b1393dfceb 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/pom.xml +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-compat/pom.xml @@ -45,6 +45,12 @@ ${hbase.version} provided + + org.apache.hbase + hbase-common + ${hbase.version} + provided + org.apache.phoenix phoenix-core diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HBaseMain.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HBaseMain.java index 509a758f268..7e80e787dea 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HBaseMain.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/app/HBaseMain.java @@ -3,18 +3,19 @@ import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.lang3.ArrayUtils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.*; -import org.apache.hadoop.hbase.client.Admin; -import org.apache.hadoop.hbase.client.Connection; -import org.apache.hadoop.hbase.client.SnapshotDescription; -import org.apache.hadoop.hbase.client.TableState; +import org.apache.hadoop.hbase.client.*; import org.apache.hadoop.hbase.exceptions.IllegalArgumentIOException; +import org.apache.hadoop.hbase.util.Bytes; +import org.apache.hadoop.hbase.util.Pair; import org.apache.tools.ant.types.Commandline; import org.opencb.commons.ProgressLogger; import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.core.config.storage.StorageConfiguration; +import org.opencb.opencga.storage.hadoop.HBaseCompat; import org.opencb.opencga.storage.hadoop.utils.HBaseManager; import org.opencb.opencga.storage.hadoop.variant.HadoopVariantStorageEngine; import org.opencb.opencga.storage.hadoop.variant.executors.MRExecutor; @@ -78,9 +79,15 @@ public void run(String[] args) throws Exception { case MOVE_TABLE_REGIONS: moveTableRegions(args); break; - case BALANCE_TABLE_REGIONS: - balanceTableRegions(getArg(args, 1), getArgsMap(args, 2, "maxMoves")); + case BALANCE_TABLE_REGIONS: { + ObjectMap argsMap = getArgsMap(args, 2, "maxMoves", "dryRun", "ignoreExceptions", "maxRetries"); + balanceTableRegions(getArg(args, 1), + argsMap.getInt("maxMoves", 50000), + argsMap.getBoolean("dryRun", false), + argsMap.getBoolean("ignoreExceptions", false), + argsMap.getInt("maxRetries", 1)); break; + } case "tables": case LIST_TABLES: print(listTables(getArg(args, 1, "")).stream().map(TableName::getNameWithNamespaceInclAsString).iterator()); @@ -197,8 +204,10 @@ public void run(String[] args) throws Exception { System.out.println(" (see " + CHECK_TABLES_WITH_REGIONS_ON_DEAD_SERVERS + ") by creating a temporary snapshot"); System.out.println(" " + MOVE_TABLE_REGIONS + " "); System.out.println(" Move all regions from selected 
tables to new random nodes."); -// System.out.println(" " + BALANCE_TABLE_REGIONS + " [--maxMoves N]"); // FIXME -// System.out.println(" " + REGIONS_PER_TABLE + " "); // FIXME + System.out.println(" " + BALANCE_TABLE_REGIONS + " [--maxMoves N]" + + " [--maxRetries 1] [--ignoreExceptions]" + + " [--dryRun]"); + System.out.println(" " + REGIONS_PER_TABLE + " "); // FIXME System.out.println(" " + CLONE_TABLES + " " + "[--keepSnapshots] [--dryRun] [--snapshotSuffix ]"); System.out.println(" Clone all selected tables by creating an intermediate snapshot."); @@ -207,8 +216,8 @@ public void run(String[] args) throws Exception { + "[--skipTablesWithSnapshot]"); System.out.println(" " + SNAPSHOT_TABLE + " [--dryRun] [--snapshotSuffix ] " + "[--skipTablesWithSnapshot]"); - System.out.println(" " + DELETE_SNAPSHOTS + " [--dryRun] [--skipMissing]"); System.out.println(" Create a snapshot for all selected tables."); + System.out.println(" " + DELETE_SNAPSHOTS + " [--dryRun] [--skipMissing]"); System.out.println(" " + CLONE_SNAPSHOTS + " [--dryRun] " + "[--tablePrefixChange :] " + "[--onExistingTables [fail|skip|drop] ]"); @@ -326,47 +335,38 @@ private void exportSnapshot(String storageConfigurationPath, String snapshot, St } private void regionsPerTable(String tableNameStr) throws Exception { -// TableName tableName = getTable(tableNameStr); -// hBaseManager.act(tableName.getNameAsString(), (table, admin) -> { -// List servers = new ArrayList<>(admin.getClusterStatus().getServers()); -// Map regionsPerServer = new HashMap<>(); -// -// List> tableRegionsAndLocations = getTableRegionsAndLocations(tableName, admin); -// -// System.out.println("#REGION\tSERVER\tSTART_KEY\tEND_KEY"); -// for (Pair pair : tableRegionsAndLocations) { -// RegionInfo region = pair.getFirst(); -// ServerName server = pair.getSecond(); -// regionsPerServer.merge(server.getServerName(), 1, Integer::sum); -// -// System.out.println(region.getEncodedName() -// + "\t" + server.getServerName() -// + "\t" + Bytes.toStringBinary(region.getStartKey()) -// + "\t" + Bytes.toStringBinary(region.getEndKey())); -// } -// -// System.out.println(""); -// System.out.println("#SERVER\tREGIONS"); -// for (ServerName server : servers) { -// System.out.println(server.getServerName() + "\t" + regionsPerServer.getOrDefault(server.getServerName(), 0)); -// } -// -// -// -// return null; -// }); - } - -// private List> getTableRegionsAndLocations(TableName tableName, Admin admin) throws IOException { -// List> tableRegionsAndLocations; -//// try (ZooKeeperWatcher zkw = new ZooKeeperWatcher(admin.getConfiguration(), "hbase-main", null)) { -//// tableRegionsAndLocations = MetaTableAccessor -//// .getTableRegionsAndLocations(zkw, admin.getConnection(), tableName); -//// } -// tableRegionsAndLocations = MetaTableAccessor -// .getTableRegionsAndLocations(admin.getConnection(), tableName); -// return tableRegionsAndLocations; -// } + TableName tableName = getTable(tableNameStr); + hBaseManager.act(tableName.getNameAsString(), (table, admin) -> { + List servers = new ArrayList<>(admin.getClusterStatus().getServers()); + Map regionsPerServer = new HashMap<>(); + + List> tableRegionsAndLocations = getTableRegionsAndLocations(tableName, admin); + + System.out.println("#REGION\tSERVER\tSTART_KEY\tEND_KEY"); + for (Pair pair : tableRegionsAndLocations) { + RegionInfo region = pair.getFirst(); + ServerName server = pair.getSecond(); + regionsPerServer.merge(server.getServerName(), 1, Integer::sum); + + System.out.println(region.getEncodedName() + + "\t" + 
server.getServerName() + + "\t" + Bytes.toStringBinary(region.getStartKey()) + + "\t" + Bytes.toStringBinary(region.getEndKey())); + } + + System.out.println(""); + System.out.println("#SERVER\tREGIONS"); + for (ServerName server : servers) { + System.out.println(server.getServerName() + "\t" + regionsPerServer.getOrDefault(server.getServerName(), 0)); + } + + return null; + }); + } + + private List> getTableRegionsAndLocations(TableName tableName, Admin admin) throws IOException { + return MetaTableAccessor.getTableRegionsAndLocations(admin.getConnection(), tableName); + } private void reassignTablesWithRegionsOnDeadServers(String[] args) throws Exception { String tableNameFilter = getArg(args, 1); @@ -415,47 +415,81 @@ private void moveTableRegions(String[] args) throws Exception { }); } - private void balanceTableRegions(String tableNameStr, ObjectMap options) throws Exception { -// TableName tableName = getTable(tableNameStr); -// -// int regionCount = hBaseManager.act(tableName.getNameAsString(), (table, admin) -> { -// int maxMoves = options.getInt("maxMoves", 50000); -// List servers = new ArrayList<>(admin.getClusterStatus().getServers()); -// List> tableRegionsAndLocations = getTableRegionsAndLocations(tableName, admin); -// int expectedRegionsPerServer = (tableRegionsAndLocations.size() / servers.size()) + 1; -// Map regionsPerServer = new HashMap<>(); -// servers.forEach(s -> regionsPerServer.put(s.getServerName(), 0)); -// for (Pair pair : tableRegionsAndLocations) { -// regionsPerServer.merge(pair.getSecond().getServerName(), 1, Integer::sum); -// } -// -// for (Pair pair : tableRegionsAndLocations) { -// if (maxMoves < 0) { -// System.out.println("Reached max moves!"); -// break; -// } -// -// String sourceHost = pair.getSecond().getServerName(); -// if (regionsPerServer.get(sourceHost) > expectedRegionsPerServer) { -// Collections.shuffle(servers); -// Optional targetOptional = servers.stream() -// .filter(s -> regionsPerServer.get(s.getServerName()) < expectedRegionsPerServer).findAny(); -// if (!targetOptional.isPresent()) { -// break; -// } -// String testHost = targetOptional.get().getServerName(); -// regionsPerServer.merge(sourceHost, -1, Integer::sum); -// regionsPerServer.merge(testHost, 1, Integer::sum); -// System.out.println("Move region '" + pair.getFirst().getEncodedName() + "' from " + sourceHost + " to " + testHost); -// StopWatch stopWatch = StopWatch.createStarted(); -// admin.move(pair.getFirst().getEncodedNameAsBytes(), Bytes.toBytes(testHost)); -// System.out.println("Moved in "+TimeUtils.durationToString(stopWatch)); -// -// maxMoves--; -// } -// } -// return tableRegionsAndLocations.size(); -// }); + private void balanceTableRegions(String tableNameStr, int maxMoves, boolean dryRun, boolean ignoreExceptions, int maxRetries) + throws Exception { + TableName tableName = getTable(tableNameStr); + + LOGGER.info("Balancing table " + tableName.getNameAsString() + " with maxMoves=" + maxMoves + ", dryRun=" + dryRun + + ", ignoreExceptions=" + ignoreExceptions + ", maxRetries=" + maxRetries); + int regionCount = hBaseManager.act(tableName.getNameAsString(), (table, admin) -> { + List servers = HBaseCompat.getInstance().getServerList(admin); + List> tableRegionsAndLocations = getTableRegionsAndLocations(tableName, admin); + int expectedRegionsPerServer = (tableRegionsAndLocations.size() / servers.size()) + 1; + + Map regionsPerServer = new HashMap<>(); + servers.forEach(s -> regionsPerServer.put(s.getServerName(), 0)); + for (Pair pair : 
tableRegionsAndLocations) { + regionsPerServer.merge(pair.getSecond().getServerName(), 1, Integer::sum); + } + + // Shuffle the regions to avoid hotspots + Collections.shuffle(tableRegionsAndLocations); + + int moves = 0; + for (Pair pair : tableRegionsAndLocations) { + if (moves >= maxMoves) { + LOGGER.info("Reached max moves!"); + break; + } + + ServerName sourceHost = pair.getSecond(); + // If the source host has more regions than expected, move one to another server + if (regionsPerServer.get(sourceHost.getServerName()) > expectedRegionsPerServer) { + // Iterate over the servers in a random order + Collections.shuffle(servers); + Optional targetOptional = servers.stream() + .filter(s -> !s.equals(sourceHost)) + .filter(s -> regionsPerServer.get(s.getServerName()) < expectedRegionsPerServer) + .findAny(); + if (!targetOptional.isPresent()) { + break; + } + ServerName targetHost = targetOptional.get(); + LOGGER.info("Move region '" + pair.getFirst().getEncodedName() + "' from " + sourceHost + " to " + targetHost); + StopWatch stopWatch = StopWatch.createStarted(); + int attempts = 0; + while (attempts < maxRetries) { + try { + if (dryRun) { + LOGGER.info("[DRY-RUN]: admin.move('" + pair.getFirst().getEncodedName() + "', '" + targetHost + "')"); + } else { + admin.move(pair.getFirst().getEncodedNameAsBytes(), Bytes.toBytes(targetHost.getServerName())); + } + break; + } catch (Exception e) { + LOGGER.info("Error moving region: " + e.getMessage()); + attempts++; + if (attempts < maxRetries) { + LOGGER.info("Retrying... " + attempts + "/" + maxRetries); + } else if (!ignoreExceptions) { + throw e; + } else { + LOGGER.info("Ignoring exception. Unable to move region after " + attempts + " attempts."); + break; + } + } + } + LOGGER.info("Moved in " + TimeUtils.durationToString(stopWatch)); + + regionsPerServer.merge(sourceHost.getServerName(), -1, Integer::sum); + regionsPerServer.merge(targetHost.getServerName(), 1, Integer::sum); + + moves++; + } + } + return tableRegionsAndLocations.size(); + }); + System.out.println("#Balanced regions for table '" + tableNameStr + "'. 
Total regions: " + regionCount); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java index f9bfa4d0a3f..1f6cd77efd8 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/utils/HBaseManager.java @@ -17,24 +17,28 @@ package org.opencb.opencga.storage.hadoop.utils; import org.apache.commons.lang3.StringUtils; +import org.apache.commons.lang3.time.StopWatch; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hbase.*; import org.apache.hadoop.hbase.client.*; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.util.Bytes; import org.opencb.opencga.core.common.ExceptionUtils; +import org.opencb.opencga.core.common.TimeUtils; +import org.opencb.opencga.storage.hadoop.HBaseCompat; import org.opencb.opencga.storage.hadoop.auth.HBaseCredentials; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; import java.net.URISyntaxException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; +import java.nio.ByteBuffer; +import java.util.*; import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicReference; +import java.util.function.Function; +import java.util.stream.Collectors; import static org.opencb.opencga.storage.hadoop.utils.PersistentResultScanner.isValid; @@ -388,6 +392,139 @@ public boolean createTableIfNeeded(String tableName, byte[] columnFamily, return createTableIfNeeded(getConnection(), tableName, columnFamily, preSplits, compressionType); } + public boolean splitAndMove(Admin admin, TableName name, byte[] expectedSplit) throws IOException { + return splitAndMove(admin, name, expectedSplit, 3, true); + } + + public boolean splitAndMove(Admin admin, TableName tableName, byte[] expectedSplit, int retries, boolean ignoreExceptions) + throws IOException { + StopWatch stopWatch = StopWatch.createStarted(); + int count = 0; + while (count < retries) { + count++; + try { + // Check if split point exists + RegionInfo regionInfo = getRegionInfo(admin, tableName, expectedSplit); + + if (regionInfo == null) { + LOGGER.info("Splitting table '{}' at '{}'", tableName, Bytes.toStringBinary(expectedSplit)); + admin.split(tableName, expectedSplit); + regionInfo = getRegionInfo(admin, tableName, expectedSplit); + int getRegionInfoAttempts = 10; + while (regionInfo == null) { + try { + Thread.sleep(200); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + regionInfo = getRegionInfo(admin, tableName, expectedSplit); + getRegionInfoAttempts--; + if (getRegionInfoAttempts == 0) { + throw new DoNotRetryRegionException( + "Split point " + Bytes.toStringBinary(expectedSplit) + " not found after creation"); + } + } + } else if (count == 1) { + // First try and split point exists. Skip moving region + LOGGER.info("Split point {} already exists. 
Nothing to do.", Bytes.toStringBinary(expectedSplit)); + return false; + } + LOGGER.info("Moving region '{}' to another region server", regionInfo.getRegionNameAsString()); + admin.move(regionInfo.getEncodedNameAsBytes(), (byte[]) null); + LOGGER.info("New region created '{}' in {}", regionInfo.getRegionNameAsString(), TimeUtils.durationToString(stopWatch)); + return true; + } catch (IOException | RuntimeException e) { + if (ignoreExceptions) { + if (e instanceof DoNotRetryRegionException) { + LOGGER.warn("Error splitting table {} at {}. Retry {}/{} : {}", tableName, + Bytes.toStringBinary(expectedSplit), count, retries, e.getMessage()); + } else { + LOGGER.warn("Error splitting table {} at {}. Retry {}/{}", tableName, + Bytes.toStringBinary(expectedSplit), count, retries, e); + } + } else { + throw e; + } + } + try { + // Wait before retry + LOGGER.info("Waiting before retrying split table '{}' at '{}'", tableName, Bytes.toStringBinary(expectedSplit)); + Thread.sleep(1000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + } + LOGGER.warn("Unable to split table '{}' at '{}'", tableName, Bytes.toStringBinary(expectedSplit)); + return false; + } + + public static RegionInfo getRegionInfo(Admin admin, TableName name, byte[] expectedSplit) throws IOException { + return admin.getRegions(name) + .stream() + .filter(region -> Bytes.equals(region.getStartKey(), expectedSplit)) + .findFirst() + .orElse(null); + } + + public int expandTableIfNeeded(String tableName, int batch, + Function> batchSplitsGenerator, + int extraBatches, Function batchPlaceholderSplitGenerator) throws IOException { + return expandTableIfNeeded(tableName, Collections.singletonList(batch), batchSplitsGenerator, extraBatches, + batchPlaceholderSplitGenerator); + } + public int expandTableIfNeeded(String tableName, Collection batches, + Function> batchSplitsGenerator, + int extraBatches, Function batchPlaceholderSplitGenerator) throws IOException { + if (batches.isEmpty()) { + throw new IllegalArgumentException("No batches provided"); + } + // Get the expected splits for these batches + Collection expectedSplits = new LinkedHashSet<>(); + for (Integer batch : batches) { + expectedSplits.addAll(batchSplitsGenerator.apply(batch)); + } + int lastBatch = batches.stream().max(Integer::compareTo).get(); + + // Add some split placeholders for the extra batches + for (int i = lastBatch + 1; i < lastBatch + 1 + extraBatches; i++) { + expectedSplits.add(batchPlaceholderSplitGenerator.apply(i)); + } + // Shuffle the splits + List shuffledSplits = new ArrayList<>(expectedSplits); + Collections.shuffle(shuffledSplits); + + // Ensure that the table is split at least until the next expected split + return act(tableName, (table, admin) -> { + int newSplits = 0; + Set existingSplits = Arrays.stream(HBaseCompat.getInstance().getTableStartKeys(admin, table)) + .map(ByteBuffer::wrap) + .collect(Collectors.toSet()); + + int expectedNewSplits = 0; + for (byte[] expectedSplit : expectedSplits) { + if (!existingSplits.contains(ByteBuffer.wrap(expectedSplit))) { + expectedNewSplits++; + LOGGER.info("Missing split point '{}' at '{}'", tableName, Bytes.toStringBinary(expectedSplit)); + } + } + if (expectedNewSplits == 0) { + return 0; + } else { + LOGGER.info("Found {} missing split points. 
Splitting table '{}'", expectedNewSplits, tableName); + } + + for (byte[] expectedSplit : expectedSplits) { + if (!existingSplits.contains(ByteBuffer.wrap(expectedSplit))) { + if (splitAndMove(admin, table.getName(), expectedSplit)) { + newSplits++; + } + } + } + return newSplits; + }); + } + + /** * Create default HBase table layout with one column family. * diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopLocalLoadVariantStoragePipeline.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopLocalLoadVariantStoragePipeline.java index ada36f36d49..b1a21badba6 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopLocalLoadVariantStoragePipeline.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopLocalLoadVariantStoragePipeline.java @@ -37,7 +37,6 @@ import org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.io.managers.IOConnectorProvider; import org.opencb.opencga.storage.core.metadata.VariantStorageMetadataManager; -import org.opencb.opencga.storage.core.metadata.models.FileMetadata; import org.opencb.opencga.storage.core.metadata.models.SampleMetadata; import org.opencb.opencga.storage.core.metadata.models.StudyMetadata; import org.opencb.opencga.storage.core.variant.VariantStorageEngine; @@ -96,83 +95,6 @@ public HadoopLocalLoadVariantStoragePipeline(StorageConfiguration configuration, @Override protected void preLoadRegisterAndValidateFile(int studyId, VariantFileMetadata variantFileMetadata) throws StorageEngineException { super.preLoadRegisterAndValidateFile(studyId, variantFileMetadata); - boolean loadSampleIndex = YesNoAuto.parse(getOptions(), LOAD_SAMPLE_INDEX.key()).orYes().booleanValue(); - FileMetadata fileMetadata = getMetadataManager().getFileMetadata(studyId, getFileId()); - - int version = getMetadataManager().getStudyMetadata(studyId).getSampleIndexConfigurationLatest().getVersion(); - Set alreadyIndexedSamples = new LinkedHashSet<>(); - Set processedSamples = new LinkedHashSet<>(); - Set samplesWithoutSplitData = new LinkedHashSet<>(); - VariantStorageEngine.SplitData splitData = VariantStorageEngine.SplitData.from(options); - for (String sample : variantFileMetadata.getSampleIds()) { - Integer sampleId = getMetadataManager().getSampleId(studyId, sample); - SampleMetadata sampleMetadata = getMetadataManager().getSampleMetadata(studyId, sampleId); - if (splitData != null && sampleMetadata.getSplitData() != null) { - if (!splitData.equals(sampleMetadata.getSplitData())) { - throw new StorageEngineException("Incompatible split data methods. " - + "Unable to mix requested " + splitData - + " with existing " + sampleMetadata.getSplitData()); - } - } - if (sampleMetadata.isIndexed()) { - if (sampleMetadata.getFiles().size() == 1 && sampleMetadata.getFiles().contains(fileMetadata.getId())) { - // It might happen that the sample is marked as INDEXED, but not the file. - // If the sample only belongs to this file (i.e. 
it's only file is this file), then ignore - // the overwrite the current sample metadata index status - sampleMetadata = getMetadataManager().updateSampleMetadata(studyId, sampleId, - sm -> sm.setIndexStatus(fileMetadata.getIndexStatus())); - } - } - if (sampleMetadata.isIndexed()) { - alreadyIndexedSamples.add(sample); - if (sampleMetadata.isAnnotated() - || !loadSampleIndex && sampleMetadata.getSampleIndexStatus(version) == Status.READY - || sampleMetadata.getSampleIndexAnnotationStatus(version) == Status.READY - || sampleMetadata.getFamilyIndexStatus(version) == Status.READY - || sampleMetadata.isFamilyIndexDefined()) { - processedSamples.add(sampleMetadata.getId()); - } - } - - if (splitData != null && splitData != sampleMetadata.getSplitData()) { - samplesWithoutSplitData.add(sampleId); - } - } - - if (!alreadyIndexedSamples.isEmpty()) { - if (splitData != null) { - logger.info("Loading split data"); - } else { - String fileName = Paths.get(variantFileMetadata.getPath()).getFileName().toString(); - throw StorageEngineException.alreadyLoadedSamples(fileName, new ArrayList<>(alreadyIndexedSamples)); - } - for (Integer sampleId : processedSamples) { - getMetadataManager().updateSampleMetadata(studyId, sampleId, sampleMetadata -> { - if (!loadSampleIndex) { - for (Integer v : sampleMetadata.getSampleIndexVersions()) { - sampleMetadata.setSampleIndexStatus(Status.NONE, v); - } - } - for (Integer v : sampleMetadata.getSampleIndexAnnotationVersions()) { - sampleMetadata.setSampleIndexAnnotationStatus(Status.NONE, v); - } - for (Integer v : sampleMetadata.getFamilyIndexVersions()) { - sampleMetadata.setFamilyIndexStatus(Status.NONE, v); - } - sampleMetadata.setAnnotationStatus(Status.NONE); - sampleMetadata.setMendelianErrorStatus(Status.NONE); - }); - } - } - - if (splitData != null) { - // Register loadSplitData - for (Integer sampleId : samplesWithoutSplitData) { - getMetadataManager().updateSampleMetadata(studyId, sampleId, sampleMetadata -> { - sampleMetadata.setSplitData(splitData); - }); - } - } } @Override diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java index aa183adb007..84f00b042e9 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageEngine.java @@ -31,6 +31,7 @@ import org.opencb.commons.datastore.core.ObjectMap; import org.opencb.commons.datastore.core.Query; import org.opencb.commons.datastore.core.QueryOptions; +import org.opencb.opencga.core.common.IOUtils; import org.opencb.opencga.core.common.TimeUtils; import org.opencb.opencga.core.common.UriUtils; import org.opencb.opencga.core.config.DatabaseCredentials; @@ -38,6 +39,7 @@ import org.opencb.opencga.core.config.storage.StorageEngineConfiguration; import org.opencb.opencga.core.models.operations.variant.VariantAggregateFamilyParams; import org.opencb.opencga.core.models.operations.variant.VariantAggregateParams; +import org.opencb.opencga.core.models.variant.VariantSetupParams; import org.opencb.opencga.storage.core.StoragePipelineResult; import 
org.opencb.opencga.storage.core.exceptions.StorageEngineException; import org.opencb.opencga.storage.core.exceptions.StoragePipelineException; @@ -1086,6 +1088,49 @@ protected List initVariantAggregationExecutors() { return executors; } + @Override + public ObjectMap inferConfigurationParams(VariantSetupParams params) { + ObjectMap options = super.inferConfigurationParams(params); + ObjectMap configuredOptions = getOptions(); + + long expectedHBaseRegionSize = IOUtils.fromHumanReadableToByte("7.5GiB"); + + options.put(EXPECTED_SAMPLES_NUMBER.key(), params.getExpectedSamples()); + options.put(EXPECTED_FILES_NUMBER.key(), params.getExpectedFiles()); + + // Variant pre-split + int defaultVariantPreSplit = configuredOptions + .getInt(VARIANT_TABLE_PRESPLIT_SIZE.key(), VARIANT_TABLE_PRESPLIT_SIZE.defaultValue()); + float variantsFileToHBaseMultiplier = 1.3f; + Long averageFileSize = IOUtils.fromHumanReadableToByte(params.getAverageFileSize(), true); + float variantsTableSize = params.getExpectedFiles() * averageFileSize * variantsFileToHBaseMultiplier; + int variantPreSplit = (int) (variantsTableSize / expectedHBaseRegionSize); + options.put(VARIANT_TABLE_PRESPLIT_SIZE.key(), Math.max(defaultVariantPreSplit, variantPreSplit)); + + // Archive pre-split + int filesPerBatch = configuredOptions + .getInt(ARCHIVE_FILE_BATCH_SIZE.key(), ARCHIVE_FILE_BATCH_SIZE.defaultValue()); + float archiveFileToHBaseMultiplier = 1.2f; + float archiveTableSize = filesPerBatch * averageFileSize.floatValue() * archiveFileToHBaseMultiplier; + int archiveTablePreSplit = (int) (archiveTableSize / expectedHBaseRegionSize); + options.put(ARCHIVE_TABLE_PRESPLIT_SIZE.key(), Math.max(1, archiveTablePreSplit)); + + // SampleIndex pre-split + long averageSizePerVariant; + if (params.getVariantsPerSample() > 3500000) { + // With this many variants per sample, most of them won't have much data + averageSizePerVariant = IOUtils.fromHumanReadableToByte("13B"); + } else { + // With a small number of variants per sample, most of them will have a lot of data + averageSizePerVariant = IOUtils.fromHumanReadableToByte("25B"); + } + long sampleIndexSize = params.getVariantsPerSample() * averageSizePerVariant; + int samplesPerSplit = (int) (expectedHBaseRegionSize / sampleIndexSize); + options.put(SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.key(), samplesPerSplit); + + return options; + } + @Override public void close() throws IOException { super.close(); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java index 9c1f22ed349..817605be87c 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageOptions.java @@ -8,7 +8,7 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { HADOOP_LOAD_FILES_IN_PARALLEL("storage.hadoop.load.filesInParallel", 1), HBASE_NAMESPACE("storage.hadoop.hbase.namespace"), - EXPECTED_FILES_NUMBER("expected_files_number", 5000), + EXPECTED_FILES_NUMBER("expected_files_number", 50), EXPECTED_SAMPLES_NUMBER("expected_samples_number"), 
DBADAPTOR_PHOENIX_FETCH_SIZE("storage.hadoop.phoenix.fetchSize", -1), DBADAPTOR_PHOENIX_QUERY_COMPLEXITY_THRESHOLD("storage.hadoop.phoenix.queryComplexityThreshold", 250), @@ -64,7 +64,7 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { // Variant table configuration ///////////////////////// VARIANT_TABLE_COMPRESSION("storage.hadoop.variant.table.compression", Compression.Algorithm.SNAPPY.getName()), - VARIANT_TABLE_PRESPLIT_SIZE("storage.hadoop.variant.table.preSplit.numSplits", 500), + VARIANT_TABLE_PRESPLIT_SIZE("storage.hadoop.variant.table.preSplit.numSplits", 50), // Do not create phoenix indexes. Testing purposes only VARIANT_TABLE_INDEXES_SKIP("storage.hadoop.variant.table.indexes.skip"), VARIANT_TABLE_LOAD_REFERENCE("storage.hadoop.variant.table.load.reference", false), @@ -78,7 +78,8 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { // Archive table configuration ///////////////////////// ARCHIVE_TABLE_COMPRESSION("storage.hadoop.archive.table.compression", Compression.Algorithm.GZ.getName()), - ARCHIVE_TABLE_PRESPLIT_SIZE("storage.hadoop.archive.table.preSplit.splitsPerBatch", 500), + ARCHIVE_TABLE_PRESPLIT_SIZE("storage.hadoop.archive.table.preSplit.splitsPerBatch", 10), + ARCHIVE_TABLE_PRESPLIT_EXTRA_SPLITS("storage.hadoop.archive.table.preSplit.extraSplits", 3), ARCHIVE_CHUNK_SIZE("storage.hadoop.archive.table.chunkSize", 1000), ARCHIVE_FILE_BATCH_SIZE("storage.hadoop.archive.table.fileBatchSize", 1000), @@ -92,7 +93,8 @@ public enum HadoopVariantStorageOptions implements ConfigurationOption { // Sample index table configuration ///////////////////////// SAMPLE_INDEX_TABLE_COMPRESSION("storage.hadoop.sampleIndex.table.compression", Compression.Algorithm.SNAPPY.getName()), - SAMPLE_INDEX_TABLE_PRESPLIT_SIZE("storage.hadoop.sampleIndex.table.preSplit.samplesPerSplit", 15), + SAMPLE_INDEX_TABLE_PRESPLIT_SIZE("storage.hadoop.sampleIndex.table.preSplit.samplesPerSplit", 200), + SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS("storage.hadoop.sampleIndex.table.preSplit.extraSplits", 5), SAMPLE_INDEX_BUILD_MAX_SAMPLES_PER_MR("storage.hadoop.sampleIndex.build.maxSamplesPerMR", 2000), SAMPLE_INDEX_ANNOTATION_MAX_SAMPLES_PER_MR("storage.hadoop.sampleIndex.annotation.maxSamplesPerMR", 2000), SAMPLE_INDEX_FAMILY_MAX_TRIOS_PER_MR("storage.hadoop.sampleIndex.family.maxTriosPerMR", 1000), diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStoragePipeline.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStoragePipeline.java index 9b86cba3f6a..c3f5ebf550e 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStoragePipeline.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStoragePipeline.java @@ -170,6 +170,7 @@ public URI preLoad(URI input, URI output) throws StorageEngineException { try { if (YesNoAuto.parse(getOptions(), LOAD_ARCHIVE.key()).orYes().booleanValue()) { ArchiveTableHelper.createArchiveTableIfNeeded(getOptions(), getArchiveTable(), dbAdaptor.getConnection()); + ArchiveTableHelper.expandTableIfNeeded(getOptions(), getArchiveTable(), getFileId(), dbAdaptor.getHBaseManager()); } else { logger.info("Skip archive table"); } diff --git 
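A back-of-the-envelope check of the pre-split heuristic in inferConfigurationParams above; this is a standalone sketch where the file count and file size are illustrative, and only the 7.5 GiB region size and the 1.3x/1.2x multipliers come from the patch.

// Sketch only: reproduce the sizing arithmetic with example inputs.
public class PreSplitEstimateSketch {
    public static void main(String[] args) {
        long gib = 1024L * 1024 * 1024;
        long expectedHBaseRegionSize = (long) (7.5 * gib); // "7.5GiB" in the patch

        // Illustrative inputs: 1000 files of ~1 GiB each.
        int expectedFiles = 1000;
        long averageFileSize = gib;

        // Variants table: total file size inflated by 1.3x once loaded into HBase.
        float variantsTableSize = expectedFiles * averageFileSize * 1.3f;
        System.out.println("variant table pre-splits ~ "
                + (int) (variantsTableSize / expectedHBaseRegionSize)); // ~173

        // Archive table: sized per file batch (1000 files by default), 1.2x multiplier.
        float archiveTableSize = 1000 * (float) averageFileSize * 1.2f;
        System.out.println("archive pre-splits per batch ~ "
                + (int) (archiveTableSize / expectedHBaseRegionSize)); // ~160
    }
}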
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveRowKeyFactory.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveRowKeyFactory.java index 0f660bf4032..d20e5eee924 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveRowKeyFactory.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveRowKeyFactory.java @@ -143,10 +143,20 @@ public String generateBlockId(int fileId) { return sb.toString(); } - public String generateBlockIdFromSlice(int fileId, String chrom, long slice) { - return generateBlockIdFromSliceAndBatch(getFileBatch(fileId), chrom, slice); - } - + /** + * Generates a Row key based on Chromosome and position adjusted for the + * Chunk size.
+ *
    + *
  • Using {@link Region#normalizeChromosome(String)} to get standard chromosome + * name + *
  • Using {@link #getSliceId(long)} to return slice position + *
+ * e.g. using chunk size 100, separator _ with chr2 and 1234 would result in + * 2_12 + * + * @param fileBatch File batch + * @return {@link String} Row key string + */ public String generateBlockIdFromBatch(int fileBatch) { StringBuilder sb = new StringBuilder(FILE_BATCH_PAD + 1); sb.append(StringUtils.leftPad(String.valueOf(fileBatch), FILE_BATCH_PAD, '0')); @@ -154,6 +164,10 @@ public String generateBlockIdFromBatch(int fileBatch) { return sb.toString(); } + public String generateBlockIdFromSlice(int fileId, String chrom, long slice) { + return generateBlockIdFromSliceAndBatch(getFileBatch(fileId), chrom, slice); + } + public String generateBlockIdFromSliceAndBatch(int fileBatch, String chrom, long slice) { String chromosome = Region.normalizeChromosome(chrom); StringBuilder sb = new StringBuilder(FILE_BATCH_PAD + 1 + chromosome.length() + 1 + POSITION_PAD); diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveTableHelper.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveTableHelper.java index 3af44e97188..348d4d33e00 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveTableHelper.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/archive/ArchiveTableHelper.java @@ -56,7 +56,7 @@ public class ArchiveTableHelper extends GenomeHelper { public static final byte[] REF_COLUMN_SUFIX_BYTES = Bytes.toBytes(REF_COLUMN_SUFIX); public static final String CONFIG_ARCHIVE_TABLE_NAME = "opencga.archive.table.name"; - private final Logger logger = LoggerFactory.getLogger(ArchiveTableHelper.class); + private static Logger logger = LoggerFactory.getLogger(ArchiveTableHelper.class); private final AtomicReference meta = new AtomicReference<>(); private final ArchiveRowKeyFactory keyFactory; private final byte[] nonRefColumn; @@ -188,17 +188,13 @@ public static String getRefColumnName(int fileId) { public static boolean createArchiveTableIfNeeded(Configuration conf, String tableName) throws IOException { try (Connection con = ConnectionFactory.createConnection(conf)) { - return createArchiveTableIfNeeded(conf, tableName, con); + Compression.Algorithm compression = Compression.getCompressionAlgorithmByName( + conf.get(ARCHIVE_TABLE_COMPRESSION.key(), ARCHIVE_TABLE_COMPRESSION.defaultValue())); + final List preSplits = generateArchiveTableBootPreSplitHuman(conf); + return HBaseManager.createTableIfNeeded(con, tableName, COLUMN_FAMILY_BYTES, preSplits, compression); } } - public static boolean createArchiveTableIfNeeded(Configuration conf, String tableName, Connection con) throws IOException { - Compression.Algorithm compression = Compression.getCompressionAlgorithmByName( - conf.get(ARCHIVE_TABLE_COMPRESSION.key(), ARCHIVE_TABLE_COMPRESSION.defaultValue())); - final List preSplits = generateArchiveTableBootPreSplitHuman(conf); - return HBaseManager.createTableIfNeeded(con, tableName, COLUMN_FAMILY_BYTES, preSplits, compression); - } - public static boolean createArchiveTableIfNeeded(ObjectMap conf, String tableName, Connection con) throws IOException { Compression.Algorithm compression = Compression.getCompressionAlgorithmByName( conf.getString(ARCHIVE_TABLE_COMPRESSION.key(), ARCHIVE_TABLE_COMPRESSION.defaultValue())); @@ -206,6 +202,20 @@ 
public static boolean createArchiveTableIfNeeded(ObjectMap conf, String tableNam return HBaseManager.createTableIfNeeded(con, tableName, COLUMN_FAMILY_BYTES, preSplits, compression); } + public static void expandTableIfNeeded(ObjectMap options, String archiveTable, int fileId, HBaseManager hBaseManager) + throws IOException { + int splitsPerBatch = options.getInt(ARCHIVE_TABLE_PRESPLIT_SIZE.key(), ARCHIVE_TABLE_PRESPLIT_SIZE.defaultValue()); + int extraBatches = options.getInt(ARCHIVE_TABLE_PRESPLIT_EXTRA_SPLITS.key(), ARCHIVE_TABLE_PRESPLIT_EXTRA_SPLITS.defaultValue()); + ArchiveRowKeyFactory rowKeyFactory = new ArchiveRowKeyFactory(options); + int thisBatch = rowKeyFactory.getFileBatch(fileId); + int newRegions = hBaseManager.expandTableIfNeeded(archiveTable, thisBatch, + batch -> generateBatchSplitsHuman(rowKeyFactory, splitsPerBatch, batch), + extraBatches, batch -> Bytes.toBytes(rowKeyFactory.generateBlockIdFromBatch(batch))); + if (newRegions > 0) { + logger.info("Archive table '" + archiveTable + "' expanded with " + newRegions + " new regions for batch " + thisBatch); + } + } + public static List generateArchiveTableBootPreSplitHuman(Configuration conf) { final ArchiveRowKeyFactory rowKeyFactory = new ArchiveRowKeyFactory(conf); int nSplits = conf.getInt(ARCHIVE_TABLE_PRESPLIT_SIZE.key(), ARCHIVE_TABLE_PRESPLIT_SIZE.defaultValue()); @@ -226,15 +236,18 @@ private static List generateArchiveTableBootPreSplitHuman(ArchiveRowKeyF final List preSplits = new ArrayList<>(nSplits * expectedNumBatches); for (int batch = 0; batch <= expectedNumBatches; batch++) { - int finalBatch = batch; - preSplits.addAll(generateBootPreSplitsHuman(nSplits, (chr, position) -> { - long slice = rowKeyFactory.getSliceId(position); - return Bytes.toBytes(rowKeyFactory.generateBlockIdFromSliceAndBatch(finalBatch, chr, slice)); - })); + preSplits.addAll(generateBatchSplitsHuman(rowKeyFactory, nSplits, batch)); } return preSplits; } + public static List generateBatchSplitsHuman(ArchiveRowKeyFactory rowKeyFactory, int nSplits, int batch) { + return generateBootPreSplitsHuman(nSplits, (chr, position) -> { + long slice = rowKeyFactory.getSliceId(position); + return Bytes.toBytes(rowKeyFactory.generateBlockIdFromSliceAndBatch(batch, chr, slice)); + }); + } + public VariantFileMetadata getFileMetadata() { return meta.get(); } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexBuilder.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexBuilder.java index 8b18a5e9a9f..8047fb49cf1 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexBuilder.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexBuilder.java @@ -85,6 +85,7 @@ public void buildSampleIndex(List samples, ObjectMap options, boolean ov } sampleIndexDBAdaptor.createTableIfNeeded(studyId, schema.getVersion(), options); + sampleIndexDBAdaptor.expandTableIfNeeded(studyId, schema.getVersion(), sampleIds, options); if (finalSamplesList.size() < 20) { logger.info("Run sample index build on samples " + finalSamplesList); diff --git 
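The expansion above keys off the archive file batch of the file being loaded. A rough sketch of that arithmetic, not part of the patch: the batch-size value is illustrative, and the exact row-key string depends on the configured padding.

import org.opencb.commons.datastore.core.ObjectMap;
import org.opencb.opencga.storage.hadoop.variant.archive.ArchiveRowKeyFactory;

// Sketch only: which archive batch a file falls into, and the placeholder
// split generated for the batch after it.
public class ArchiveBatchSketch {
    public static void main(String[] args) {
        ObjectMap options = new ObjectMap("storage.hadoop.archive.table.fileBatchSize", 1000);
        ArchiveRowKeyFactory rowKeyFactory = new ArchiveRowKeyFactory(options);
        int batch = rowKeyFactory.getFileBatch(2500); // files 2000..2999 -> batch 2
        System.out.println("file 2500 -> batch " + batch);
        // Placeholder split key for the next, not-yet-loaded batch (zero-padded batch id).
        System.out.println(rowKeyFactory.generateBlockIdFromBatch(batch + 1));
    }
}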
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBAdaptor.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBAdaptor.java index 06a3ca79b9a..0d785856776 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBAdaptor.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBAdaptor.java @@ -2,10 +2,7 @@ import com.google.common.collect.Iterators; import org.apache.commons.collections4.CollectionUtils; -import org.apache.hadoop.hbase.client.Get; -import org.apache.hadoop.hbase.client.Result; -import org.apache.hadoop.hbase.client.ResultScanner; -import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.*; import org.apache.hadoop.hbase.io.compress.Compression; import org.apache.hadoop.hbase.util.Bytes; import org.opencb.biodata.models.core.Region; @@ -796,8 +793,11 @@ public boolean createTableIfNeeded(int studyId, int version, ObjectMap options) int preSplitSize = options.getInt( HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.key(), HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.defaultValue()); + int sampleIndexExtraSplits = options.getInt( + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.key(), + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.defaultValue()); - int splits = samples / preSplitSize; + int splits = (samples / preSplitSize) + sampleIndexExtraSplits; ArrayList preSplits = new ArrayList<>(splits); for (int i = 0; i < splits; i++) { preSplits.add(SampleIndexSchema.toRowKey(i * preSplitSize)); @@ -815,6 +815,32 @@ public boolean createTableIfNeeded(int studyId, int version, ObjectMap options) } } + public void expandTableIfNeeded(int studyId, int version, List sampleIds, ObjectMap options) { + String sampleIndexTable = getSampleIndexTableName(studyId, version); + int preSplitSize = options.getInt( + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.key(), + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.defaultValue()); + int sampleIndexExtraBatches = options.getInt( + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.key(), + HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.defaultValue()); + Set batches = new HashSet<>(); + for (Integer sampleId : sampleIds) { + batches.add((sampleId) / preSplitSize); + } + + try { + int newRegions = hBaseManager.expandTableIfNeeded(sampleIndexTable, batches, + batch -> Collections.singletonList(SampleIndexSchema.toRowKey(batch * preSplitSize)), + sampleIndexExtraBatches, batch -> SampleIndexSchema.toRowKey(batch * preSplitSize)); + if (newRegions != 0) { + // Log number of new regions + logger.info("Sample index table '" + sampleIndexTable + "' expanded with " + newRegions + " new regions"); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + } + public void updateSampleIndexSchemaStatus(int studyId, int version) throws StorageEngineException { StudyMetadata studyMetadata = metadataManager.getStudyMetadata(studyId); if (studyMetadata.getSampleIndexConfiguration(version).getStatus() diff --git 
a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBLoader.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBLoader.java index f2700831e94..23c94cf0c28 100644 --- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBLoader.java +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/main/java/org/opencb/opencga/storage/hadoop/variant/index/sample/SampleIndexDBLoader.java @@ -188,6 +188,7 @@ public String toString() { public boolean open() { super.open(); dbAdaptor.createTableIfNeeded(studyId, schema.getVersion(), options); + dbAdaptor.expandTableIfNeeded(studyId, schema.getVersion(), sampleIds, options); return true; } diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java new file mode 100644 index 00000000000..ae3ddabd3fb --- /dev/null +++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java @@ -0,0 +1,119 @@ +package org.opencb.opencga.storage.hadoop.variant; + +import org.junit.Before; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; +import org.junit.rules.ExternalResource; +import org.opencb.commons.datastore.core.ObjectMap; +import org.opencb.opencga.core.testclassification.duration.LongTests; +import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest; +import org.opencb.opencga.storage.core.variant.VariantStorageOptions; +import org.opencb.opencga.storage.hadoop.HBaseCompat; +import org.opencb.opencga.storage.hadoop.utils.HBaseManager; +import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor; +import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor; + +import static org.junit.Assert.assertEquals; + +@Category(LongTests.class) +public class AutoScaleHBaseTableTest extends VariantStorageBaseTest implements HadoopVariantStorageTest { + + @ClassRule + public static ExternalResource externalResource = new HadoopExternalResource(); + + private VariantHadoopDBAdaptor dbAdaptor; + private SampleIndexDBAdaptor sampleIndexDBAdaptor; + private HadoopVariantStorageEngine engine; + + @Before + public void before() throws Exception { + clearDB(DB_NAME); + engine = getVariantStorageEngine(); + dbAdaptor = engine.getDBAdaptor(); + sampleIndexDBAdaptor = engine.getSampleIndexDBAdaptor(); + + } + + @Test + public void testAutoScaleTables() throws Exception { + + int archiveSplitsPerBatch = 10; + int samplesPerSplit = 2; + int extraSplits = 3; + + ObjectMap params = new ObjectMap() + .append(VariantStorageOptions.STUDY.key(), STUDY_NAME) + .append(VariantStorageOptions.ANNOTATE.key(), false) + .append(VariantStorageOptions.STATS_CALCULATE.key(), false) + .append(HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.key(), samplesPerSplit) + .append(HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.key(), extraSplits) + 
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java
new file mode 100644
index 00000000000..ae3ddabd3fb
--- /dev/null
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/AutoScaleHBaseTableTest.java
@@ -0,0 +1,119 @@
+package org.opencb.opencga.storage.hadoop.variant;
+
+import org.junit.Before;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+import org.junit.rules.ExternalResource;
+import org.opencb.commons.datastore.core.ObjectMap;
+import org.opencb.opencga.core.testclassification.duration.LongTests;
+import org.opencb.opencga.storage.core.variant.VariantStorageBaseTest;
+import org.opencb.opencga.storage.core.variant.VariantStorageOptions;
+import org.opencb.opencga.storage.hadoop.HBaseCompat;
+import org.opencb.opencga.storage.hadoop.utils.HBaseManager;
+import org.opencb.opencga.storage.hadoop.variant.adaptors.VariantHadoopDBAdaptor;
+import org.opencb.opencga.storage.hadoop.variant.index.sample.SampleIndexDBAdaptor;
+
+import static org.junit.Assert.assertEquals;
+
+@Category(LongTests.class)
+public class AutoScaleHBaseTableTest extends VariantStorageBaseTest implements HadoopVariantStorageTest {
+
+    @ClassRule
+    public static ExternalResource externalResource = new HadoopExternalResource();
+
+    private VariantHadoopDBAdaptor dbAdaptor;
+    private SampleIndexDBAdaptor sampleIndexDBAdaptor;
+    private HadoopVariantStorageEngine engine;
+
+    @Before
+    public void before() throws Exception {
+        clearDB(DB_NAME);
+        engine = getVariantStorageEngine();
+        dbAdaptor = engine.getDBAdaptor();
+        sampleIndexDBAdaptor = engine.getSampleIndexDBAdaptor();
+    }
+
+    @Test
+    public void testAutoScaleTables() throws Exception {
+
+        int archiveSplitsPerBatch = 10;
+        int samplesPerSplit = 2;
+        int extraSplits = 3;
+
+        ObjectMap params = new ObjectMap()
+                .append(VariantStorageOptions.STUDY.key(), STUDY_NAME)
+                .append(VariantStorageOptions.ANNOTATE.key(), false)
+                .append(VariantStorageOptions.STATS_CALCULATE.key(), false)
+                .append(HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_SIZE.key(), samplesPerSplit)
+                .append(HadoopVariantStorageOptions.SAMPLE_INDEX_TABLE_PRESPLIT_EXTRA_SPLITS.key(), extraSplits)
+                .append(HadoopVariantStorageOptions.ARCHIVE_TABLE_PRESPLIT_SIZE.key(), archiveSplitsPerBatch)
+                .append(HadoopVariantStorageOptions.ARCHIVE_TABLE_PRESPLIT_EXTRA_SPLITS.key(), extraSplits)
+                .append(HadoopVariantStorageOptions.ARCHIVE_FILE_BATCH_SIZE.key(), 2)
+                .append(HadoopVariantStorageOptions.EXPECTED_SAMPLES_NUMBER.key(), 1)
+                .append(HadoopVariantStorageOptions.EXPECTED_FILES_NUMBER.key(), 1);
+
+        // -- Batch 1
+        int batches = 1;
+        runETL(engine, getPlatinumFile(1), outputUri, params);
+        // Each batch starts with one extra split, except the first batch; hence the -1.
+        // Then, a fixed number of extra splits is added.
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+        // -- Batch 2
+        // The first batch has one fewer split than the rest of the batches.
+        batches = 2;
+        runETL(engine, getPlatinumFile(2), outputUri, params);
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+        runETL(engine, getPlatinumFile(3), outputUri, params);
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+        // -- Batch 3
+        batches = 3;
+        runETL(engine, getPlatinumFile(4), outputUri, params);
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+        runETL(engine, getPlatinumFile(5), outputUri, params);
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+        // -- Batch 4
+        batches = 4;
+        runETL(engine, getPlatinumFile(6), outputUri, params);
+        checkArchiveTableSplits(((archiveSplitsPerBatch + 1) * batches) - 1 + extraSplits);
+        checkSampleIndexTableSplits(batches + extraSplits);
+
+//        VariantHbaseTestUtils.printVariants(dbAdaptor, newOutputUri());
+    }
+
+    private void checkArchiveTableSplits(int expectedSplits) throws Exception {
+        int studyId = engine.getMetadataManager().getStudyId(STUDY_NAME);
+
+        String archiveTableName = engine.getArchiveTableName(studyId);
+        HBaseManager hBaseManager = dbAdaptor.getHBaseManager();
+        int archiveNumRegions = hBaseManager.act(archiveTableName,
+                (table, admin) -> HBaseCompat.getInstance().getTableStartKeys(admin, table).length);
+        // numRegions == numSplits + 1
+        assertEquals(archiveTableName, expectedSplits + 1, archiveNumRegions);
+    }
+
+    private void checkSampleIndexTableSplits(int expectedSplits) throws Exception {
+        int studyId = engine.getMetadataManager().getStudyId(STUDY_NAME);
+
+        String sampleIndexTableName = sampleIndexDBAdaptor.getSampleIndexTableName(studyId, 1);
+        HBaseManager hBaseManager = dbAdaptor.getHBaseManager();
+        int sampleIndexNumRegions = hBaseManager.act(sampleIndexTableName,
+                (table, admin) -> HBaseCompat.getInstance().getTableStartKeys(admin, table).length);
+        // numRegions == numSplits + 1
+        assertEquals(sampleIndexTableName, expectedSplits + 1, sampleIndexNumRegions);
+    }
+
+}
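Note: the expected archive split count follows directly from the test's own formula: each file batch contributes archiveSplitsPerBatch + 1 splits except the first (hence the -1), plus a constant extraSplits. A small worked example under the test's values (ExpectedSplitsSketch is a hypothetical name, not part of the test suite):

// Worked example with archiveSplitsPerBatch = 10 and extraSplits = 3:
//   batches = 1 -> (11 * 1) - 1 + 3 = 13 splits -> 14 regions
//   batches = 2 -> (11 * 2) - 1 + 3 = 24 splits -> 25 regions
//   batches = 3 -> (11 * 3) - 1 + 3 = 35 splits -> 36 regions
//   batches = 4 -> (11 * 4) - 1 + 3 = 46 splits -> 47 regions
public class ExpectedSplitsSketch {
    static int expectedArchiveSplits(int batches, int splitsPerBatch, int extraSplits) {
        // Every batch after the first adds one extra boundary split, hence the -1.
        return ((splitsPerBatch + 1) * batches) - 1 + extraSplits;
    }

    public static void main(String[] args) {
        for (int batches = 1; batches <= 4; batches++) {
            System.out.println(batches + " -> " + expectedArchiveSplits(batches, 10, 3));
        }
    }
}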
diff --git a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java
index d11635abe6d..cf4d7984ada 100644
--- a/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java
+++ b/opencga-storage/opencga-storage-hadoop/opencga-storage-hadoop-core/src/test/java/org/opencb/opencga/storage/hadoop/variant/HadoopVariantStorageTest.java
@@ -232,8 +232,8 @@ public void before() throws Exception {
         Configurator.setLevel(MapTask.class.getName(), Level.WARN);
         Configurator.setLevel(TableInputFormatBase.class.getName(), Level.WARN);
 
-        utility.set(new HBaseTestingUtility());
-        Configuration conf = utility.get().getConfiguration();
+        HBaseTestingUtility testingUtility = new HBaseTestingUtility();
+        Configuration conf = testingUtility.getConfiguration();
         HadoopVariantStorageTest.configuration.set(conf);
 
@@ -277,7 +277,8 @@ public void before() throws Exception {
         }
 
         //org.apache.commons.configuration2.Configuration
-        utility.get().startMiniCluster(1);
+        testingUtility.startMiniCluster(1);
+        utility.set(testingUtility);
 
//        MiniMRCluster miniMRCluster = utility.startMiniMapReduceCluster();
//        MiniMRClientCluster miniMRClientCluster = MiniMRClientClusterFactory.create(HadoopVariantStorageManagerTestUtils.class, 1, configuration);
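Note: this last change only publishes the HBaseTestingUtility into the shared reference after startMiniCluster(1) has returned, so no reader of utility can observe an instance whose mini-cluster has not started (or that failed to start). The same publish-after-init pattern in isolation (PublishAfterInitSketch and Resource are hypothetical stand-ins for the test utility):

import java.util.concurrent.atomic.AtomicReference;

// Sketch of the publication ordering applied above: initialize fully, then publish.
public class PublishAfterInitSketch {
    static final AtomicReference<Resource> SHARED = new AtomicReference<>();

    static class Resource {
        boolean started;
        void start() { started = true; } // stand-in for startMiniCluster(1)
    }

    public static void main(String[] args) {
        Resource r = new Resource();
        r.start();       // finish expensive, failure-prone initialization first...
        SHARED.set(r);   // ...then make the instance visible to other threads
    }
}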