Skip to content

Commit

Permalink
Funcotator: added a maximum version number for data sources. (#6807)
Browse files Browse the repository at this point in the history
Added max version check for data sources in Funcotator.  This will automatically be
used when validating a data sources version for running.

Fixes #6712
  • Loading branch information
jonn-smith authored Oct 9, 2020
1 parent 1621724 commit 9d5727d
Show file tree
Hide file tree
Showing 10 changed files with 524 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ MAXARGS=0
################################################################################

# Change these to point to reference dictionaries:
D="/Users/jonn/Development/references"
D="."
#refAFile="${D}/GRCh37.p13.genome.dict"
#refAFile="${D}/human_g1k_v37.dict"
refAFile="${D}/ucsc.hg19.dict"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -745,11 +745,14 @@ public boolean requiresReference() {
@Override
public void onTraversalStart() {

logger.info("Validating Sequence Dictionaries...");
if (seqValidationArguments.performSequenceDictionaryValidation()) {
logger.info("Validating sequence dictionaries...");
// Ensure that the reference dictionary is a superset of the variant dictionary:
checkReferenceDictionaryIsSupersetOfVariantDictionary();
}
else {
logger.info("Skipping sequence dictionary validation.");
}

logger.info("Processing user transcripts/defaults/overrides...");
Utils.validateArg(funcotatorArgs.outputFormatType != FuncotatorArgumentDefinitions.OutputFormatType.SEG,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.broadinstitute.hellbender.cmdline.CommandLineProgram;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils;
import org.broadinstitute.hellbender.utils.io.IOUtils;
import org.broadinstitute.hellbender.utils.nio.NioFileCopierWithProgressMeter;
import org.broadinstitute.hellbender.utils.nio.NioFileCopierWithProgressMeterResults;
Expand All @@ -25,7 +26,7 @@
* <h3>General Information</h3>
* <p>
* This tool can download pre-packaged data sources for both the <strong>somatic</strong> and <strong>germline</strong> use cases.
* The data sources downloaded by this tool correspond to the current minimum of the data sources supported as defined in <b><i>{@link org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils#CURRENT_MINIMUM_DATA_SOURCE_VERSION}</i></b>.
* The data sources downloaded by this tool correspond to the latest / current maximum of the data sources supported as defined in <b><i>{@link org.broadinstitute.hellbender.tools.funcotator.dataSources.DataSourceUtils#CURRENT_MAXIMUM_DATA_SOURCE_VERSION}</i></b>.
* </p>
*
* <p>
Expand Down Expand Up @@ -71,17 +72,19 @@ public class FuncotatorDataSourceDownloader extends CommandLineProgram {
//==================================================================================================================
// Private Static Members:

private static final String BASE_URL = "gs://broad-public-datasets/funcotator/funcotator_dataSources.v1.7.20200521";
// Set to always get the latest version of the data sources:
private static final String BASE_URL = DataSourceUtils.DATA_SOURCES_BUCKET_PATH +
DataSourceUtils.DATA_SOURCES_NAME_PREFIX + "." + DataSourceUtils.getDataSourceMaxVersionString();

private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + "g";
private static final String GERMLINE_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_GERMLINE_NAME_MODIFIER;
@VisibleForTesting
static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + ".tar.gz");
private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + ".sha256");
static final Path GERMLINE_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path GERMLINE_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(GERMLINE_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + "s";
/** Base location (without a file extension) of the somatic data sources package: {@code BASE_URL} plus the somatic name modifier. */
public static final String SOMATIC_GCLOUD_DATASOURCES_BASEURL = BASE_URL + DataSourceUtils.DS_SOMATIC_NAME_MODIFIER;

public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + ".tar.gz");
private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + ".sha256");
public static final Path SOMATIC_GCLOUD_DATASOURCES_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_EXTENSION);
private static final Path SOMATIC_GCLOUD_DATASOURCES_SHA256_PATH = IOUtils.getPath(SOMATIC_GCLOUD_DATASOURCES_BASEURL + DataSourceUtils.DS_CHECKSUM_EXTENSION);

//==================================================================================================================
// Private Members:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -471,10 +471,21 @@ private boolean determineReferenceAndDatasourceCompatibility() {

boolean mustConvertInputContigsToHg19 = false;

if ( funcotatorArgs.forceB37ToHg19ContigNameConversion ||
( funcotatorArgs.referenceVersion.equals(BaseFuncotatorArgumentCollection.FuncotatorReferenceVersionHg19) &&
FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants) )) {
// Do individual checks here so we can have a helpful log message for each case:
if ( funcotatorArgs.forceB37ToHg19ContigNameConversion ) {
logger.info("Forcing B37 -> HG19 Variant conversion.");
mustConvertInputContigsToHg19 = true;
}
else if ( funcotatorArgs.referenceVersion.equals(BaseFuncotatorArgumentCollection.FuncotatorReferenceVersionHg19) &&
FuncotatorUtils.isSequenceDictionaryUsingB37Reference(sequenceDictionaryForDrivingVariants) ) {
logger.info("VCF sequence dictionary detected as B37 in HG19 annotation mode. Performing conversion.");
mustConvertInputContigsToHg19 = true;
}
else {
logger.info("Using given VCF and Reference. No conversion required.");
}

if (mustConvertInputContigsToHg19) {
// NOTE AND WARNING:
// hg19 is from ucsc. b37 is from the genome reference consortium.
// ucsc decided the grc version had some bad data in it, so they blocked out some of the bases, aka "masked" them
Expand All @@ -487,8 +498,6 @@ private boolean determineReferenceAndDatasourceCompatibility() {
logger.warn("WARNING: You are using B37 as a reference. " +
"Funcotator will convert your variants to GRCh37, and this will be fine in the vast majority of cases. " +
"There MAY be some errors (e.g. in the Y chromosome, but possibly in other places as well) due to changes between the two references.");

mustConvertInputContigsToHg19 = true;
}

return mustConvertInputContigsToHg19;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.*;
import java.time.LocalDate;
import java.time.Month;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand Down Expand Up @@ -57,24 +59,37 @@ private DataSourceUtils() {}

// Track our minimum version number here:
@VisibleForTesting
static final int MIN_MAJOR_VERSION_NUMBER = 1;
static final int MIN_MAJOR_VERSION_NUMBER = 1;
@VisibleForTesting
static final int MIN_MINOR_VERSION_NUMBER = 6;
static final int MIN_MINOR_VERSION_NUMBER = 6;
@VisibleForTesting
static final int MIN_YEAR_RELEASED = 2019;
static final LocalDate MIN_DATE = LocalDate.of(2019, Month.JANUARY, 24);

// Track our maximum version number here:
@VisibleForTesting
static final int MAX_MAJOR_VERSION_NUMBER = 1;
@VisibleForTesting
static final int MIN_MONTH_RELEASED = 1;
static final int MAX_MINOR_VERSION_NUMBER = 7;
@VisibleForTesting
static final int MIN_DAY_RELEASED = 24;
static final LocalDate MAX_DATE = LocalDate.of(2020, Month.MAY, 21);

//==================================================================================================================
// Public Static Members:

/** The minimum version of the data sources required for funcotator to run. */
public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = String.format("v%d.%d.%d%02d%02d", MIN_MAJOR_VERSION_NUMBER, MIN_MINOR_VERSION_NUMBER, MIN_YEAR_RELEASED, MIN_MONTH_RELEASED, MIN_DAY_RELEASED);
public static final String CURRENT_MINIMUM_DATA_SOURCE_VERSION = getDataSourceMinVersionString();

/** The maximum supported version of the data sources for funcotator to run. */
public static final String CURRENT_MAXIMUM_DATA_SOURCE_VERSION = getDataSourceMaxVersionString();

public static final String MANIFEST_FILE_NAME = "MANIFEST.txt";
public static final String DATA_SOURCES_FTP_PATH = "ftp://[email protected]/bundle/funcotator/";
public static final String DATA_SOURCES_BUCKET_PATH = "gs://broad-public-datasets/funcotator/";
public static final String DATA_SOURCES_NAME_PREFIX = "funcotator_dataSources";
public static final String DS_SOMATIC_NAME_MODIFIER = "s";
public static final String DS_GERMLINE_NAME_MODIFIER = "g";
public static final String DS_EXTENSION = ".tar.gz";
public static final String DS_CHECKSUM_EXTENSION = ".sha256";

// TODO: Turn these into an enum (Issue #5465 - https://github.com/broadinstitute/gatk/issues/5465):
public static final String CONFIG_FILE_FIELD_NAME_NAME = "name";
Expand All @@ -100,6 +115,51 @@ private DataSourceUtils() {}
//==================================================================================================================
// Public Static Methods:

/**
 * Builds the string for the minimum data sources version required for funcotator to run, formatted as it would
 * be written in the data sources release files.
 * Min version info is specified in the following variables:
 * {@link #MIN_MAJOR_VERSION_NUMBER}
 * {@link #MIN_MINOR_VERSION_NUMBER}
 * {@link #MIN_DATE}
 * @return A {@link String} representing the Min version information as it would appear in the data sources file name.
 */
public static String getDataSourceMinVersionString() {
    // Delegate to the shared formatter so the min and max strings always have the same layout.
    final String minVersionString =
            getDataSourceVersionString(MIN_MAJOR_VERSION_NUMBER, MIN_MINOR_VERSION_NUMBER, MIN_DATE);
    return minVersionString;
}

/**
 * Builds the string for the maximum data sources version supported by funcotator, formatted as it would be
 * written in the data sources release files.
 * Max version info is specified in the following variables:
 * {@link #MAX_MAJOR_VERSION_NUMBER}
 * {@link #MAX_MINOR_VERSION_NUMBER}
 * {@link #MAX_DATE}
 * @return A {@link String} representing the Max version information as it would appear in the data sources file name.
 */
public static String getDataSourceMaxVersionString() {
    // Delegate to the shared formatter so the min and max strings always have the same layout.
    final String maxVersionString =
            getDataSourceVersionString(MAX_MAJOR_VERSION_NUMBER, MAX_MINOR_VERSION_NUMBER, MAX_DATE);
    return maxVersionString;
}


/**
 * Get the string representing the given version information for funcotator as it would be written in the data sources
 * release files.
 *
 * The result has the layout {@code v<major>.<minor>.<year><month><day>}, where month and day are zero-padded to two
 * digits (e.g. {@code v1.7.20200521}).
 *
 * @param major {@code int} representing the major version of the data sources to use.
 * @param minor {@code int} representing the minor version of the data sources to use.
 * @param date {@link LocalDate} representing the date of the data sources to use.
 * @return A {@link String} representing the given version information as it would appear in the data sources file name.
 */
public static String getDataSourceVersionString(final int major, final int minor, final LocalDate date) {
    // Format with Locale.ROOT: the result is embedded in file names / bucket paths, so the digits must not
    // change with the JVM's default locale (some locales render %d with non-ASCII digits).
    return String.format(Locale.ROOT, "v%d.%d.%d%02d%02d",
            major,
            minor,
            date.getYear(),
            date.getMonthValue(),
            date.getDayOfMonth()
    );
}

/**
* Initializes the data sources for {@link Funcotator}.
* @param refVersion The version of the reference we're using to create annotations. Must not be {@code null}.
Expand Down Expand Up @@ -196,7 +256,7 @@ else if ( !hasGencodeDataSource ) {
* @param directory The {@link Path} of the directory in which to search for a config file. Must not be {@code null}.
* @return The {@link Path} to the config file found in the given {@code directory}.
*/
public static Path getConfigfile(final Path directory) {
private static Path getConfigfile(final Path directory) {

Utils.nonNull(directory);

Expand Down Expand Up @@ -225,7 +285,7 @@ else if ( configFileSet.size() == 0 ) {
}

/** @return {@code true} if the given {@link Path} exists, is readable, and is a directory; {@code false} otherwise. */
public static boolean isValidDirectory(final Path p) {
private static boolean isValidDirectory(final Path p) {
Utils.nonNull(p);
return Files.exists(p) && Files.isReadable(p) && Files.isDirectory(p);
}
Expand Down Expand Up @@ -628,7 +688,7 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {
Integer versionYear = null;
Integer versionMonth = null;
Integer versionDay = null;
String versionDecorator = null;
String versionDecorator;
String source = null;
String alternateSource = null;

Expand Down Expand Up @@ -711,46 +771,51 @@ private static boolean logDataSourcesInfo(final Path dataSourcesPath) {

// Warn the user if they need newer stuff.
if ( !dataSourcesPathIsAcceptable ) {

String message = "";
message = message + "ERROR: Given data source path is too old! Minimum required version is: " + CURRENT_MINIMUM_DATA_SOURCE_VERSION + " (yours: " + version + ")\n";
message = message + " You must download a newer version of the data sources from the Broad Institute FTP site: " + DATA_SOURCES_FTP_PATH + "\n";
message = message + "ERROR: Given data source path is too old or too new! \n";
message = message + " Minimum required version is: " + CURRENT_MINIMUM_DATA_SOURCE_VERSION + "\n";
message = message + " Maximum allowed version is: " + CURRENT_MAXIMUM_DATA_SOURCE_VERSION + "\n";
message = message + " Yours: " + version + "\n";
message = message + " You must download a compatible version of the data sources from the Broad Institute FTP site: " + DATA_SOURCES_FTP_PATH + "\n";
message = message + " or the Broad Institute Google Bucket: " + DATA_SOURCES_BUCKET_PATH + "\n";
throw new UserException( message );
}

return dataSourcesPathIsAcceptable;
}

/**
* Checks that the version information given is within the valid range for data source versions.
*
* @param major int containing the major version number to be checked.
* @param minor int containing the minor version number to be checked.
* @param year int containing the year version number to be checked.
* @param month int containing the month version number to be checked.
* @param day int containing the day version number to be checked.
*
* @return {@code true} iff the given version information is valid for the current data source ranges. {@code false} otherwise.
*/
@VisibleForTesting
static boolean validateVersionInformation(final int major, final int minor, final int year, final int month, final int day) {

// Compare from largest to smallest differences:

if ( major < MIN_MAJOR_VERSION_NUMBER ) {
// Compare Major Version:
if ((major < MIN_MAJOR_VERSION_NUMBER) || (major > MAX_MAJOR_VERSION_NUMBER)) {
return false;
}

if ( minor < MIN_MINOR_VERSION_NUMBER ) {
// Compare minor version if we're on the edge of versions:
if ( major == MIN_MAJOR_VERSION_NUMBER && minor < MIN_MINOR_VERSION_NUMBER ) {
return false;
}

if ( year < MIN_YEAR_RELEASED ) {
if ( major == MAX_MAJOR_VERSION_NUMBER && minor > MAX_MINOR_VERSION_NUMBER ) {
return false;
}

else if ( year == MIN_YEAR_RELEASED ) {
if ( month < MIN_MONTH_RELEASED ) {
return false;
}
else if ( month == MIN_MONTH_RELEASED ) {
if ( day < MIN_DAY_RELEASED ) {
return false;
}
}
}
// Now make sure the date is between or equal to the min and max date:
final LocalDate versionDate = LocalDate.of(year, month, day);

return true;
// A valid date is between min and max date inclusive.
return (!versionDate.isBefore(MIN_DATE)) && (!versionDate.isAfter(MAX_DATE));
}

// ========================================================================================================
Expand Down
Loading

0 comments on commit 9d5727d

Please sign in to comment.