diff --git a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl
index 1ce992a350e..35d4cb56a66 100644
--- a/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl
+++ b/scripts/variantstore/variant-annotations-table/GvsCreateVATfromVDS.wdl
@@ -85,7 +85,7 @@ workflow GvsCreateVATfromVDS {
 
     # If the vat version is undefined or v1 then the vat tables would be named like filter_vat, otherwise filter_vat_v2.
     String effective_vat_version = if (defined(vat_version) && select_first([vat_version]) != "v1") then "_" + select_first([vat_version]) else ""
-    String vat_table_name = filter_set_name + "_vat" + effective_vat_version
+    String effective_vat_table_name = filter_set_name + "_vat" + effective_vat_version
 
     String output_path_without_a_trailing_slash = sub(output_path, "/$", "")
     String effective_output_path = if (output_path == output_path_without_a_trailing_slash) then output_path + "/" else output_path
@@ -262,7 +262,7 @@ workflow GvsCreateVATfromVDS {
             project_id = project_id,
             dataset_name = dataset_name,
             output_path = effective_output_path,
-            base_vat_table_name = vat_table_name,
+            base_vat_table_name = effective_vat_table_name,
             prep_vt_json_done = PrepVtAnnotationJson.done,
             prep_genes_json_done = PrepGenesAnnotationJson.done,
             cloud_sdk_docker = effective_cloud_sdk_docker,
@@ -271,7 +271,7 @@ workflow GvsCreateVATfromVDS {
     call DeduplicateVatInBigQuery {
         input:
             input_vat_table_name = BigQueryLoadJson.vat_table,
-            output_vat_table_name = vat_table_name,
+            output_vat_table_name = effective_vat_table_name,
             nirvana_schema = MakeSubpopulationFilesAndReadSchemaFiles.vat_schema_json_file,
             project_id = project_id,
             dataset_name = dataset_name,
@@ -294,6 +294,7 @@ workflow GvsCreateVATfromVDS {
     }
 
     output {
+        String vat_table_name = effective_vat_table_name
         String? cluster_name = GenerateSitesOnlyVcf.cluster_name
         File? dropped_sites_file = MergeTsvs.output_file
         File? final_tsv_file = GvsCreateVATFilesFromBigQuery.final_tsv_file
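The rename above frees `vat_table_name` to be re-exported as a workflow output (see the `output` block) while `effective_vat_table_name` carries the naming rule described in the comment. As a minimal sketch of that rule, assuming nothing beyond what the hunk shows (the workflow name and default values here are illustrative, not part of the change):

version 1.0

# An undefined or "v1" vat_version yields "<filter_set_name>_vat";
# any other version appends "_<version>", e.g. "my_filters_vat_v2".
workflow ShowVatTableNaming {
    input {
        String filter_set_name = "my_filters"   # hypothetical default
        String? vat_version                     # e.g. "v2"; leave unset for v1-style naming
    }

    String suffix = if (defined(vat_version) && select_first([vat_version]) != "v1") then "_" + select_first([vat_version]) else ""

    output {
        String vat_table_name = filter_set_name + "_vat" + suffix
    }
}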
diff --git a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl
index 3b9e30958a1..a49bc65a655 100644
--- a/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl
+++ b/scripts/variantstore/variant-annotations-table/GvsValidateVAT.wdl
@@ -8,6 +8,7 @@ workflow GvsValidateVat {
         String project_id
         String dataset_name
         String vat_table_name
+        Boolean? is_small_callset
         String? cloud_sdk_docker
         String? variants_docker
     }
@@ -25,20 +26,23 @@ workflow GvsValidateVat {
     String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
     String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker])
 
-    call Utils.GetBQTableLastModifiedDatetime as SampleDateTime {
-        input:
-            project_id = project_id,
-            fq_table = fq_vat_table,
-            cloud_sdk_docker = effective_cloud_sdk_docker,
-    }
+    # Defining is_small_callset allows us to run this WDL on a dataset that has not had samples loaded (for testing)
+    if (!defined(is_small_callset)) {
+        call Utils.GetBQTableLastModifiedDatetime as SampleDateTime {
+            input:
+                project_id = project_id,
+                fq_table = fq_sample_table,
+                cloud_sdk_docker = effective_cloud_sdk_docker,
+        }
 
-    call Utils.GetNumSamplesLoaded {
-        input:
-            fq_sample_table = fq_sample_table,
-            project_id = project_id,
-            sample_table_timestamp = SampleDateTime.last_modified_timestamp,
-            control_samples = false,
-            cloud_sdk_docker = effective_cloud_sdk_docker,
+        call Utils.GetNumSamplesLoaded {
+            input:
+                fq_sample_table = fq_sample_table,
+                project_id = project_id,
+                sample_table_timestamp = SampleDateTime.last_modified_timestamp,
+                control_samples = false,
+                cloud_sdk_docker = effective_cloud_sdk_docker,
+        }
     }
 
     call Utils.GetBQTableLastModifiedDatetime as VatDateTime {
@@ -152,8 +156,9 @@ workflow GvsValidateVat {
             cloud_sdk_docker = effective_cloud_sdk_docker,
     }
-    # only check certain things if the callset is larger than 10,000 samples (a guess)
-    Boolean callset_is_small = GetNumSamplesLoaded.num_samples < 10000
+    # If the input Boolean `is_small_callset` is defined, use it directly; otherwise use the
+    # `GetNumSamplesLoaded` task to count the samples in the callset and set the flag if there are fewer than 10,000.
+    Boolean callset_is_small = select_first([is_small_callset, select_first([GetNumSamplesLoaded.num_samples, 1]) < 10000])
 
     if (!callset_is_small) {
         call ClinvarSignificance {
             input:
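The hunks above replace an unconditional sample count with an override-or-compute pattern: the optional `is_small_callset` input short-circuits the BigQuery lookup entirely, and `select_first` prefers the override when it is defined. A minimal sketch of the same pattern, with `measured_num_samples` standing in for the optional `GetNumSamplesLoaded.num_samples` output (all names here are illustrative):

version 1.0

workflow ShowOptionalOverride {
    input {
        Boolean? is_small_callset     # caller-supplied override, e.g. for test datasets with no samples loaded
        Int? measured_num_samples     # absent when the measuring task was skipped
    }

    # The inner select_first supplies a sentinel so the comparison is well-defined even
    # when no measurement exists; the outer select_first prefers the explicit override.
    Boolean callset_is_small = select_first([is_small_callset, select_first([measured_num_samples, 1]) < 10000])

    output {
        Boolean small = callset_is_small
    }
}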
diff --git a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
index 903fa64fbf0..caee90d29f6 100644
--- a/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
+++ b/scripts/variantstore/wdl/test/GvsQuickstartIntegration.wdl
@@ -2,6 +2,7 @@ version 1.0
 
 import "GvsQuickstartVcfIntegration.wdl" as QuickstartVcfIntegration
 import "GvsQuickstartHailIntegration.wdl" as QuickstartHailIntegration
+import "GvsQuickstartVATIntegration.wdl" as QuickstartVATIntegration
 import "../GvsJointVariantCalling.wdl" as JointVariantCalling
 import "../GvsUtils.wdl" as Utils
 
@@ -14,6 +15,8 @@ workflow GvsQuickstartIntegration {
         Boolean run_exome_integration = true
         Boolean run_beta_integration = true
         Boolean run_bge_integration = true
+        Boolean run_vat_integration = true
+        Boolean run_vat_integration_test_from_vds = true # If false, will use sites-only VCF
         String sample_id_column_name = "sample_id"
         String vcf_files_column_name = "hg38_reblocked_gvcf"
         String vcf_index_files_column_name = "hg38_reblocked_gvcf_index"
@@ -25,6 +28,7 @@ workflow GvsQuickstartIntegration {
         String? cloud_sdk_docker
         String? cloud_sdk_slim_docker
         String? variants_docker
+        String? variants_nirvana_docker
         String? gatk_docker
         String? hail_version
         Boolean chr20_X_Y_only = true
@@ -36,6 +40,7 @@ workflow GvsQuickstartIntegration {
     File full_exome_interval_list = "gs://gcp-public-data--broad-references/hg38/v0/bge_exome_calling_regions.v1.1.interval_list"
     String expected_subdir = if (!chr20_X_Y_only) then "all_chrs/" else ""
     File expected_output_prefix = "gs://gvs-internal-quickstart/integration/2024-10-29/" + expected_subdir
+    File truth_data_prefix = "gs://gvs-internal-quickstart/integration/test_data/2025-01-17/"
 
     # WDL 1.0 trick to set a variable ('none') to be undefined.
     if (false) {
@@ -53,6 +58,7 @@ workflow GvsQuickstartIntegration {
     String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
     String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker])
     String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker])
+    String effective_variants_nirvana_docker = select_first([variants_nirvana_docker, GetToolVersions.variants_nirvana_docker])
     String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker])
     String effective_hail_version = select_first([hail_version, GetToolVersions.hail_version])
 
@@ -73,6 +79,10 @@ workflow GvsQuickstartIntegration {
         }
     }
 
+    String workspace_bucket = GetToolVersions.workspace_bucket
+    String workspace_id = GetToolVersions.workspace_id
+    String submission_id = GetToolVersions.submission_id
+
     # Note for `GvsQuickstartIntegration` we use the git_branch_or_tag *input* and its corresponding git hash. This is not
     # necessarily the same as the branch name selected in Terra for the integration `GvsQuickstartIntegration` workflow,
     # though in practice likely they are the same.
@@ -99,9 +109,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 hail_version = effective_hail_version,
                 maximum_alternate_alleles = maximum_alternate_alleles,
                 ploidy_table_name = ploidy_table_name,
@@ -139,9 +149,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 maximum_alternate_alleles = maximum_alternate_alleles,
         }
         call QuickstartVcfIntegration.GvsQuickstartVcfIntegration as QuickstartVcfVQSRIntegration {
@@ -166,9 +176,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 maximum_alternate_alleles = maximum_alternate_alleles,
         }
 
@@ -212,9 +222,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 maximum_alternate_alleles = maximum_alternate_alleles,
                 target_interval_list = target_interval_list,
         }
@@ -251,9 +261,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 maximum_alternate_alleles = maximum_alternate_alleles,
                 target_interval_list = target_interval_list,
         }
@@ -270,8 +280,6 @@ workflow GvsQuickstartIntegration {
 
     if (run_beta_integration) {
         String project_id = "gvs-internal"
-        String workspace_bucket = GetToolVersions.workspace_bucket
-        String submission_id = GetToolVersions.submission_id
         String extract_output_gcs_dir = "~{workspace_bucket}/output_vcfs/by_submission_id/~{submission_id}/beta"
 
         Boolean collect_variant_calling_metrics = true
@@ -298,9 +306,9 @@ workflow GvsQuickstartIntegration {
                 cloud_sdk_docker = effective_cloud_sdk_docker,
                 variants_docker = effective_variants_docker,
                 gatk_docker = effective_gatk_docker,
-                workspace_bucket = GetToolVersions.workspace_bucket,
-                workspace_id = GetToolVersions.workspace_id,
-                submission_id = GetToolVersions.submission_id,
+                workspace_bucket = workspace_bucket,
+                workspace_id = workspace_id,
+                submission_id = submission_id,
                 maximum_alternate_alleles = maximum_alternate_alleles,
                 git_branch_or_tag = git_branch_or_tag,
                 sample_id_column_name = sample_id_column_name,
@@ -319,6 +327,28 @@ workflow GvsQuickstartIntegration {
         }
     }
 
+    if (run_vat_integration) {
+        String extract_vat_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat"
+
+        call QuickstartVATIntegration.GvsQuickstartVATIntegration {
+            input:
+                git_branch_or_tag = git_branch_or_tag,
+                git_hash = GetToolVersions.git_hash,
+                use_default_dockers = use_default_dockers,
+                truth_data_prefix = truth_data_prefix,
+                expected_output_prefix = expected_output_prefix,
+                dataset_suffix = "vat",
+                output_path = extract_vat_output_gcs_dir,
+                use_vds_as_input = run_vat_integration_test_from_vds,
+                basic_docker = effective_basic_docker,
+                cloud_sdk_docker = effective_cloud_sdk_docker,
+                cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
+                variants_docker = effective_variants_docker,
+                variants_nirvana_docker = effective_variants_nirvana_docker,
+                gatk_docker = effective_gatk_docker,
+        }
+    }
+
     output {
         String recorded_git_hash = GetToolVersions.git_hash
     }
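The new `run_vat_integration` block follows the same guard pattern as the other integration branches: a Boolean input gates the call, and the hoisted workflow-level `workspace_bucket`/`workspace_id`/`submission_id` Strings are shared across branches instead of re-reading `GetToolVersions` outputs in each one. A minimal sketch of the guard pattern, assuming only WDL 1.0 semantics (workflow name and default values are illustrative):

version 1.0

workflow ShowConditionalBranch {
    input {
        Boolean run_vat_integration = true
        String workspace_bucket = "gs://example-bucket"   # hypothetical value
        String submission_id = "demo-submission"          # hypothetical value
    }

    if (run_vat_integration) {
        String extract_vat_output_gcs_dir = "~{workspace_bucket}/output_vat/by_submission_id/~{submission_id}/vat"
    }

    output {
        # A declaration made inside a conditional is optional outside of it.
        String? vat_output_dir = extract_vat_output_gcs_dir
    }
}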
diff --git a/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl
new file mode 100644
index 00000000000..d49b8815b0d
--- /dev/null
+++ b/scripts/variantstore/wdl/test/GvsQuickstartVATIntegration.wdl
@@ -0,0 +1,201 @@
+version 1.0
+
+import "../GvsUtils.wdl" as Utils
+import "../../variant-annotations-table/GvsCreateVATfromVDS.wdl" as CreateVATFromVDS
+import "../../variant-annotations-table/GvsValidateVAT.wdl" as ValidateVAT
+
+workflow GvsQuickstartVATIntegration {
+    input {
+        String git_branch_or_tag
+        String? git_hash
+        Boolean use_default_dockers = false
+        String truth_data_prefix
+        String expected_output_prefix
+        String dataset_suffix
+        Boolean use_vds_as_input = true # If true, use a VDS; otherwise use a sites-only VCF.
+        String output_path
+        Int split_intervals_scatter_count = 10
+        String? basic_docker
+        String? cloud_sdk_docker
+        String? cloud_sdk_slim_docker
+        String? variants_docker
+        String? variants_nirvana_docker
+        String? gatk_docker
+    }
+    String project_id = "gvs-internal"
+
+    File ancestry_path = truth_data_prefix + "quickstart_ancestry.tsv"
+    File? vds_path = if (use_vds_as_input) then truth_data_prefix + "gvs_export.vds" else none
+    File? sites_only_vcf = if (!use_vds_as_input) then truth_data_prefix + "quickstart_sites_only.vcf.bgz" else none
+
+    # WDL 1.0 trick to set a variable ('none') to be undefined.
+    if (false) {
+        File? none = ""
+    }
+
+    call Utils.GetToolVersions {
+        input:
+            git_branch_or_tag = git_branch_or_tag,
+    }
+
+    String effective_basic_docker = select_first([basic_docker, GetToolVersions.basic_docker])
+    String effective_cloud_sdk_docker = select_first([cloud_sdk_docker, GetToolVersions.cloud_sdk_docker])
+    String effective_cloud_sdk_slim_docker = select_first([cloud_sdk_slim_docker, GetToolVersions.cloud_sdk_slim_docker])
+    String effective_variants_docker = select_first([variants_docker, GetToolVersions.variants_docker])
+    String effective_variants_nirvana_docker = select_first([variants_nirvana_docker, GetToolVersions.variants_nirvana_docker])
+    String effective_gatk_docker = select_first([gatk_docker, GetToolVersions.gatk_docker])
+    String effective_git_hash = select_first([git_hash, GetToolVersions.git_hash])
+
+    if (!use_default_dockers) {
+        call Utils.BuildGATKJar {
+            input:
+                git_branch_or_tag = git_branch_or_tag,
+                cloud_sdk_slim_docker = effective_cloud_sdk_slim_docker,
+        }
+    }
+
+    call Utils.CreateDatasetForTest {
+        input:
+            git_branch_or_tag = git_branch_or_tag,
+            dataset_prefix = "quickit",
+            dataset_suffix = dataset_suffix,
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+    }
+
+    call CreateVATFromVDS.GvsCreateVATfromVDS as CreateVATFromVDS {
+        input:
+            project_id = project_id,
+            dataset_name = CreateDatasetForTest.dataset_name,
+            ancestry_file = ancestry_path,
+            filter_set_name = "quickit",
+            vds_path = vds_path,
+            sites_only_vcf = sites_only_vcf,
+            output_path = output_path,
+            split_intervals_scatter_count = split_intervals_scatter_count,
+            git_branch_or_tag = git_branch_or_tag,
+            basic_docker = effective_basic_docker,
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+            gatk_docker = effective_gatk_docker,
+            variants_docker = effective_variants_docker,
+            variants_nirvana_docker = effective_variants_nirvana_docker,
+    }
+
+    call ValidateVAT.GvsValidateVat {
+        input:
+            project_id = project_id,
+            dataset_name = CreateDatasetForTest.dataset_name,
+            vat_table_name = CreateVATFromVDS.vat_table_name,
+            is_small_callset = true,
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+            variants_docker = effective_variants_docker,
+    }
+
+    String expected_prefix = expected_output_prefix + dataset_suffix + "/"
+    call AssertIdenticalOutputs {
+        input:
+            actual_file = select_first([CreateVATFromVDS.final_tsv_file]),
+            expected_file = expected_prefix + "vat_complete.bgz.tsv.gz",
+            gatk_docker = effective_gatk_docker
+    }
+
+    call AssertTableSizeIsAsExpected {
+        input:
+            dataset_name = CreateDatasetForTest.dataset_name,
+            project_id = project_id,
+            vat_table_name = CreateVATFromVDS.vat_table_name,
+            expected_output_csv = expected_prefix + "table_sizes.csv",
+            cloud_sdk_docker = effective_cloud_sdk_docker,
+    }
+
+    output {
+        String dataset_name = CreateDatasetForTest.dataset_name
+        String recorded_git_hash = effective_git_hash
+    }
+}
+
+task AssertIdenticalOutputs {
+    input {
+        File actual_file
+        File expected_file
+        String gatk_docker
+    }
+
+    Int disk_size_gb = ceil(10 * size(actual_file, "GiB") + 10 * size(expected_file, "GiB")) + 200
+
+    command <<<
+        # Prepend date, time and pwd to xtrace log entries.
+        PS4='\D{+%F %T} \w $ '
+        set -o errexit -o nounset -o pipefail -o xtrace
+
+        cat ~{actual_file} | gunzip | sort > actual_file.txt
+        cat ~{expected_file} | gunzip | sort > expected_file.txt
+
+        set +o errexit
+        cmp actual_file.txt expected_file.txt
+        rc=$?
+        set -o errexit
+
+        if [[ $rc -ne 0 ]]; then
+            echo "The observed file ~{actual_file} differs from the expected ~{expected_file}!"
+            exit 1;
+        fi
+    >>>
+
+    runtime {
+        docker: gatk_docker
+        disks: "local-disk ${disk_size_gb} HDD"
+    }
+
+    output {
+        Boolean done = true
+    }
+}
+
+task AssertTableSizeIsAsExpected {
+    meta {
+        # we want to check the database each time this runs
+        volatile: true
+    }
+
+    input {
+        String dataset_name
+        String project_id
+        String vat_table_name
+        File expected_output_csv
+        String cloud_sdk_docker
+    }
+
+    command <<<
+        # Prepend date, time and pwd to xtrace log entries.
+        PS4='\D{+%F %T} \w $ '
+        set -o errexit -o nounset -o pipefail -o xtrace
+
+        mkdir output
+
+        echo "project_id = ~{project_id}" > ~/.bigqueryrc
+        bq --apilog=false query --project_id=~{project_id} --format=csv --use_legacy_sql=false \
+            "SELECT 'vat_total' AS total_name, sum(total_billable_bytes) AS total_bytes \
+            FROM \`~{dataset_name}.INFORMATION_SCHEMA.PARTITIONS\` \
+            WHERE table_name = '~{vat_table_name}'" > output/table_sizes.csv
+
+        set +o errexit
+        diff -w output/table_sizes.csv ~{expected_output_csv} > differences.txt
+        set -o errexit
+
+        if [[ -s differences.txt ]]; then
+            echo "Differences found:"
+            cat differences.txt
+            exit 1
+        fi
+    >>>
+
+    runtime {
+        docker: cloud_sdk_docker
+        disks: "local-disk 10 HDD"
+    }
+
+    output {
+        File table_sizes_output_csv = "output/table_sizes.csv"
+        File differences = "differences.txt"
+    }
+}
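One detail worth noting in `AssertTableSizeIsAsExpected` is the `volatile: true` meta entry: Cromwell treats volatile tasks as ineligible for call caching, so the BigQuery table size is re-queried on every run rather than replayed from a cached result. A minimal sketch of the idiom in isolation (the task name and command are illustrative, not from this change):

version 1.0

task AlwaysRunQuery {
    meta {
        # Opt out of call caching so live external state is re-checked on every attempt.
        volatile: true
    }

    command <<<
        # Stand-in for a query against mutable external state, e.g. the bq call above.
        date -u
    >>>

    runtime {
        docker: "ubuntu:22.04"
    }

    output {
        String checked_at = read_string(stdout())
    }
}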