From 43c4e3f2e2b3e85e065269b9932bc144c67cded2 Mon Sep 17 00:00:00 2001 From: Vahid Date: Mon, 9 Jan 2023 11:07:54 -0800 Subject: [PATCH 01/19] Update the github action and configuration of the website (#481) * Update github actions for building the website. * Update baseUrl following github pages recommendation. --- .github/workflows/docs.yml | 40 +++++++++++++----------------------- website/docusaurus.config.js | 2 +- 2 files changed, 15 insertions(+), 27 deletions(-) diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 22b5647a4..97ec63724 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -7,41 +7,32 @@ on: branches: [master] jobs: - checks: + Test: if: github.event_name != 'push' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - lfs: 'true' - - name: checkoutLFS - uses: actions/checkout@v2 - run: git lfs pull - - uses: actions/setup-node@v1 + lfs: 'true' + - uses: actions/setup-node@v3 with: node-version: '18.x' - name: Test Build run: | cd website - if [ -e yarn.lock ]; then - yarn install --frozen-lockfile - elif [ -e package-lock.json ]; then - npm ci - else - npm i + if [ -e yarn.lock ]; then yarn install --frozen-lockfile; + elif [ -e package-lock.json ]; then npm ci; + else npm i; fi npm run build - gh-release: + Deploy: if: github.event_name != 'pull_request' runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 with: - lfs: 'true' - - name: checkoutLFS - uses: actions/checkout@v2 - run: git lfs pull - - uses: actions/setup-node@v1 + lfs: 'true' + - uses: actions/setup-node@v3 with: node-version: '18.x' - name: Release to GitHub Pages @@ -52,12 +43,9 @@ jobs: git config --global user.email "actions@gihub.com" git config --global user.name "gh-actions" cd website - if [ -e yarn.lock ]; then - yarn install --frozen-lockfile - elif [ -e package-lock.json ]; then - npm ci - else - npm i + if [ -e yarn.lock ]; then yarn install --frozen-lockfile; + elif [ -e package-lock.json ]; then npm ci; + else npm i; fi npx docusaurus build - name: Deploy diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index c37246116..3cd083d6c 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -9,7 +9,7 @@ const config = { title: 'GATK-SV', tagline: 'A cloud-native pipeline for calling structural variations on short-read sequencing data', url: 'https://broadinstitute.github.io', - baseUrl: '/', + baseUrl: '/gatk-sv/', onBrokenLinks: 'throw', onBrokenMarkdownLinks: 'throw', //favicon: 'img/favicon.ico', From c5040350d2bbde8be77e6d9d1e420b72d6c0be72 Mon Sep 17 00:00:00 2001 From: Vahid Date: Wed, 18 Jan 2023 15:33:55 -0800 Subject: [PATCH 02/19] Refactor references to the gatk-sv master branch to main. (#482) --- .github/workflows/README.md | 4 ++-- .github/workflows/docs.yml | 4 ++-- .github/workflows/pytest.yaml | 4 ++-- .github/workflows/sv_pipeline_docker.yml | 10 +++++----- .github/workflows/testwdls.yaml | 4 ++-- carrot/README.md | 4 ++-- carrot/carrot_helper.py | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 2bf6de818..3b60c42a6 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -56,7 +56,7 @@ specifically: 3. `Publish`. This job is triggered when a PR is merged to a commit - is pushed to the `master` branch. Similar to the `Test Images Build` job, + is pushed to the `main` branch. 
Similar to the `Test Images Build` job, this job builds Docker images and fails if the build process was unsuccessful. However, in addition, this job pushes the built images to GCR. To authorize access to GCR, this job assumes a GCP service @@ -133,7 +133,7 @@ In order to set up the `Deploy` environment, you may take the following steps: Once the `Deploy` environment is set up, and the `Required reviewers` option under the section `Environment protection rules` is checked, -with every push to the `master` branch (e.g., merging a PR), the +with every push to the `main` branch (e.g., merging a PR), the DIW execution will pause at the `Publish` job with the following message: diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 97ec63724..3d861fc53 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -2,9 +2,9 @@ name: documentation on: pull_request: - branches: [master] + branches: [main] push: - branches: [master] + branches: [main] jobs: Test: diff --git a/.github/workflows/pytest.yaml b/.github/workflows/pytest.yaml index d43a8f822..a296b500d 100644 --- a/.github/workflows/pytest.yaml +++ b/.github/workflows/pytest.yaml @@ -2,9 +2,9 @@ name: Test Python Scripts on: push: - branches: [ master ] + branches: [ main ] pull_request: - branches: [ master ] + branches: [ main ] jobs: linting: diff --git a/.github/workflows/sv_pipeline_docker.yml b/.github/workflows/sv_pipeline_docker.yml index 083ef73b3..6bb636dd4 100644 --- a/.github/workflows/sv_pipeline_docker.yml +++ b/.github/workflows/sv_pipeline_docker.yml @@ -3,7 +3,7 @@ name: Docker Images on: push: branches: - - master + - main paths: - 'src/**' - 'dockerfiles/**' @@ -11,7 +11,7 @@ on: - '.github/workflows/sv_pipeline_docker.yml' pull_request: branches: - - master + - main paths: - 'src/**' - 'dockerfiles/**' @@ -43,7 +43,7 @@ jobs: id: commit_sha # This action determines the SHA of two commits: # - BASE (BASE_SHA): The commit SHA of the base branch (e.g., - # broadinstitute/gatk-sv:master) which the feature branch targets. + # broadinstitute/gatk-sv:main) which the feature branch targets. # - HEAD (HEAD_SHA): The commit SHA of the latest commit on the # feature branch. # @@ -51,7 +51,7 @@ jobs: # # X---Y---Z feature # / \ - # A---B---C---D---E master + # A---B---C---D---E main # # 'E' is the merge commit (e.g., 'Merge pull request #0'). # @@ -222,5 +222,5 @@ jobs: git config --global user.name 'gatk-sv-bot' git config --global user.email '101641599+gatk-sv-bot@users.noreply.github.com' git commit ./inputs/values/dockers.json -m "Update docker images list, triggered by "${COMMIT_SHA::8} - git pull --rebase origin master + git pull --rebase origin main git push diff --git a/.github/workflows/testwdls.yaml b/.github/workflows/testwdls.yaml index 420ca54f5..60055b1be 100644 --- a/.github/workflows/testwdls.yaml +++ b/.github/workflows/testwdls.yaml @@ -3,12 +3,12 @@ name: Test WDLs on: push: branches: - - master + - main paths: - 'wdl/**' pull_request: branches: - - master + - main paths: - 'wdl/**' diff --git a/carrot/README.md b/carrot/README.md index c1fb0a323..35f458452 100644 --- a/carrot/README.md +++ b/carrot/README.md @@ -133,7 +133,7 @@ a publicly accessible GitHub repository. Therefore, in order to define/update tests, `carrot_helper` requires to know the GitHub repository and the git branch where the test and evaluation WDLs are available. If you want to run existing tests, you may use `https://github.com/broadinstitute/gatk-sv` and -`master` for repository and branch respectively. 
If you are developing +`main` for repository and branch respectively. If you are developing a carrot test for a WDL, then you may set the repository to your fork of `github.com/broadinstitute/gatk-sv` and set the branch to your feature branch. @@ -181,7 +181,7 @@ and any necessary mapping between them) in the `.carrot_pipelines.json`. The `.carrot_pipelines.json` file tracked on git contains metadata belonging to the `carrot` resources defined for tests and WDLs available from the -`master` branch of the +`main` branch of the [`github.com/broadinstitute/gatk-sv`](https://github.com/broadinstitute/gatk-sv) repository on a `carrot` server maintained for internal use at the Broad institute. You may use this file to run and updated (read the following) diff --git a/carrot/carrot_helper.py b/carrot/carrot_helper.py index 87602d584..e9474b292 100644 --- a/carrot/carrot_helper.py +++ b/carrot/carrot_helper.py @@ -792,7 +792,7 @@ def init_config(): print("\nPlease enter the branch that contains the test and evaluation" "WDLs and their input JSON files. If you want to run the " "existing tests on broadinstitute/gatk-sv, this can be " - "`master`, or the name of your feature branch.") + "`main`, or the name of your feature branch.") branch = None while not branch or not is_url_accessible(f"{repo}/tree/{branch}"): branch = input("\tPlease enter branch name: ") From 5d1d5e0ca321b6235ae77952accfb8186f74a547 Mon Sep 17 00:00:00 2001 From: Vahid Date: Thu, 19 Jan 2023 05:26:32 -0800 Subject: [PATCH 03/19] Update setup.py (#485) --- src/svtest/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/svtest/setup.py b/src/svtest/setup.py index c47a37de1..41049eb1c 100644 --- a/src/svtest/setup.py +++ b/src/svtest/setup.py @@ -7,7 +7,7 @@ description='Test package for the GATK SV pipeline', classifiers=[ 'Development Status :: 3 - Alpha', - 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.8', ], url='https://github.com/talkowski-lab/gatk-sv-v1', author='Mark Walker', @@ -16,4 +16,4 @@ include_package_data=True, zip_safe=False, scripts=['scripts/svtest'], - install_requires=['numpy', 'matplotlib']) + install_requires=['numpy', 'matplotlib', 'pandas', 'intervaltree', 'pysam']) From dde4314db2d827ae112f19c2b09694bb4caac094 Mon Sep 17 00:00:00 2001 From: gatk-sv-bot <101641599+gatk-sv-bot@users.noreply.github.com> Date: Thu, 19 Jan 2023 13:38:07 +0000 Subject: [PATCH 04/19] Update docker images list, triggered by 5d1d5e0c --- inputs/values/dockers.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index a1f3ae281..3cb491d24 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,12 +13,12 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2022-06-10-v0.23-beta-9c6fbf56", - "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", - "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", - "sv_pipeline_updates_docker": 
"us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", - "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-04-v0.26.7-beta-da4c0aba", + "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", "wham_docker": "us.gcr.io/broad-dsde-methods/wham:8645aa", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", From 60b804a168dec6a63765742ed36c9486899c4bf7 Mon Sep 17 00:00:00 2001 From: Vahid Date: Thu, 19 Jan 2023 05:39:59 -0800 Subject: [PATCH 05/19] Add algolia setup. (#486) --- website/docusaurus.config.js | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 3cd083d6c..7120ef32e 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -116,6 +116,12 @@ const config = { ], copyright: `Copyright © ${new Date().getFullYear()} BroadInstitute, Built with Docusaurus.`, }, + algolia: { + appId: 'LI6UMHUDIS', + apiKey: '97d929d265c25db1ed1816391a2a719a', + indexName: 'gatk-sv', + contextualSearch: true + }, prism: { theme: lightCodeTheme, darkTheme: darkCodeTheme, From 902a31973205ad947e748136630deb7795173f5f Mon Sep 17 00:00:00 2001 From: Vahid Date: Fri, 20 Jan 2023 16:39:19 -0500 Subject: [PATCH 06/19] Extend the `testwdls` workflow to test WDLs using WOMtool (#456) * Refactor test names * Add scripts for the validation using womtool. --- .github/workflows/testwdls.yaml | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/.github/workflows/testwdls.yaml b/.github/workflows/testwdls.yaml index 60055b1be..a6a5cd10a 100644 --- a/.github/workflows/testwdls.yaml +++ b/.github/workflows/testwdls.yaml @@ -13,28 +13,48 @@ on: - 'wdl/**' jobs: - miniwdl_job: + syntax_test_job: runs-on: ubuntu-22.04 - name: miniwdl validation + name: Check WDL Syntax strategy: matrix: python-version: ['3.8'] steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v3 - name: Setup Python uses: actions/setup-python@v2 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies + - name: Setup Java + uses: actions/setup-java@v3 + with: + # See https://github.com/actions/setup-java for the available options. 
+ distribution: 'temurin' + java-version: '17' + + - name: Setup run: | python -m pip install --upgrade pip + # Setup for running miniwdl pip install miniwdl + + # Setup for running womtool + pip install jinja2==3.1.2 + wget -O womtool.jar https://github.com/broadinstitute/cromwell/releases/download/84/womtool-84.jar + echo \ + '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' \ + > inputs/values/google_cloud.my_project.json + scripts/inputs/build_default_inputs.sh -d . -c google_cloud.my_project - - name: Run Tests + - name: Test with Miniwdl run: | python scripts/test/miniwdl_validation.py \ --imports-dir wdl \ wdl/*.wdl + + - name: Test with WOMtool + run: | + scripts/test/validate.sh -d . -j womtool.jar From a08504831d4084d70700e6e29cafca1d48dd4a3f Mon Sep 17 00:00:00 2001 From: Vahid Date: Tue, 31 Jan 2023 16:28:26 -0500 Subject: [PATCH 07/19] Delete update_json_docker.sh (#491) --- scripts/docker/update_json_docker.sh | 74 ---------------------------- 1 file changed, 74 deletions(-) delete mode 100755 scripts/docker/update_json_docker.sh diff --git a/scripts/docker/update_json_docker.sh b/scripts/docker/update_json_docker.sh deleted file mode 100755 index 97ef6e296..000000000 --- a/scripts/docker/update_json_docker.sh +++ /dev/null @@ -1,74 +0,0 @@ -#!/bin/bash - - -#################################### -#################################### -# Updates GATKSV json file dockers -#################################### -#################################### - -set -euo pipefail - -################################################# -# Utility functions -################################################# - -function usage() { - printf "Usage: \n \ - %s -d -r -n -t \n \ - \t path to gatk-sv-v1 base directory \n \ - \t docker root/user (e.g. \"gatksv\") \n \ - \t docker image name (e.g. \"sv-pipeline\", \"ALL\") \n \ - \t docker tag \n" "$1" -} - -if [[ "$#" == 0 ]]; then - usage "$0"; exit 0; -fi - -################################################# -# Parsing arguments -################################################# -while getopts "d:r:n:t:" option; do - case "$option" in - d) BASE_DIR="$OPTARG" ;; - r) DOCKER_ROOT="$OPTARG" ;; - n) DOCKER_NAME="$OPTARG" ;; - t) DOCKER_TAG="$OPTARG" ;; - *) usage "$0" && exit 1 ;; - esac -done - -if [[ -z "$BASE_DIR" ]] || [[ -z "$DOCKER_ROOT" ]] || [[ -z "$DOCKER_NAME" ]] || [[ -z "$DOCKER_TAG" ]]; then - usage "$0" - exit 1 -fi - -if [[ ! -d "$BASE_DIR" ]]; then - echo "Invalid directory: $BASE_DIR" - exit 1 -fi - -if [[ ${DOCKER_NAME} == "ALL" ]]; then - DOCKER_NAME_ARR=("cnmops" "manta" "samtools-cloud" "sv-base" "sv-base-mini" "sv-pipeline" "sv-pipeline-base" \ - "sv-pipeline-qc" "sv-pipeline-rdtest" "wham") - echo "Warning: MELT dockers will not be updated." 
-else - DOCKER_NAME_ARR=("$DOCKER_NAME") -fi - -################################################# -# Update jsons -################################################# - -shopt -s nullglob -JSON_ARR=(${BASE_DIR}/*.json ${BASE_DIR}/test/*/*.json ${BASE_DIR}/inputs/*.json) -JSONS=$(printf "%s " "${JSON_ARR[@]}") - -for name in "${DOCKER_NAME_ARR[@]}"; do - docker_var=$(tr '-' '_' <<< "${name}")"_docker" - docker_regex='\.'"${docker_var}"'"\s*:\s*".+\/.+:.+"' - cmd="perl -pi -e 's|${docker_regex}|.${docker_var}\": \"${DOCKER_ROOT}\/${name}:${DOCKER_TAG}\"|g' ${JSONS}" - echo "${cmd}" - eval "${cmd}" -done From 16c8991e8074d04927b53ec9d964fbe79cf59fae Mon Sep 17 00:00:00 2001 From: cwhelan Date: Wed, 1 Feb 2023 11:58:58 -0500 Subject: [PATCH 08/19] Fix single sample pipeline REF_PANEL_GENOTYPES filter bug (#490) * work around bedtools truncation issue in ref panel genotypes filter; improve file handling --- inputs/values/dockers.json | 12 +- .../apply_ref_panel_genotypes_filter.py | 118 ++++++++++++------ 2 files changed, 83 insertions(+), 47 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 3cb491d24..040ca1fa3 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,12 +13,12 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2022-06-10-v0.23-beta-9c6fbf56", - "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", - "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", - "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", - "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-01-19-v0.26.8-beta-5d1d5e0c", + "sv_pipeline_base_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_hail_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_updates_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_qc_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_rdtest_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", "wham_docker": "us.gcr.io/broad-dsde-methods/wham:8645aa", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", diff --git a/src/sv-pipeline/scripts/single_sample/apply_ref_panel_genotypes_filter.py b/src/sv-pipeline/scripts/single_sample/apply_ref_panel_genotypes_filter.py index 03e42a65a..a94295822 100755 --- a/src/sv-pipeline/scripts/single_sample/apply_ref_panel_genotypes_filter.py +++ b/src/sv-pipeline/scripts/single_sample/apply_ref_panel_genotypes_filter.py @@ -1,7 +1,8 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -# Applies REF_PANEL_GENOTYPES filter to a single sample VCF file given the cleaned final VCF for the reference panel cohort +# Applies REF_PANEL_GENOTYPES filter to a single sample VCF file given the 
cleaned final VCF for the reference panel +# cohort # # The general objective is to remove calls in the single sample VCF which do not match a call or calls in the reference # panel VCF AND have variant genotypes for samples from the reference panel. @@ -21,6 +22,12 @@ import svtk.utils as svu +VCF_FILTERS = [ + '##FILTER=' +] + + def ac_filter(feature, variant_gts_allowed, sample_to_exclude): variant_samples = set(feature.fields[6].split(',')) if sample_to_exclude in variant_samples: @@ -54,7 +61,12 @@ def filter_on_reciprocal_overlap(single_sample_vcf_file, return filtered_variant_ids -def filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype, case_sample, overlap_frac, variant_gts_allowed): +def filter_cnv_on_coverage(single_sample_vcf_file, + ref_vcf_file, + svtype, + case_sample, + overlap_frac, + variant_gts_allowed): single_sample_vcf = single_sample_vcf_file ref_vcf = ref_vcf_file @@ -64,48 +76,63 @@ def filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype, case_sa ref_vcf, annotate_ins=False, include_samples=True, svtypes=[svtype]) # in bash bedtools this gets the results we want: - # bedtools coverage -a single_sample_calls.bed -b ref_panel_calls.bed -d \ # compute per-base coverage of query by intervals in ref + # bedtools coverage -a single_sample_calls.bed -b ref_panel_calls.bed -d \ # compute per-base coverage + # # of query by intervals in ref # | awk '{OFS="\t"; print $1,$2,$3,$8,$9}' \ # slim down the of data by removing sample list, extra fields # | bedtools groupby -g 1,2,3,5 -c 4 -o min,max \ # group together regions with the same coverage value # | awk '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}' \ # make these regions into new bed intervals - # | bedtools intersect -a stdin -b ref_panel_calls.bed -wb \ # print out the ref intervals that overlapped these regions + # | bedtools intersect -a stdin -b ref_panel_calls.bed -wb \ # print out the ref intervals + # # that overlapped these regions # | bedtools groupby -g 1,2,3 -c 10 -o distinct\ # condense the sample lists - # | bedtools intersect -a single_sample_calls.bed -b stdin -wao # intersect with the query, printing the amt of overlap + # | bedtools intersect -a single_sample_calls.bed -b stdin -wao # intersect with the query, + # # printing the amount of overlap # # pybedtools unable to handle this pipeline without blowing up disk space due to lack of working streaming support # # subprocess streaming equivalent - single_sample_bed.saveas('single_sample_calls.bed') - ref_bed.saveas('ref_panel_calls.bed') - - cov_hist = subprocess.Popen(['bedtools', 'coverage', '-a', 'single_sample_calls.bed', '-b', 'ref_panel_calls.bed', '-d'], - stdout=subprocess.PIPE) - cov_hist_slim = subprocess.Popen(['awk', '{OFS="\t"; print $1,$2,$3,$8,$9}'], - stdin=cov_hist.stdout, - stdout=subprocess.PIPE) - cov_reg_grouped = subprocess.Popen(['bedtools', 'groupby', '-g', '1,2,3,5', '-c', '4', '-o', 'min,max'], - stdin=cov_hist_slim.stdout, - stdout=subprocess.PIPE) - cov_reg_grp_fix = subprocess.Popen(['awk', '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}'], - stdin=cov_reg_grouped.stdout, - stdout=subprocess.PIPE) - cov_reg_ref_ovl = subprocess.Popen(['bedtools', 'intersect', '-a', 'stdin', '-b', 'ref_panel_calls.bed', '-wb'], - stdin=cov_reg_grp_fix.stdout, - stdout=subprocess.PIPE) - cov_reg_ref_cds = subprocess.Popen(['bedtools', 'groupby', '-g', '1,2,3', '-c', '10', '-o', 'distinct'], - stdin=cov_reg_ref_ovl.stdout, - stdout=subprocess.PIPE) - final_intersect_process = subprocess.Popen(['bedtools', 'intersect', 
'-a', 'single_sample_calls.bed', '-b', 'stdin', '-wao'], - stdin=cov_reg_ref_cds.stdout, - stdout=open('final_merged_intersection.bed', 'w')) - - final_intersect_process.communicate()[0] # expect this to be empty - return_code = final_intersect_process.returncode - if return_code != 0: - raise Exception( - 'intersection pipeline process exited with return code ' + return_code) - - intersection = pybedtools.BedTool('final_merged_intersection.bed') + single_sample_bed.saveas('single_sample_calls_{}.bed'.format(svtype)) + ref_bed.saveas('ref_panel_calls_{}.bed'.format(svtype)) + + with open("ref_panel_variant_samples_by_itx_region_{}.bed".format(svtype), 'w') as \ + ref_panel_variant_samples_by_itx_region, \ + open("final_merged_intersection_{}.bed".format(svtype), 'w') as final_merged_intersection: + cov_hist = subprocess.Popen(['bedtools', 'coverage', '-a', 'single_sample_calls_{}.bed'.format(svtype), '-b', + 'ref_panel_calls_{}.bed'.format(svtype), '-d'], + stdout=subprocess.PIPE) + cov_hist_slim = subprocess.Popen(['awk', '{OFS="\t"; print $1,$2,$3,$8,$9}'], + stdin=cov_hist.stdout, + stdout=subprocess.PIPE) + cov_reg_grouped = subprocess.Popen(['bedtools', 'groupby', '-g', '1,2,3,5', '-c', '4', '-o', 'min,max'], + stdin=cov_hist_slim.stdout, + stdout=subprocess.PIPE) + cov_reg_grp_fix = subprocess.Popen(['awk', '$5 > 0 {OFS="\t"; print $1,$2+$5-1,$2+$6}'], + stdin=cov_reg_grouped.stdout, + stdout=subprocess.PIPE) + cov_reg_ref_ovl = subprocess.Popen(['bedtools', 'intersect', '-a', 'stdin', + '-b', 'ref_panel_calls_{}.bed'.format(svtype), '-wb'], + stdin=cov_reg_grp_fix.stdout, + stdout=subprocess.PIPE) + cov_reg_ref_cds = subprocess.Popen(['bedtools', 'groupby', '-g', '1,2,3', '-c', '10', '-o', 'distinct'], + stdin=cov_reg_ref_ovl.stdout, + stdout=ref_panel_variant_samples_by_itx_region) + proc_out, proc_err = cov_reg_ref_cds.communicate() + return_code = cov_reg_ref_cds.returncode + if return_code != 0: + sys.stderr.write(proc_err) + raise Exception( + 'intersection pipeline process exited with return code ' + return_code) + final_intersect_process = subprocess.Popen( + ['bedtools', 'intersect', '-a', 'single_sample_calls_{}.bed'.format(svtype), + '-b', "ref_panel_variant_samples_by_itx_region_{}.bed".format(svtype), '-wao'], + stdout=final_merged_intersection) + proc_out, proc_err = final_intersect_process.communicate() + return_code = final_intersect_process.returncode + if return_code != 0: + sys.stderr.write(proc_err) + raise Exception( + 'intersection pipeline process exited with return code {}'.format(return_code)) + + intersection = pybedtools.BedTool("final_merged_intersection_{}.bed".format(svtype)) filtered_variant_ids = [] current_case_id = '' @@ -113,6 +140,17 @@ def filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype, case_sa bases_covered_by_matching_calls = 0 current_case_length = -1 + # a bug in bedtools seems to be truncating the ref panel variant sample list in the final merged intersection + # Could be from https://github.com/arq5x/bedtools2/issues/919 + # so we read in the ref_panel_variant_samples_by_itx_region file to grab the sample lists pre-truncation + sample_lists_by_ref_region = {} + with open("ref_panel_variant_samples_by_itx_region_{}.bed".format(svtype), 'r') as ref_sample_file: + for line in ref_sample_file: + fields = line.rstrip().split("\t") + key = "_".join(fields[0:3]) + sample_set = set(fields[3].split(",")) + sample_lists_by_ref_region[key] = sample_set + for intx in intersection: new_case_id = intx.name @@ -134,7 +172,7 @@ def 
filter_cnv_on_coverage(single_sample_vcf_file, ref_vcf_file, svtype, case_sa has_ref_panel_gts = True if intx.fields[7] != ".": - ref_panel_gts = set(intx.fields[10].split(',')) + ref_panel_gts = sample_lists_by_ref_region["_".join(intx.fields[7:10])] if len(variant_samples - ref_panel_gts) <= variant_gts_allowed: bases_covered_by_matching_calls += int(intx.fields[11]) return filtered_variant_ids @@ -156,10 +194,6 @@ def main(): variant_gts_allowed = args.variant_gts_allowed overlap_frac = args.overlap_frac - VCF_FILTERS = [ - '##FILTER=' - ] - filtered_variant_ids = [] filtered_variant_ids += filter_cnv_on_coverage(pysam.VariantFile(args.single_sample_vcf), @@ -214,6 +248,8 @@ def main(): record.filter.add('REF_PANEL_GENOTYPES') fout.write(record) + fout.close() + if __name__ == '__main__': main() From dd9a42d078f718191faf7ea1d98f07991fd744cb Mon Sep 17 00:00:00 2001 From: gatk-sv-bot <101641599+gatk-sv-bot@users.noreply.github.com> Date: Wed, 1 Feb 2023 18:43:10 +0000 Subject: [PATCH 09/19] Update docker images list, triggered by 16c8991e --- inputs/values/dockers.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 040ca1fa3..6d1b0e50d 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -13,12 +13,12 @@ "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2022-06-10-v0.23-beta-9c6fbf56", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2022-06-10-v0.23-beta-9c6fbf56", - "sv_pipeline_base_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", - "sv_pipeline_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", - "sv_pipeline_hail_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", - "sv_pipeline_updates_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", - "sv_pipeline_qc_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", - "sv_pipeline_rdtest_docker": "sv-pipeline:cw-fix-refpanelgt-filter-c2bca8", + "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", + "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", + "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", + "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", "wham_docker": "us.gcr.io/broad-dsde-methods/wham:8645aa", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", From 9b25c72d067117543dbbcea3ca01f61f7933ca31 Mon Sep 17 00:00:00 2001 From: Vahid Date: Wed, 1 Feb 2023 18:17:48 -0500 Subject: [PATCH 10/19] Fix a few build errors of docker images (#489) * Fix build errors of docker images. * Remove a testing left-over. * Rebuilt all the images supported by build_docker.py. * Merge upstream. * Update the list of docker images. * Rebuild all the images. 
--- dockerfiles/samtools-cloud/Dockerfile | 49 ++++++++++--------- dockerfiles/sv-base/Dockerfile | 3 +- .../sv-pipeline-virtual-env/Dockerfile | 5 +- dockerfiles/sv-utils-env/Dockerfile | 2 +- inputs/values/dockers.json | 38 +++++++------- 5 files changed, 51 insertions(+), 46 deletions(-) diff --git a/dockerfiles/samtools-cloud/Dockerfile b/dockerfiles/samtools-cloud/Dockerfile index 51bc344f7..5110c6ec3 100644 --- a/dockerfiles/samtools-cloud/Dockerfile +++ b/dockerfiles/samtools-cloud/Dockerfile @@ -2,47 +2,43 @@ # so we base off the sv-base-mini image, which has samtools installed, # and only install GCloud SDK here for authentication purpose -# Start with the barebones image that has samtools (a version that must support NIO) installed +# Start with the barebones image that has samtools (a version that must support NIO) installed ARG MINIBASE_IMAGE=sv-base-mini:latest ARG VIRTUAL_ENV_IMAGE=samtoolscloud-virtual-env:latest # available gsutil versions here: https://cloud.google.com/sdk/docs/release-notes -ARG CLOUD_SDK_VERSION=389.0.0 +ARG CLOUD_SDK_VERSION=414.0.0-0 ARG CONDA_ENV_NAME="gatk-sv" ARG CONDA_INSTALL_DIR=/opt/conda -ARG GOOGLE_CLOUD_INSTALL_DIR=/opt/google-cloud-sdk - ################## install google-cloud-cli into a copy of the virtual image FROM $VIRTUAL_ENV_IMAGE as prep_virtual_env - # get prerequisites for installing google-cloud-cli ARG DEBIAN_FRONTEND=noninteractive RUN apt-get -qqy update --fix-missing && \ apt-get -qqy install --no-upgrade --no-install-recommends \ - apt-transport-https ca-certificates wget libssl3 + apt-transport-https ca-certificates wget curl gnupg2 libssl3 \ + libfontconfig1-dev -# install google-cloud-cli -# NOTE --override-components to avoid installing bq (BigQuery) -# NOTE: despite this being a "versioned" package, the installer will insist on upgrading at install time, so this -# is difficult to fully pin +# Google cloud SDK is installed using the documentation in the following link. 
+# https://cloud.google.com/sdk/docs/install#deb ARG CLOUD_SDK_VERSION -ARG GOOGLE_CLOUD_INSTALL_DIR -ENV CLOUDSDK_CONFIG=$GOOGLE_CLOUD_INSTALL_DIR -ENV PATH=$GOOGLE_CLOUD_INSTALL_DIR/bin:$PATH -RUN wget -q https://dl.google.com/dl/cloudsdk/channels/rapid/downloads/google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \ - tar xzf google-cloud-sdk-${CLOUD_SDK_VERSION}-linux-x86_64.tar.gz && \ - mkdir -p $(basename $GOOGLE_CLOUD_INSTALL_DIR) && \ - mv google-cloud-sdk $GOOGLE_CLOUD_INSTALL_DIR && \ - cd $GOOGLE_CLOUD_INSTALL_DIR && \ - ./install.sh --usage-reporting false --override-components core gsutil && \ + +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" \ + | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list && \ + curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | \ + tee /usr/share/keyrings/cloud.google.gpg && \ + apt-get update -y && \ + apt-get install google-cloud-sdk=$CLOUD_SDK_VERSION -y && \ gcloud config set core/disable_usage_reporting true && \ gcloud config set component_manager/disable_update_check true && \ gcloud config set metrics/environment github_docker_image +ENV GOOGLE_CLOUD_INSTALL_DIR=/usr/lib/google-cloud-sdk +ENV CLOUDSDK_CONFIG=$GOOGLE_CLOUD_INSTALL_DIR + # clean up some stuff we don't need # -anthoscli (Anthos) RUN rm $GOOGLE_CLOUD_INSTALL_DIR/bin/anthoscli - -########## Copy results to final image +# Copy results to final image FROM $MINIBASE_IMAGE ARG CONDA_INSTALL_DIR ARG CONDA_ENV_NAME @@ -53,7 +49,16 @@ COPY --from=prep_virtual_env $CONDA_ENV_PATH $CONDA_ENV_PATH ARG GOOGLE_CLOUD_INSTALL_DIR COPY --from=prep_virtual_env $GOOGLE_CLOUD_INSTALL_DIR $GOOGLE_CLOUD_INSTALL_DIR ENV PATH=/opt/google-cloud-sdk/bin:$CONDA_ENV_PATH/bin:$PATH - +ARG DEBIAN_FRONTEND=noninteractive +RUN apt-get -qqy update --fix-missing && \ + apt-get -qqy install --no-upgrade --no-install-recommends libfontconfig1-dev && \ + apt-get -qqy clean && \ + rm -rf /tmp/* \ + /var/tmp/* \ + /var/cache/apt/* \ + /var/lib/apt/lists/* \ + /usr/share/man/?? 
\ + /usr/share/man/??_* # show the google packages run RUN gcloud --version RUN gsutil --version diff --git a/dockerfiles/sv-base/Dockerfile b/dockerfiles/sv-base/Dockerfile index 8a8a4e498..ac0608caf 100644 --- a/dockerfiles/sv-base/Dockerfile +++ b/dockerfiles/sv-base/Dockerfile @@ -24,9 +24,8 @@ RUN apt-get -qqy update --fix-missing && \ apt-get -qqy install --no-upgrade --no-install-recommends \ $GATK_BUILD_DEP $GATK_RUN_DEP # clone requested GATK_COMMIT -RUN git clone https://github.com/broadinstitute/gatk.git && \ +RUN GIT_LFS_SKIP_SMUDGE=1 git clone https://github.com/broadinstitute/gatk.git && \ cd gatk && \ - git lfs install && \ git checkout ${GATK_COMMIT} # build GATK_JAR RUN cd gatk && \ diff --git a/dockerfiles/sv-pipeline-virtual-env/Dockerfile b/dockerfiles/sv-pipeline-virtual-env/Dockerfile index a709ede13..6f92d7bf4 100644 --- a/dockerfiles/sv-pipeline-virtual-env/Dockerfile +++ b/dockerfiles/sv-pipeline-virtual-env/Dockerfile @@ -71,7 +71,8 @@ ENV CONDA_ENV_NAME=$CONDA_ENV_NAME ARG BUILD_DEPS="make cmake g++ gcc gfortran \ libbz2-dev libblas-dev libicu-dev liblapack-dev liblzma-dev libpcre2-dev zlib1g-dev \ pkg-config libcurl4-openssl-dev libssl-dev libatlas-base-dev libssh2-1-dev libssh2-1 \ - libgit2-dev libfftw3-dev libjpeg8-dev libxml2-dev" + libgit2-dev libfftw3-dev libjpeg8-dev libxml2-dev libfontconfig1-dev libharfbuzz-dev \ + libfribidi-dev libfreetype6-dev libpng-dev libtiff5-dev libjpeg-dev" ARG RUN_DEPS="file bzip2 libssl3 libatlas3-base libssh2-1 libgit2-1.1 libfftw3-3 libjpeg8 libxml2" RUN export NEW_PACKAGES=$(diff_of_lists.sh "$RUN_DEPS" $APT_REQUIRED_PACKAGES) && \ printf " $NEW_PACKAGES" | fix_spaces.sh >> $APT_REQUIRED_PACKAGES @@ -112,4 +113,4 @@ RUN cd /opt/bin && \ unzip plink2_linux_x86_64_20190107.zip && \ rm -f plink2_linux_x86_64_20190107.zip -ENV PATH=/opt/bin:$PATH \ No newline at end of file +ENV PATH=/opt/bin:$PATH diff --git a/dockerfiles/sv-utils-env/Dockerfile b/dockerfiles/sv-utils-env/Dockerfile index 2ac482bc9..ad94b53df 100644 --- a/dockerfiles/sv-utils-env/Dockerfile +++ b/dockerfiles/sv-utils-env/Dockerfile @@ -7,7 +7,7 @@ ARG CONDA_ENV_NAME="gatk-sv" FROM $PYTHON_VIRTUAL_ENV_IMAGE as python_virtual_env # install any build dependencies, plus ghostscript (needed for merging PDFs) -ARG BUILD_DEPS="g++ make apt-transport-https ca-certificates wget" +ARG BUILD_DEPS="g++ make apt-transport-https ca-certificates wget zlib1g-dev" ARG DEBIAN_FRONTEND=noninteractive RUN apt-get -qqy update --fix-missing && \ apt-get -qqy install --no-install-recommends $BUILD_DEPS diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index 6d1b0e50d..a691120f4 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -1,36 +1,36 @@ { "name": "dockers", - "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2022-06-10-v0.23-beta-9c6fbf56", + "cnmops_docker": "us.gcr.io/broad-dsde-methods/vjalili/cnmops:5994670", "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker_concordance": "us.gcr.io/broad-dsde-methods/markw/gatk:mw-sv-concordance-937c81", "genomes_in_the_cloud_docker": "us.gcr.io/broad-gotc-prod/genomes-in-the-cloud:2.3.2-1510681135", "linux_docker": "marketplace.gcr.io/google/ubuntu1804", - "manta_docker": "us.gcr.io/broad-dsde-methods/manta:8645aa", + "manta_docker": 
"us.gcr.io/broad-dsde-methods/vjalili/manta:5994670", "melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:vj-4ff9de9f", "scramble_docker": "us.gcr.io/broad-dsde-methods/tsharpe/scramble:1.0.2", - "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2022-06-10-v0.23-beta-9c6fbf56", - "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2022-06-10-v0.23-beta-9c6fbf56", - "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2022-06-10-v0.23-beta-9c6fbf56", - "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-16c8991e", - "wham_docker": "us.gcr.io/broad-dsde-methods/wham:8645aa", + "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/vjalili/samtools-cloud:5994670", + "sv_base_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base:5994670", + "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670", + "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "wham_docker": "us.gcr.io/broad-dsde-methods/vjalili/wham:5994670", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", "vapor_docker": "us.gcr.io/broad-dsde-methods/eph/vapor:header-hash-2fc8f12", "cloud_sdk_docker": "google/cloud-sdk", "pangenie_docker": "us.gcr.io/broad-dsde-methods/vjalili/pangenie:vj-127571f", - "sv-base-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-virtual-env:2022-06-10-v0.23-beta-9c6fbf56", - "cnmops-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops-virtual-env:2022-06-10-v0.23-beta-9c6fbf56", - "sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-virtual-env:2022-06-10-v0.23-beta-9c6fbf56", - "samtools-cloud-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud-virtual-env:2022-06-10-v0.23-beta-9c6fbf56", - "sv-utils-env": "us.gcr.io/broad-dsde-methods/markw/sv-utils-env:mw-concordance-7251507", - "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2022-09-27-v0.26.2-beta-1f1a3b3a", + "sv-base-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-virtual-env:5994670", + "cnmops-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/cnmops-virtual-env:5994670", + "sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-virtual-env:5994670", + "samtools-cloud-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/samtools-cloud-virtual-env:5994670", + 
"sv-utils-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-utils-env:5994670", + "sv_utils_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-utils:5994670", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f", - "str": "us.gcr.io/broad-dsde-methods/gatk-sv/str:2022-10-17-v0.26.4-beta-770425d6" + "str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670" } \ No newline at end of file From c80adcba6e9769ee8f3b2cb9d3cc612eb8b54c63 Mon Sep 17 00:00:00 2001 From: gatk-sv-bot <101641599+gatk-sv-bot@users.noreply.github.com> Date: Thu, 2 Feb 2023 00:35:43 +0000 Subject: [PATCH 11/19] Update docker images list, triggered by 9b25c72d --- inputs/values/dockers.json | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/inputs/values/dockers.json b/inputs/values/dockers.json index a691120f4..8448074b6 100644 --- a/inputs/values/dockers.json +++ b/inputs/values/dockers.json @@ -1,6 +1,6 @@ { "name": "dockers", - "cnmops_docker": "us.gcr.io/broad-dsde-methods/vjalili/cnmops:5994670", + "cnmops_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/cnmops:2023-02-01-v0.26.8-beta-9b25c72d", "condense_counts_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", "gatk_docker_pesr_override": "us.gcr.io/broad-dsde-methods/tsharpe/gatk:4.2.6.1-57-g9e03432", @@ -10,15 +10,15 @@ "manta_docker": "us.gcr.io/broad-dsde-methods/vjalili/manta:5994670", "melt_docker": "us.gcr.io/talkowski-sv-gnomad/melt:vj-4ff9de9f", "scramble_docker": "us.gcr.io/broad-dsde-methods/tsharpe/scramble:1.0.2", - "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/vjalili/samtools-cloud:5994670", - "sv_base_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base:5994670", + "samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2023-02-01-v0.26.8-beta-9b25c72d", "sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-mini:5994670", - "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", - "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", - "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", - "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", - "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", - "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:5994670", + "sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", "wham_docker": "us.gcr.io/broad-dsde-methods/vjalili/wham:5994670", "igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", "duphold_docker": 
"us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", @@ -27,10 +27,10 @@ "pangenie_docker": "us.gcr.io/broad-dsde-methods/vjalili/pangenie:vj-127571f", "sv-base-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-base-virtual-env:5994670", "cnmops-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/cnmops-virtual-env:5994670", - "sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-virtual-env:5994670", + "sv-pipeline-virtual-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-virtual-env:2023-02-01-v0.26.8-beta-9b25c72d", "samtools-cloud-virtual-env": "us.gcr.io/broad-dsde-methods/vjalili/samtools-cloud-virtual-env:5994670", - "sv-utils-env": "us.gcr.io/broad-dsde-methods/vjalili/sv-utils-env:5994670", - "sv_utils_docker": "us.gcr.io/broad-dsde-methods/vjalili/sv-utils:5994670", + "sv-utils-env": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils-env:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_utils_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d", "gq_recalibrator_docker": "us.gcr.io/broad-dsde-methods/tbrookin/gatk:0a7e1d86f", "str": "us.gcr.io/broad-dsde-methods/vjalili/str:5994670" } \ No newline at end of file From 8b58c0c2f3ee5a95de2f219deb5080ab6fb8caa3 Mon Sep 17 00:00:00 2001 From: epiercehoffman Date: Thu, 2 Feb 2023 11:36:14 -0500 Subject: [PATCH 12/19] Include mCNVs in per-sample QC plots in MainVcfQc (#484) * include mCNVs in per-sample QC & expose downsample param * incorporate update to SubsetVcfBySamplesList to keep mCNVs --- wdl/CollectQcPerSample.wdl | 6 ++++-- wdl/MainVcfQc.wdl | 6 +++++- wdl/Utils.wdl | 2 +- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/wdl/CollectQcPerSample.wdl b/wdl/CollectQcPerSample.wdl index 300986f4b..2ebd232e7 100644 --- a/wdl/CollectQcPerSample.wdl +++ b/wdl/CollectQcPerSample.wdl @@ -98,13 +98,15 @@ task CollectVidsPerSample { # sample, and write one .tsv file per sample to output directory if [ ~{vcf_format_has_cn} == "true" ]; then bcftools view -S ~{samples_list} ~{vcf} \ - | bcftools view --min-ac 1 \ + | bcftools +fill-tags -- -t AC \ + | bcftools view -i 'SVTYPE=="CNV" || AC>0' \ | bcftools query -f '[%SAMPLE\t%ID\t%ALT\t%GT\t%GQ\t%CN\t%CNQ\n]' \ | awk '{OFS="\t"; gt = $4; gq = $5; if ($3 == "") { gq = $7; if ($6 == 2) { gt = "0/0" } else if ($6 == 1 || $6 == 3) { gt = "0/1" } else { gt = "1/1"} }; print $1, $2, gt, gq}' \ | awk -v outprefix="~{outdirprefix}" '$3 != "0/0" && $3 != "./." {OFS="\t"; print $2, $3, $4 >> outprefix"/"$1".VIDs_genotypes.txt" }' else bcftools view -S ~{samples_list} ~{vcf} \ - | bcftools view --min-ac 1 \ + | bcftools +fill-tags -- -t AC \ + | bcftools view -i 'SVTYPE=="CNV" || AC>0' \ | bcftools query -f '[%SAMPLE\t%ID\t%ALT\t%GT\t%GQ\n]' \ | awk '{OFS="\t"; gt = $4; gq = $5; if ($3 ~ /CN0/) { if ($4 == "0/2") { gt = "0/0" } else if ($4 == "0/1" || $4 == "0/3") { gt = "0/1" } else { gt = "1/1"} }; print $1, $2, gt, gq}' \ | awk -v outprefix="~{outdirprefix}" '$3 != "0/0" && $3 != "./." {OFS="\t"; print $2, $3, $4 >> outprefix"/"$1".VIDs_genotypes.txt" }' diff --git a/wdl/MainVcfQc.wdl b/wdl/MainVcfQc.wdl index ef737e0f8..2dc62a79d 100644 --- a/wdl/MainVcfQc.wdl +++ b/wdl/MainVcfQc.wdl @@ -26,6 +26,7 @@ workflow MainVcfQc { File primary_contigs_fai Int? random_seed Int? max_gq # Max GQ for plotting. Default = 99, ie. GQ is on a scale of [0,99]. Prior to CleanVcf, use 999 + Int? downsample_qc_per_sample # Number of samples to use for per-sample QC. 
Default: 1000 String sv_base_mini_docker String sv_pipeline_docker @@ -212,6 +213,7 @@ workflow MainVcfQc { per_sample_tarball=TarShardVidLists.vid_lists, prefix=prefix, max_gq=max_gq_, + downsample_qc_per_sample=downsample_qc_per_sample, sv_pipeline_qc_docker=sv_pipeline_qc_docker, runtime_attr_override=runtime_override_plot_qc_per_sample } @@ -452,6 +454,7 @@ task PlotQcPerSample { File per_sample_tarball String prefix Int max_gq + Int? downsample_qc_per_sample String sv_pipeline_qc_docker RuntimeAttr? runtime_attr_override } @@ -496,7 +499,8 @@ task PlotQcPerSample { ~{samples_list} \ ~{prefix}_perSample/ \ ~{prefix}_perSample_plots/ \ - --maxgq ~{max_gq} + --maxgq ~{max_gq} \ + ~{"--downsample " + downsample_qc_per_sample} # Prepare output tar -czvf ~{prefix}.plotQC_perSample.tar.gz \ diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index 3105b68ea..72e890b2c 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -661,7 +661,7 @@ task SubsetVcfBySamplesList { String vcf_subset_filename = select_first([outfile_name, basename(vcf, ".vcf.gz") + ".subset.vcf.gz"]) String vcf_subset_idx_filename = vcf_subset_filename + ".tbi" - String remove_private_sites_flag = if remove_private_sites then " | bcftools view --min-ac 1 " else "" + String remove_private_sites_flag = if remove_private_sites then " | bcftools +fill-tags -- -t AC | bcftools view -i 'SVTYPE==\"CNV\" || AC>0' " else "" String complement_flag = if remove_samples then "^" else "" # Disk must be scaled proportionally to the size of the VCF From ef097f3d128e245ddae19d4bbb4eb096100b48e6 Mon Sep 17 00:00:00 2001 From: Vahid Date: Thu, 16 Feb 2023 14:16:24 -0500 Subject: [PATCH 13/19] Update WDLs to adhere with req. opt. add/concatenation requirement. (#495) # Resolve miniwdl validation errors such as the following. # Validation Error: 'Cannot add/concatenate String and Int?' --- wdl/ExpansionHunterDenovo.wdl | 27 +++++++++------------------ wdl/GangSTR.wdl | 8 ++++---- wdl/MakeBincovMatrix.wdl | 6 +++--- wdl/ShardedCluster.wdl | 2 +- 4 files changed, 17 insertions(+), 26 deletions(-) diff --git a/wdl/ExpansionHunterDenovo.wdl b/wdl/ExpansionHunterDenovo.wdl index ae199df93..ab3361a66 100644 --- a/wdl/ExpansionHunterDenovo.wdl +++ b/wdl/ExpansionHunterDenovo.wdl @@ -8,16 +8,7 @@ version 1.0 -#import "Structs.wdl" -# Carrot currently does not support imports. -struct RuntimeAttr { - Float? mem_gb - Int? cpu_cores - Int? disk_gb - Int? boot_disk_gb - Int? preemptible_tries - Int? 
max_retries -} +import "Structs.wdl" struct FilenamePostfixes { String locus @@ -184,7 +175,7 @@ task ComputeSTRProfile { >>> - RuntimeAttr runtime_attr_str_profile_default = object { + RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 4, boot_disk_gb: 10, @@ -198,13 +189,13 @@ task ComputeSTRProfile { } RuntimeAttr runtime_attr = select_first([ runtime_attr_override, - runtime_attr_str_profile_default]) + default_attr]) runtime { docker: ehdn_docker cpu: runtime_attr.cpu_cores - memory: runtime_attr.mem_gb + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: runtime_attr.boot_disk_gb preemptible: runtime_attr.preemptible_tries maxRetries: runtime_attr.max_retries @@ -324,7 +315,7 @@ task STRAnalyze { done >>> - RuntimeAttr runtime_attr_analysis_default = object { + RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 4, boot_disk_gb: 10, @@ -336,13 +327,13 @@ task STRAnalyze { } RuntimeAttr runtime_attr = select_first([ runtime_attr_override, - runtime_attr_analysis_default]) + default_attr]) runtime { docker: ehdn_docker cpu: runtime_attr.cpu_cores - memory: runtime_attr.mem_gb + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: runtime_attr.boot_disk_gb preemptible: runtime_attr.preemptible_tries maxRetries: runtime_attr.max_retries diff --git a/wdl/GangSTR.wdl b/wdl/GangSTR.wdl index 97d22973f..2f75d6020 100644 --- a/wdl/GangSTR.wdl +++ b/wdl/GangSTR.wdl @@ -98,7 +98,7 @@ task CallGangSTR { --out ~{output_prefix} >>> - RuntimeAttr runtime_attr_str_profile_default = object { + RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 4, boot_disk_gb: 10, @@ -111,13 +111,13 @@ task CallGangSTR { } RuntimeAttr runtime_attr = select_first([ runtime_attr_override, - runtime_attr_str_profile_default]) + default_attr]) runtime { docker: str_docker cpu: runtime_attr.cpu_cores - memory: runtime_attr.mem_gb + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: runtime_attr.boot_disk_gb preemptible: runtime_attr.preemptible_tries maxRetries: runtime_attr.max_retries diff --git a/wdl/MakeBincovMatrix.wdl b/wdl/MakeBincovMatrix.wdl index 4d8af9a6d..d17aa0427 100644 --- a/wdl/MakeBincovMatrix.wdl +++ b/wdl/MakeBincovMatrix.wdl @@ -144,7 +144,7 @@ task SetBins { runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) docker: sv_base_mini_docker preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) @@ -210,7 +210,7 @@ task MakeBincovMatrixColumns { runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" 
+ disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) docker: sv_base_mini_docker preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) @@ -271,7 +271,7 @@ task ZPaste { runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + runtime_attr.disk_gb + " HDD" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) docker: sv_base_docker preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) diff --git a/wdl/ShardedCluster.wdl b/wdl/ShardedCluster.wdl index 8c3990f43..8589cdaae 100644 --- a/wdl/ShardedCluster.wdl +++ b/wdl/ShardedCluster.wdl @@ -59,7 +59,7 @@ workflow ShardedCluster { File vcf_idx = vcf + ".tbi" if (defined(exclude_list)) { - File exclude_list_idx = exclude_list + ".tbi" + File exclude_list_idx = select_first([exclude_list]) + ".tbi" } call MiniTasks.MakeSitesOnlyVcf { From 480173b27d19072a7b055a0d5626aec130177cd2 Mon Sep 17 00:00:00 2001 From: Vahid Date: Thu, 16 Feb 2023 14:25:26 -0500 Subject: [PATCH 14/19] Update the style of `build_docker.py` (#492) * Update build_docker for consistent style and formatting. # - Consistent use of line breaks for each argument; # - Consistent use of indentation before arguments; # - Consistent use of double and single quotation for strings; # - Consistent use of line break before closing parenthesis; # - Use back tick for quoting a substring. * Fix few remaining formatting inconsistencies. * A correct value for indent is the number of spaces. * Replace termcolor with built-in features. * Fix a style inconsistency. * Fix linting issue. 
--- scripts/docker/build_docker.py | 357 +++++++++++++++++++++------------ 1 file changed, 227 insertions(+), 130 deletions(-) diff --git a/scripts/docker/build_docker.py b/scripts/docker/build_docker.py index fb09aa2a5..12f4bee04 100755 --- a/scripts/docker/build_docker.py +++ b/scripts/docker/build_docker.py @@ -9,8 +9,6 @@ import tempfile import shutil import pprint -# noinspection PyPackageRequirements -from termcolor import colored import subprocess from typing import Union, Iterable, Mapping, Optional, List, Tuple, Set, Dict from types import MappingProxyType @@ -98,48 +96,84 @@ class ProjectBuilder: """ class to track dependencies, control build and push of entire job """ - github_org = 'broadinstitute' - github_repo = 'gatk-sv' + github_org = "broadinstitute" + github_repo = "gatk-sv" # mapping from target to its dependencies # each dependency is either None, or a mapping from each dependency name to the docker ARG it is passed via - # currently each image has zero or one dependencies, but multiple dependencies are allowed + # currently each image has zero or one dependency, but multiple dependencies are allowed dependencies = { - "manta": ImageDependencies("dockerfiles/manta/*"), - "melt": ImageDependencies("dockerfiles/melt/*"), - "wham": ImageDependencies("dockerfiles/wham/*"), - "str": ImageDependencies(("dockerfiles/str/*", "src/str/*")), - "sv-base-mini": ImageDependencies("dockerfiles/sv-base-mini/*"), - "samtools-cloud-virtual-env": ImageDependencies("dockerfiles/samtools-cloud-virtual-env/*"), + "manta": ImageDependencies( + git_dependencies="dockerfiles/manta/*" + ), + "melt": ImageDependencies( + git_dependencies="dockerfiles/melt/*" + ), + "wham": ImageDependencies( + git_dependencies="dockerfiles/wham/*" + ), + "str": ImageDependencies( + git_dependencies=("dockerfiles/str/*", "src/str/*") + ), + "sv-base-mini": ImageDependencies( + git_dependencies="dockerfiles/sv-base-mini/*" + ), + "samtools-cloud-virtual-env": ImageDependencies( + git_dependencies="dockerfiles/samtools-cloud-virtual-env/*" + ), "samtools-cloud": ImageDependencies( - "dockerfiles/samtools-cloud/*", - {"sv-base-mini": "MINIBASE_IMAGE", "samtools-cloud-virtual-env": "VIRTUAL_ENV_IMAGE"}), - "sv-base-virtual-env": ImageDependencies("dockerfiles/sv-base-virtual-env/*"), + git_dependencies="dockerfiles/samtools-cloud/*", + docker_dependencies={ + "sv-base-mini": "MINIBASE_IMAGE", + "samtools-cloud-virtual-env": "VIRTUAL_ENV_IMAGE"} + ), + "sv-base-virtual-env": ImageDependencies( + git_dependencies="dockerfiles/sv-base-virtual-env/*" + ), "sv-base": ImageDependencies( - "dockerfiles/sv-base/*", - {"samtools-cloud": "SAMTOOLS_CLOUD_IMAGE", "sv-base-virtual-env": "VIRTUAL_ENV_IMAGE"}), + git_dependencies="dockerfiles/sv-base/*", + docker_dependencies={ + "samtools-cloud": "SAMTOOLS_CLOUD_IMAGE", + "sv-base-virtual-env": "VIRTUAL_ENV_IMAGE"} + ), "cnmops-virtual-env": ImageDependencies( - "dockerfiles/cnmops-virtual-env/*", - {"sv-base-virtual-env": "VIRTUAL_ENV_IMAGE"}), + git_dependencies="dockerfiles/cnmops-virtual-env/*", + docker_dependencies={ + "sv-base-virtual-env": "VIRTUAL_ENV_IMAGE"} + ), "cnmops": ImageDependencies( - ("dockerfiles/cnmops/*", "src/WGD/*"), - {"sv-base": "SVBASE_IMAGE", "cnmops-virtual-env": "VIRTUAL_ENV_IMAGE"}), + git_dependencies=("dockerfiles/cnmops/*", "src/WGD/*"), + docker_dependencies={ + "sv-base": "SVBASE_IMAGE", + "cnmops-virtual-env": "VIRTUAL_ENV_IMAGE"} + ), "sv-pipeline-virtual-env": ImageDependencies( - "dockerfiles/sv-pipeline-virtual-env/*", - {"sv-base-mini": 
"SV_BASE_MINI_IMAGE", - "sv-base-virtual-env": "R_VIRTUAL_ENV_IMAGE", - "samtools-cloud-virtual-env": "PYTHON_VIRTUAL_ENV_IMAGE"}), + git_dependencies="dockerfiles/sv-pipeline-virtual-env/*", + docker_dependencies={ + "sv-base-mini": "SV_BASE_MINI_IMAGE", + "sv-base-virtual-env": "R_VIRTUAL_ENV_IMAGE", + "samtools-cloud-virtual-env": "PYTHON_VIRTUAL_ENV_IMAGE"} + ), "sv-pipeline": ImageDependencies( - ("dockerfiles/sv-pipeline/*", "src/RdTest/*", "src/sv-pipeline/*", "src/svqc/*", "src/svtest/*", - "src/svtk/*", "src/WGD/*"), - {"sv-base": "SVBASE_IMAGE", "sv-pipeline-virtual-env": "VIRTUAL_ENV_IMAGE"}), + git_dependencies=( + "dockerfiles/sv-pipeline/*", "src/RdTest/*", "src/sv-pipeline/*", + "src/svqc/*", "src/svtest/*", "src/svtk/*", "src/WGD/*"), + docker_dependencies={ + "sv-base": "SVBASE_IMAGE", + "sv-pipeline-virtual-env": "VIRTUAL_ENV_IMAGE"} + ), "sv-utils-env": ImageDependencies( - "dockerfiles/sv-utils-env/*", - {"samtools-cloud-virtual-env": "PYTHON_VIRTUAL_ENV_IMAGE"} + git_dependencies="dockerfiles/sv-utils-env/*", + docker_dependencies={ + "samtools-cloud-virtual-env": "PYTHON_VIRTUAL_ENV_IMAGE"} ), - "sv-utils": ImageDependencies(("dockerfiles/sv-utils/*", "src/sv_utils/src/*", "src/sv_utils/setup.py"), - {"samtools-cloud": "SAMTOOLS_CLOUD_IMAGE", "sv-utils-env": "VIRTUAL_ENV_IMAGE"}) + "sv-utils": ImageDependencies( + git_dependencies=("dockerfiles/sv-utils/*", "src/sv_utils/src/*", "src/sv_utils/setup.py"), + docker_dependencies={ + "samtools-cloud": "SAMTOOLS_CLOUD_IMAGE", + "sv-utils-env": "VIRTUAL_ENV_IMAGE"} + ) } - non_public_images = frozenset({'melt'}) + non_public_images = frozenset({"melt"}) images_built_by_all = frozenset(dependencies.keys()).difference({"melt"}) accepted_target_values = frozenset(dependencies.keys()).union({"all"}) latest_tag = "latest" @@ -163,18 +197,18 @@ def __init__( def load_json(json_file: str) -> Dict[str, str]: if not os.path.isfile(json_file): return {} - with open(json_file, 'r') as f_in: + with open(json_file, "r") as f_in: return json.load(f_in) @staticmethod def get_target_from_image(docker_image: str) -> str: # note: will process any string, even if it does not have '/' or ':' in it, even if it is not a docker image # (in which case it will return the whole docker_image string) - return docker_image.rsplit('/', 1)[-1].split(':', 1)[0] + return docker_image.rsplit("/", 1)[-1].split(":", 1)[0] @staticmethod def get_image_repo(docker_image: str) -> Optional[str]: - repo_ind = docker_image.rfind('/') + repo_ind = docker_image.rfind("/") return None if repo_ind < 0 else docker_image[:repo_ind] @staticmethod @@ -225,10 +259,10 @@ def update_dockers_json(self): # update dockers.json with the new data if self.project_arguments.dry_run: print(f"Write output dockers json at {output_json}") - print(json.dumps(new_dockers_json, indent=" ")) + print(json.dumps(new_dockers_json, indent=2)) else: - with open(output_json, 'w') as f_out: - json.dump(new_dockers_json, f_out, indent=" ") + with open(output_json, "w") as f_out: + json.dump(new_dockers_json, f_out, indent=2) def get_build_priority(self, target_name: str) -> (int, str): """ @@ -299,7 +333,7 @@ def go_to_workdir(self) -> str: self.working_dir = os.path.dirname(os.path.dirname(self.launch_script_path)) else: # if staging is required, mkdir, cd, and pull - tmp_dir_path = tempfile.mkdtemp(prefix=self.project_arguments.staging_dir).rstrip('/') + '/' + tmp_dir_path = tempfile.mkdtemp(prefix=self.project_arguments.staging_dir).rstrip("/") + "/" connect_mode = "git@github.com:" if 
self.project_arguments.use_ssh else "https://github.com" clone_target = connect_mode + "/" + ProjectBuilder.github_org + "/" + ProjectBuilder.github_repo + ".git" if os.system(f"git clone {clone_target} {tmp_dir_path}") != 0: @@ -316,7 +350,7 @@ def go_to_workdir(self) -> str: git_checkout_cmd = "git checkout " + \ self.project_arguments.remote_git_hash if os.system(git_checkout_cmd) != 0: - raise ValueError(f"The provided git hash [{self.project_arguments.remote_git_hash}] does not exist") + raise ValueError(f"The provided git hash `{self.project_arguments.remote_git_hash}` does not exist") print("Working directory: " + os.getcwd()) return tmp_dir_path @@ -342,7 +376,7 @@ def build_and_push(self): build_completed = False if os.system("docker system info > /dev/null 2>&1") != 0: - raise RuntimeError("docker daemon is not running or returned error") + raise RuntimeError("Docker daemon is not running or returned error.") try: # set the appropriate targets @@ -367,12 +401,12 @@ def build_and_push(self): for target_name in expanded_build_targets: ImageBuilder(target_name, self).update_current_image() else: - print(colored('#' * 50, 'magenta')) + print_colored("#" * 50, Colors.MAGENTA) for target_name in expanded_build_targets: - a = colored("Building image ", "grey") - b = colored(target_name + ":" + self.project_arguments.image_tag, "yellow", attrs=['bold']) - c = colored(" ...", "grey") - print(a, b, c) + print_colored( + f"Building image: {target_name}:{self.project_arguments.image_tag} ...", + Colors.YELLOW + ) build_time_args = { arg: self.get_current_image(image_name) @@ -385,9 +419,9 @@ def build_and_push(self): if self.project_arguments.prune_after_each_image and not self.project_arguments.dry_run: # clean dangling images (i.e. those "" images), stopped containers, etc os.system("docker system prune -f") - print(colored('#' * 50, 'magenta')) + print_colored("#" * 50, Colors.MAGENTA) - print(colored('BUILD PROCESS SUCCESS!', 'green')) + print_colored("BUILD PROCESS SUCCESS!", Colors.GREEN) if self.remote_docker_repos: # push any images that are purely local (this can happen if e.g. 
images are built without specifying @@ -433,13 +467,13 @@ def changed_project_files(self) -> List[str]: return get_command_output( f"git diff --name-only {base_git_commit}" if current_git_commit is None else f"git diff --name-only {base_git_commit} {current_git_commit}" - ).strip().split('\n') + ).strip().split("\n") class ImageBuilder: # class for building and pushing a single image def __init__(self, name: str, project_builder: ProjectBuilder): - if ':' in name: - self.name, self.tag = name.split(':', 1) + if ":" in name: + self.name, self.tag = name.split(":", 1) else: self.name = name self.tag = project_builder.project_arguments.image_tag @@ -489,8 +523,8 @@ def remote_image_exists(remote_image: str) -> bool: def image_is_built(local_image: str) -> bool: images = { image for image in get_command_output( - 'docker images --format "{{.Repository}}:{{.Tag}}"' - ).split('\n') + "docker images --format '{{.Repository}}:{{.Tag}}'" + ).split("\n") } return local_image in images @@ -500,7 +534,7 @@ def do_not_rebuild(self) -> bool: def build(self, build_time_args: Mapping[str, str]): if self.do_not_rebuild: - print(f"skipping build of {self.local_image} because it is already built and --no-force-rebuild is set") + print(f"Skipping build of {self.local_image} because it is already built and --no-force-rebuild is set.") return # standard build command docker_build_command = "docker build --platform linux/amd64 --progress plain --network=host \\\n " @@ -529,9 +563,11 @@ def push(self): """ for remote_image in self.remote_images: # do not push images with very restrictive licenses - if self.name in ProjectBuilder.non_public_images and not remote_image.startswith('us.gcr.io'): - print(colored(f"Refusing to push non-public image {self.name} to non us.grc.io repo at {remote_image}", - "red")) + if self.name in ProjectBuilder.non_public_images and not remote_image.startswith("us.gcr.io"): + print_colored( + f"Refusing to push non-public image {self.name} to non us.grc.io repo at {remote_image}", + Colors.RED + ) continue if ImageBuilder.docker_tag(self.local_image, remote_image) != 0: raise RuntimeError(f"Failed to tag image ({remote_image}) for pushing to remote") @@ -572,7 +608,7 @@ def get_command_output( stderr = pipe_err.read().decode(encoding) return_code = sub_p.poll() if raise_on_error and return_code != 0: - raise RuntimeError('Error executing %s:\n%s' % (command, stderr[:-1])) + raise RuntimeError("Error executing %s:\n%s" % (command, stderr[:-1])) return (output, stderr, return_code) if return_error_info else output @@ -580,6 +616,21 @@ class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescri pass +class Colors: + GRAY = "\033[90m" + RED = "\033[91m" + GREEN = "\033[92m" + YELLOW = "\033[93m" + BLUE = "\033[94m" + MAGENTA = "\033[95m" + Cyan = "\033[96m" + ENDC = "\033[0m" + + +def print_colored(text: str, color: str): + print(f"{color}{text}{Colors.ENDC}") + + def __parse_arguments(args_list: List[str]) -> argparse.Namespace: parser = argparse.ArgumentParser( description="Tool for building docker images for GATK-SV pipeline" + "\n" + program_operation_description, @@ -587,91 +638,138 @@ def __parse_arguments(args_list: List[str]) -> argparse.Namespace: ) # required arguments # to build from local or remote git tag/hash values - git_args_group = parser.add_argument_group('Mutex args', 'remote git tag/hash values (mutually exclusive)') + git_args_group = parser.add_argument_group("Mutex args", "remote git tag/hash values (mutually exclusive)") git_mutex_args_group = 
git_args_group.add_mutually_exclusive_group() git_mutex_args_group.add_argument( - '--remote-git-tag', type=str, - help='release tag on Github; this indicates pulling from Github to a staging dir' + "--remote-git-tag", type=str, + help="Release tag on Github; this indicates pulling from Github to a staging dir." ) git_mutex_args_group.add_argument( - '--remote-git-hash', type=str, - help='a hash value on Github; this indicates pulling from Github to a staging dir' + "--remote-git-hash", type=str, + help="A hash value on Github; this indicates pulling from Github to a staging dir." ) + # to build from remote/Github (staging required) - remote_git_args_group = parser.add_argument_group('Remote git', - 'args involved when building from remote git tags/hashes') + remote_git_args_group = parser.add_argument_group( + "Remote git", + "Args involved when building from remote git tags/hashes." + ) + + remote_git_args_group.add_argument( + "--staging-dir", type=str, + help="A temporary staging directory to store builds; required only when pulling " + "from Github, ignored otherwise." + ) remote_git_args_group.add_argument( - '--staging-dir', type=str, help='a temporary staging directory to store builds; required only when pulling ' - 'from Github, ignored otherwise' + "--use-ssh", action="store_true", + help="Use SSH to pull from GitHub." ) - remote_git_args_group.add_argument('--use-ssh', action='store_true', help='use SSH to pull from github') + # flag to turn on push to Dockerhub and/or GCR docker_remote_args_group = parser.add_argument_group( - 'Docker push', 'controlling behavior related pushing dockers to remote repos' + "Docker push", + "Controlling behavior related pushing dockers to remote repos." + ) + + docker_remote_args_group.add_argument( + "--docker-repo", type=str, + help="Docker repo to push images to. This will push images that are built " + "this run of build_docker.py, or that currently have only a local image " + "in --input-json." + ) + + docker_remote_args_group.add_argument( + "--gcr-project", type=str, + help="Deprecated. Used to determine which docker repo to push images to. Use " + "--docker-repo instead." + ) + + docker_remote_args_group.add_argument( + "--update-latest", action="store_true", + help=f"Also update `{ProjectBuilder.latest_tag}` tag in remote docker repo(s)." ) - docker_remote_args_group.add_argument('--docker-repo', type=str, - help='Docker repo to push images to. This will push images that are built ' - 'this run of build_docker.py, or that currently have only a local image ' - 'in --input-json') - docker_remote_args_group.add_argument('--gcr-project', type=str, - help='Deprecated. Used to determine which docker repo to push images to. Use ' - '--docker-repo instead.') - docker_remote_args_group.add_argument('--update-latest', action='store_true', - help=f'also update \"{ProjectBuilder.latest_tag}\" tag in remote docker' - f'repo(s)') - docker_remote_args_group.add_argument('--input-json', type=str, default=Paths.dockers_json_path, - help="Path to dockers.json to use as input. This file serves as a store for " - "both the default docker image to use for various gatk-sv WDLs, and for " - "the most up-to-date docker tag for each docker image.") - docker_remote_args_group.add_argument('--output-json', type=str, default=Paths.dockers_json_path, - help=f"Path to output updated dockers.json. 
Set to {Paths.dev_null} to turn " - "off updates") - parser.add_argument("--dry-run", action="store_true", - help="Compute docker images that will be build, but don't actually build or push") - parser.add_argument("--skip-dependent-images", action="store_true", - help="Don't build images that depend on targets. Can leave images in an unreproducible state: " - "only use this if you are sure you know what you are doing!") + + docker_remote_args_group.add_argument( + "--input-json", type=str, default=Paths.dockers_json_path, + help="Path to dockers.json to use as input. This file serves as a store for " + "both the default docker image to use for various gatk-sv WDLs, and for " + "the most up-to-date docker tag for each docker image." + ) + + docker_remote_args_group.add_argument( + "--output-json", type=str, default=Paths.dockers_json_path, + help=f"Path to output updated dockers.json. Set to {Paths.dev_null} to turn off updates." + ) + + parser.add_argument( + "--dry-run", action="store_true", + help="Compute docker images that will be build, but don't actually build or push." + ) + parser.add_argument( - '--targets', nargs='*', type=str, - help='Manually-specified list project docker image(s) you want to build (note "all" does not include melt). ' - 'Alternatively can specify --base-git-commit/--current-git-commit to automatically determine targets.' + "--skip-dependent-images", action="store_true", + help="Don't build images that depend on targets. Can leave images in an unreproducible state: " + "only use this if you are sure you know what you are doing!" ) + + parser.add_argument( + "--targets", nargs="*", type=str, + help="Manually-specified list project docker image(s) you want to build (note `all` does not include melt). " + "Alternatively can specify --base-git-commit/--current-git-commit to automatically determine targets." + ) + short_git_hash_head = get_command_output("git rev-parse --short HEAD").strip() - parser.add_argument('--image-tag', type=str, default=short_git_hash_head, - help='tag to be applied to all images being built') + parser.add_argument( + "--image-tag", type=str, default=short_git_hash_head, + help="Tag to be applied to all images being built." + ) + parser.add_argument( # flag to turn off git protection (default mode is refusing to build when there are untracked files and/or # uncommitted changes) - '--disable-git-protect', action='store_true', - help='disable git check/protect when building from local files (will use uncommited changes to build)' + "--disable-git-protect", action="store_true", + help="Disable git check/protect when building from local files (will use uncommitted changes to build)." + ) + + parser.add_argument( + "--no-force-rebuild", action="store_true", + help="Do not rebuild docker images if the exact image and tag already exist." + ) + + parser.add_argument( + "--skip-cleanup", action="store_true", + help="Skip cleanup after successful and unsuccessful build attempts." + ) + + parser.add_argument( + "--base-git-commit", type=str, + help="This script can have targets specified manually (via --targets) or it can automatically " + "determine which docker image `targets` to build by examining which files have changed " + "in the git repo. When auto-determining build targets, this options specifies the baseline " + "git commit to check for changes, i.e. only files altered since this commit should be " + "considered changed. Can be a SHA or other specifier (e.g. HEAD)." 
+ ) + + parser.add_argument( + "--current-git-commit", type=str, + help="This script can automatically determine which docker image `targets` to build by " + "examining which files have changed in the git repo. When auto-determining build targets, " + "this options specifies the current git commit to check for changes. If omitted, " + "use current status of git repo with uncommitted changes. Can be a SHA or other specifier (e.g. HEAD)." + ) + + parser.add_argument( + "--prune-after-each-image", action="store_true", + help="Do `docker system prune` after each image is successfully built, to save disk space. " + "Only necessary on cramped VMs." ) - parser.add_argument('--no-force-rebuild', action='store_true', - help='Do not rebuild docker images if the exact image and tag already exist.') - parser.add_argument('--skip-cleanup', action='store_true', - help='skip cleanup after successful and unsuccessful build attempts.') - parser.add_argument('--base-git-commit', type=str, - help="This script can have targets specified manually (via --targets) or it can automatically " - "determine which docker image \"targets\" to build by examining which files have changed " - "in the git repo. When auto-determining build targets, this options specifies the baseline" - " git commit to check for changes, i.e. only files altered since this commit should be " - "considered changed. Can be a SHA or other specifier " - "(e.g. HEAD^)") - parser.add_argument('--current-git-commit', type=str, - help="This script can automatically determine which docker image \"targets\" to build by " - "examining which files have changed in the git repo. When auto-determining build targets, " - "this options specifies the current git commit to check for changes. If omitted," - " use current status of git repo with uncommitted changes. Can be a SHA or other specifier" - " (e.g. HEAD)") - parser.add_argument('--prune-after-each-image', action='store_true', - help='Do "docker system prune" after each image is successfully built, to save disk space. 
Only' - ' necessary on cramped VMs') # parse and consistency check if len(args_list) <= 1: # no arguments, print help and exit with success - parser.parse_args(['-h']) + parser.parse_args(["-h"]) sys.exit(0) parsed_args = parser.parse_args(args_list[1:]) @@ -680,34 +778,34 @@ def __parse_arguments(args_list: List[str]) -> argparse.Namespace: if parsed_args.base_git_commit is None: if parsed_args.targets is None: - raise ValueError('Must specify exactly one of "--base-git-commit" or "--targets", but neither were passed') + raise ValueError("Must specify exactly one of `--base-git-commit` or `--targets`, but neither were passed.") # if passed targets are in the accepted values for tar in parsed_args.targets: if tar not in ProjectBuilder.accepted_target_values: - raise ValueError("\"" + tar + "\" not in allowed target values") + raise ValueError("\"" + tar + "\" not in allowed target values.") if "all" in parsed_args.targets: if 1 != len(parsed_args.targets): - raise ValueError("when \"all\" is provided, no other target values allowed") + raise ValueError("When `all` is provided, no other target values allowed.") else: if parsed_args.targets is not None: - raise ValueError('Must specify exactly one of "--base-git-commit" or "--targets", but both were passed') + raise ValueError("Must specify exactly one of `--base-git-commit` or `--targets`, but both were passed.") # if "use_ssh" flag is turned on, remote git tag/hash should be provided if parsed_args.use_ssh is True: if (parsed_args.remote_git_tag is None) and (parsed_args.remote_git_hash is None): - raise ValueError("\"use_ssh\" is specified but remote git tag/hash is not") + raise ValueError("`use_ssh` is specified but remote git tag/hash is not.") # if remote git tag/hash and/or is specified, staging dir should be specified if (parsed_args.remote_git_tag is not None) or (parsed_args.remote_git_hash is not None): if parsed_args.staging_dir is None: - raise ValueError("remote git tag/hash is specified but staging_dir is not") + raise ValueError("Remote git tag/hash is specified but staging_dir is not.") # if requesting to update "latest" tag in remote docker repo(s), remote git release tag must be specified if parsed_args.update_latest is True: if parsed_args.remote_git_tag is None: - raise ValueError(f"publishing \"{ProjectBuilder.latest_tag}\" docker images requires a remote Github " - "release tag") + raise ValueError(f"Publishing `{ProjectBuilder.latest_tag}` docker images requires a remote Github " + "release tag.") # if there are uncommitted changes when building from local files, raise exception if parsed_args.staging_dir is None and not parsed_args.disable_git_protect: @@ -715,13 +813,12 @@ def __parse_arguments(args_list: List[str]) -> argparse.Namespace: ret = int(s) if 0 != ret: raise ValueError( - "Current directory has uncommitted changes or untracked files. Cautiously refusing to proceed." - ) + "Current directory has uncommitted changes or untracked files. 
Cautiously refusing to proceed.") if parsed_args.gcr_project is not None: if parsed_args.docker_repo is not None: raise ValueError("Both --gcr-project and --docker-repo were specified, but only one is allowed.") - print(colored("--gcr-project is deprecated, use --docker-repo instead.", "red")) + print_colored("--gcr-project is deprecated, use --docker-repo instead.", Colors.RED) parsed_args.docker_repo = f"us.gcr.io/{parsed_args.gcr_project.strip('/')}" return parsed_args From 40303ec0a8c70afb56ae8d06715724ee0e2af9bf Mon Sep 17 00:00:00 2001 From: Mark Walker Date: Mon, 20 Feb 2023 12:12:02 -0500 Subject: [PATCH 15/19] Remove depth vcf from pesr merging in FilterBatchSamples (#493) --- wdl/FilterBatchSamples.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wdl/FilterBatchSamples.wdl b/wdl/FilterBatchSamples.wdl index 0c116a762..96c106065 100644 --- a/wdl/FilterBatchSamples.wdl +++ b/wdl/FilterBatchSamples.wdl @@ -98,8 +98,8 @@ workflow FilterBatchSamples { runtime_attr_override = runtime_attr_filter_samples } - Array[File] pesr_vcfs_no_outliers = select_all([SubsetVcfBySamplesList.vcf_subset[0], SubsetVcfBySamplesList.vcf_subset[1], SubsetVcfBySamplesList.vcf_subset[2], SubsetVcfBySamplesList.vcf_subset[3], SubsetVcfBySamplesList.vcf_subset[4]]) - Array[File] pesr_vcfs_no_outliers_index = select_all([SubsetVcfBySamplesList.vcf_subset_index[0], SubsetVcfBySamplesList.vcf_subset_index[1], SubsetVcfBySamplesList.vcf_subset_index[2], SubsetVcfBySamplesList.vcf_subset_index[3], SubsetVcfBySamplesList.vcf_subset_index[4]]) + Array[File] pesr_vcfs_no_outliers = select_all([SubsetVcfBySamplesList.vcf_subset[0], SubsetVcfBySamplesList.vcf_subset[1], SubsetVcfBySamplesList.vcf_subset[2], SubsetVcfBySamplesList.vcf_subset[3]]) + Array[File] pesr_vcfs_no_outliers_index = select_all([SubsetVcfBySamplesList.vcf_subset_index[0], SubsetVcfBySamplesList.vcf_subset_index[1], SubsetVcfBySamplesList.vcf_subset_index[2], SubsetVcfBySamplesList.vcf_subset_index[3]]) call tasks_mcv.ConcatVcfs as MergePesrVcfs { input: vcfs=pesr_vcfs_no_outliers, From f1c4a470a6ebd19d97932af505058b302a0d41b2 Mon Sep 17 00:00:00 2001 From: Vahid Date: Tue, 21 Feb 2023 10:25:31 -0500 Subject: [PATCH 16/19] Maintain a mirror of images on GCR and ACR. (#477) * Maintain a mirror of images on GCR and ACR. * Revert renaming dockers.json to dockers_gcp.json to keep this branch/PR simpler. * Update Azure images. * Push the remaining docker images to acr. 
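Because this change maintains two parallel image lists (`inputs/values/dockers.json` for GCR and `inputs/values/dockers_azure.json` for ACR), a quick consistency check between them can be useful. The sketch below is illustrative only (it is not part of the patch) and assumes it is run from the repository root with the paths used in the workflow:

```python
# Illustrative sanity check: confirm the GCP and Azure docker image lists
# track the same set of keys, so the two registry mirrors stay in sync.
import json

with open("inputs/values/dockers.json") as f:
    gcp = json.load(f)
with open("inputs/values/dockers_azure.json") as f:
    azure = json.load(f)

missing_in_azure = sorted(set(gcp) - set(azure))
missing_in_gcp = sorted(set(azure) - set(gcp))
if missing_in_azure or missing_in_gcp:
    raise SystemExit(
        f"Image lists diverge. Missing in Azure: {missing_in_azure}; "
        f"missing in GCP: {missing_in_gcp}"
    )
print(f"OK: both registries list the same {len(gcp)} keys.")
```
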
--- .github/workflows/sv_pipeline_docker.yml | 35 +++++++++++++++++++---- inputs/values/dockers_azure.json | 36 ++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 6 deletions(-) create mode 100644 inputs/values/dockers_azure.json diff --git a/.github/workflows/sv_pipeline_docker.yml b/.github/workflows/sv_pipeline_docker.yml index 6bb636dd4..40c6f42b3 100644 --- a/.github/workflows/sv_pipeline_docker.yml +++ b/.github/workflows/sv_pipeline_docker.yml @@ -155,6 +155,9 @@ jobs: strategy: matrix: python-version: ['3.8'] + env: + DOCKERS_AZURE: "./inputs/values/dockers_azure.json" + DOCKERS_GCP: "./inputs/values/dockers.json" steps: - name: Checkout code uses: actions/checkout@v2 @@ -203,24 +206,44 @@ jobs: sudo mv "$tmp" /etc/docker/daemon.json sudo systemctl restart docker.service - - name: Build and Publish Docker Images + - name: Build and Publish Docker Images to GCR id: build_and_publish run: | python ./scripts/docker/build_docker.py \ --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \ --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \ - --gcr-project ${{ secrets.GCP_PROJECT_ID }}/gatk-sv \ + --docker-repo us.gcr.io/${{ secrets.GCP_PROJECT_ID }}/gatk-sv \ --image-tag ${{ needs.build_args_job.outputs.image_tag }} \ - --prune-after-each-image - CHANGED=$(git diff --quiet ./inputs/values/dockers.json || echo True) + --input-json $DOCKERS_GCP \ + --output-json $DOCKERS_GCP + + CHANGED=$(git diff --quiet $DOCKERS_GCP || echo True) echo "::set-output name=CHANGED::$CHANGED" - - name: Commit Changes to dockers.json + - name: Azure login + uses: azure/docker-login@v1 + with: + login-server: ${{ secrets.AZ_CR }} + username: ${{ secrets.AZ_USERNAME }} + password: ${{ secrets.AZ_PASSWORD }} + + - name: Build and Publish Docker Images to ACR + run: | + python ./scripts/docker/build_docker.py \ + --targets manta \ + --docker-repo ${{ secrets.AZ_CR }} \ + --image-tag ${{ needs.build_args_job.outputs.image_tag }} \ + --input-json $DOCKERS_AZURE \ + --output-json $DOCKERS_AZURE \ + --prune-after-each-image \ + --disable-git-protect + + - name: Commit Changes to dockers_*.json if: steps.build_and_publish.outputs.CHANGED run: | COMMIT_SHA=${{ needs.build_args_job.outputs.head_sha }} git config --global user.name 'gatk-sv-bot' git config --global user.email '101641599+gatk-sv-bot@users.noreply.github.com' - git commit ./inputs/values/dockers.json -m "Update docker images list, triggered by "${COMMIT_SHA::8} + git commit $DOCKERS_AZURE $DOCKERS_GCP -m "Update docker images list, triggered by "${COMMIT_SHA::8} git pull --rebase origin main git push diff --git a/inputs/values/dockers_azure.json b/inputs/values/dockers_azure.json new file mode 100644 index 000000000..cbadb5319 --- /dev/null +++ b/inputs/values/dockers_azure.json @@ -0,0 +1,36 @@ +{ + "name": "dockers", + "cnmops_docker": "vahid.azurecr.io/gatk-sv/cnmops:2023-02-01-v0.26.8-beta-9b25c72d", + "condense_counts_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432", + "gatk_docker": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432", + "gatk_docker_pesr_override": "vahid.azurecr.io/tsharpe/gatk:4.2.6.1-57-g9e03432", + "gatk_docker_concordance": "vahid.azurecr.io/markw/gatk:mw-sv-concordance-937c81", + "genomes_in_the_cloud_docker": "vahid.azurecr.io/genomes-in-the-cloud:2.3.2-1510681135", + "linux_docker": "vahid.azurecr.io/google/ubuntu1804", + "manta_docker": "vahid.azurecr.io/vjalili/manta:5994670", + "melt_docker": "vahid.azurecr.io/melt:vj-4ff9de9f", + "scramble_docker": 
"vahid.azurecr.io/tsharpe/scramble:1.0.2", + "samtools_cloud_docker": "vahid.azurecr.io/gatk-sv/samtools-cloud:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_base_docker": "vahid.azurecr.io/gatk-sv/sv-base:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_base_mini_docker": "vahid.azurecr.io/vjalili/sv-base-mini:5994670", + "sv_pipeline_base_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_hail_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_updates_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_qc_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_pipeline_rdtest_docker": "vahid.azurecr.io/gatk-sv/sv-pipeline:2023-02-01-v0.26.8-beta-9b25c72d", + "wham_docker": "vahid.azurecr.io/vjalili/wham:5994670", + "igv_docker": "vahid.azurecr.io/gatk-sv/igv:mw-xz-fixes-2-b1be6a9", + "duphold_docker": "vahid.azurecr.io/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9", + "vapor_docker": "vahid.azurecr.io/eph/vapor:header-hash-2fc8f12", + "cloud_sdk_docker": "vahid.azurecr.io/google/cloud-sdk", + "pangenie_docker": "vahid.azurecr.io/vjalili/pangenie:vj-127571f", + "sv-base-virtual-env": "vahid.azurecr.io/vjalili/sv-base-virtual-env:5994670", + "cnmops-virtual-env": "vahid.azurecr.io/vjalili/cnmops-virtual-env:5994670", + "sv-pipeline-virtual-env": "vahid.azurecr.io/gatk-sv/sv-pipeline-virtual-env:2023-02-01-v0.26.8-beta-9b25c72d", + "samtools-cloud-virtual-env": "vahid.azurecr.io/vjalili/samtools-cloud-virtual-env:5994670", + "sv-utils-env": "vahid.azurecr.io/gatk-sv/sv-utils-env:2023-02-01-v0.26.8-beta-9b25c72d", + "sv_utils_docker": "vahid.azurecr.io/gatk-sv/sv-utils:2023-02-01-v0.26.8-beta-9b25c72d", + "gq_recalibrator_docker": "vahid.azurecr.io/tbrookin/gatk:0a7e1d86f", + "str": "vahid.azurecr.io/vjalili/str:5994670" +} \ No newline at end of file From 031ca6abf1b31a6bddf4e75ee290f348307141ce Mon Sep 17 00:00:00 2001 From: Vahid Date: Wed, 22 Feb 2023 10:21:35 -0500 Subject: [PATCH 17/19] Set target images for Azure based on commit SHA. (#498) --- .github/workflows/sv_pipeline_docker.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/sv_pipeline_docker.yml b/.github/workflows/sv_pipeline_docker.yml index 40c6f42b3..af67d576d 100644 --- a/.github/workflows/sv_pipeline_docker.yml +++ b/.github/workflows/sv_pipeline_docker.yml @@ -230,7 +230,8 @@ jobs: - name: Build and Publish Docker Images to ACR run: | python ./scripts/docker/build_docker.py \ - --targets manta \ + --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \ + --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \ --docker-repo ${{ secrets.AZ_CR }} \ --image-tag ${{ needs.build_args_job.outputs.image_tag }} \ --input-json $DOCKERS_AZURE \ From e2823cb8e0f708f2d300ecf6f25682258b2c9ebb Mon Sep 17 00:00:00 2001 From: Vahid Date: Thu, 2 Mar 2023 08:22:54 -0500 Subject: [PATCH 18/19] Update docs and its build action (#494) * Update website README. * Add docs on install git LFS for the website. * Build the website only if the contents of the website dir is updated. * Add a hint for how to build on M1. * Update website. * Clean Terra getting started docs. * Revert sections titles to workflow/module names. * Remove the batching section from Cohort.md as its own mdx exists. * Remove the troubleshooting section. 
* Replace the paragraph with a link to an article on gatk website. * Remove the link to the Terra workspace. * Remove links. * Reference cromwell and cromshell docs. * Remove numbers from headings. * Simplify note display. * Add a link to the exec modes in the overview. * Remove the run section. * Change the order of modules in the sidebar to match their call order. * remove docker files description. * remove the repeated content. * Remove link to batching. * remove other backends. * revert batching. * change sentence wording. * add a section on melt. * update description. * removed a sentence. * removed terra file. * revert naming for consistency. * extend intro. * add training. * add sample id req. * typo. * add sample exclusion and move id req. to input_files.md. * add gen ref panel. * fix broken links. * add faq section. * move content from wdls.md to modules/index.md. * Highlight supported backends in an admonitions. * add links for the references. * update header. * move content from cromwell to runtime_env. * Move modules to the end of the list. * Some formatting updates. * Update text. * Update text. * Match the heading level. * Remove the ref to hosting your own cromwell server. * Grammar corrections. * Text update. --- .github/workflows/docs.yml | 4 + website/README.md | 50 ++++++---- website/docs/execmodes/batch.md | 5 - website/docs/execmodes/cohort.md | 29 ++++++ website/docs/execmodes/singlesample.md | 5 + website/docs/gs/_category_.json | 7 ++ website/docs/gs/input_files.md | 92 +++++++++++++++++++ website/docs/gs/overview.md | 14 +++ website/docs/gs/quick_start.md | 77 ++++++++++++++++ website/docs/gs/runtime_env.md | 86 +++++++++++++++++ website/docs/intro.md | 40 +++++--- website/docs/modules/_category_.json | 7 ++ website/docs/modules/annotate_vcf.md | 23 +++++ website/docs/modules/cluster_batch.md | 23 +++++ website/docs/modules/downstream_filter.md | 36 ++++++++ website/docs/modules/evidence_qc.md | 67 ++++++++++++++ website/docs/modules/filter_batch.md | 40 ++++++++ website/docs/modules/gather_batch_evidence.md | 32 +++++++ .../docs/modules/gather_sample_evidence.md | 25 +++++ .../docs/modules/generate_batch_metrics.md | 23 +++++ website/docs/modules/genotype_batch.md | 27 ++++++ website/docs/modules/index.md | 38 ++++++++ website/docs/modules/make_cohort_vcf.md | 30 ++++++ website/docs/modules/merge_batch_sites.md | 20 ++++ website/docs/modules/regenotype_cnvs.md | 24 +++++ website/docs/modules/train_gcnv.md | 41 +++++++++ website/docs/modules/visualization.md | 14 +++ website/docs/run/_category_.json | 7 ++ website/docs/run/batching.md | 22 +++++ website/docs/troubleshooting/_category_.json | 7 ++ website/docs/troubleshooting/faq.md | 37 ++++++++ .../src/components/HomepageFeatures/index.js | 9 +- website/src/pages/index.js | 2 +- 33 files changed, 922 insertions(+), 41 deletions(-) delete mode 100644 website/docs/execmodes/batch.md create mode 100644 website/docs/execmodes/cohort.md create mode 100644 website/docs/gs/_category_.json create mode 100644 website/docs/gs/input_files.md create mode 100644 website/docs/gs/overview.md create mode 100644 website/docs/gs/quick_start.md create mode 100644 website/docs/gs/runtime_env.md create mode 100644 website/docs/modules/_category_.json create mode 100644 website/docs/modules/annotate_vcf.md create mode 100644 website/docs/modules/cluster_batch.md create mode 100644 website/docs/modules/downstream_filter.md create mode 100644 website/docs/modules/evidence_qc.md create mode 100644 website/docs/modules/filter_batch.md 
create mode 100644 website/docs/modules/gather_batch_evidence.md create mode 100644 website/docs/modules/gather_sample_evidence.md create mode 100644 website/docs/modules/generate_batch_metrics.md create mode 100644 website/docs/modules/genotype_batch.md create mode 100644 website/docs/modules/index.md create mode 100644 website/docs/modules/make_cohort_vcf.md create mode 100644 website/docs/modules/merge_batch_sites.md create mode 100644 website/docs/modules/regenotype_cnvs.md create mode 100644 website/docs/modules/train_gcnv.md create mode 100644 website/docs/modules/visualization.md create mode 100644 website/docs/run/_category_.json create mode 100644 website/docs/run/batching.md create mode 100644 website/docs/troubleshooting/_category_.json create mode 100644 website/docs/troubleshooting/faq.md diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 3d861fc53..3f160a164 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -3,8 +3,12 @@ name: documentation on: pull_request: branches: [main] + paths: + - 'website/**' push: branches: [main] + paths: + - 'website/**' jobs: Test: diff --git a/website/README.md b/website/README.md index aaba2fa1e..f44da0d10 100644 --- a/website/README.md +++ b/website/README.md @@ -1,41 +1,51 @@ # Website -This website is built using [Docusaurus 2](https://docusaurus.io/), a modern static website generator. +The GATK-SV [technical documentation website](https://broadinstitute.github.io/gatk-sv/) +is built using [Docusaurus](https://docusaurus.io/). The website is built +from the Markdown (`.md`) files available in the `website/docs/` directory. +The website is built into the [GATK-SV CI/CD](https://github.com/broadinstitute/gatk-sv/blob/main/.github/workflows/docs.yml), +hence, any changes to the `website/` is automatically tested, built, and deployed. -### Installation -``` -$ yarn -``` +## Installation +You may build and run it for local development using the following commands. + +### Prerequisite -### Local Development +- Install [Node.js](https://nodejs.org/en/download/); +- Install [Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage) +since GATK-SV tracks images using `git lfs`. After you `git clone` the GATK-SV repository, run the +following command to fetch the `LFS`-tracked files. + ``` + $ git lfs install + $ git lfs fetch --all + $ git lfs pull + ``` +- Mac M1 users: You may need to [set IPv6 configuration](https://support.apple.com/guide/mac-help/use-ipv6-on-mac-mchlp2499/mac) +to `Link-Local Only` for `npm` to successfully install packages. + +### Install ``` -$ yarn start +$ cd website/ +$ npm install ``` -This command starts a local development server and opens up a browser window. Most changes are reflected live without having to restart the server. +This command installs necessary packages and creates `package-lock.json` +for the reproducibility of deployment. ### Build ``` -$ yarn build +$ npm run build ``` This command generates static content into the `build` directory and can be served using any static contents hosting service. -### Deployment - -Using SSH: +### Start ``` -$ USE_SSH=true yarn deploy +$ npm run start ``` -Not using SSH: - -``` -$ GIT_USER= yarn deploy -``` - -If you are using GitHub pages for hosting, this command is a convenient way to build the website and push to the `gh-pages` branch. +This command starts a local development server and opens up a browser window. 
Most changes are reflected live without having to restart the server. diff --git a/website/docs/execmodes/batch.md b/website/docs/execmodes/batch.md deleted file mode 100644 index 6980383eb..000000000 --- a/website/docs/execmodes/batch.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -sidebar_position: 2 ---- - -# Batch execution mode diff --git a/website/docs/execmodes/cohort.md b/website/docs/execmodes/cohort.md new file mode 100644 index 000000000..969f9ebeb --- /dev/null +++ b/website/docs/execmodes/cohort.md @@ -0,0 +1,29 @@ +--- +sidebar_position: 2 +--- + +# Cohort mode + +A minimum cohort size of 100 is required, and a roughly equal +number of males and females is recommended. For modest cohorts +(~100-500 samples), the pipeline can be run as a single batch +using `GATKSVPipelineBatch.wdl`. + +For larger cohorts, samples should be split up into batches of +about 100-500 samples. + +The pipeline should be executed as follows: + +- Modules [GatherSampleEvidence](/docs/modules/gse) and + [EvidenceQC](/docs/modules/eqc) can be run on arbitrary cohort partitions + +- Modules [GatherBatchEvidence](/docs/modules/gbe), + [ClusterBatch](/docs/modules/cb), [GenerateBatchMetrics](/docs/modules/gbm), + and [FilterBatch](/docs/modules/fb) are run separately per batch + +- [GenotypeBatch](/docs/modules/gb) is run separately per batch, + using filtered variants ([FilterBatch](/docs/modules/fb) output) combined across all batches + +- [MakeCohortVcf](/docs/modules/cvcf) and beyond are run on all batches together + +Note: [GatherBatchEvidence](/docs/modules/gbe) requires a [trained gCNV model](/docs/modules/gcnv). diff --git a/website/docs/execmodes/singlesample.md b/website/docs/execmodes/singlesample.md index 6ef9609ee..df35bf5a0 100644 --- a/website/docs/execmodes/singlesample.md +++ b/website/docs/execmodes/singlesample.md @@ -4,3 +4,8 @@ sidebar_position: 1 # Single-sample execution mode +`GATKSVPipelineSingleSample.wdl` runs the pipeline on a single sample using +a fixed reference panel. An example run with reference panel containing 156 +samples from the +[NYGC 1000G Terra workspace](https://app.terra.bio/#workspaces/anvil-datastorage/1000G-high-coverage-2019) +can be found in `inputs/build/NA12878/test` after building inputs). diff --git a/website/docs/gs/_category_.json b/website/docs/gs/_category_.json new file mode 100644 index 000000000..6b9e1272a --- /dev/null +++ b/website/docs/gs/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Getting Started", + "position": 2, + "link": { + "type": "generated-index" + } +} \ No newline at end of file diff --git a/website/docs/gs/input_files.md b/website/docs/gs/input_files.md new file mode 100644 index 000000000..724a007f8 --- /dev/null +++ b/website/docs/gs/input_files.md @@ -0,0 +1,92 @@ +--- +title: Input Data +description: Supported input and reference data. +sidebar_position: 3 +slug: ./inputs +--- + +GATK-SV requires the following input data: + +- Illumina short-read whole-genome CRAMs or BAMs, aligned to hg38 with [bwa-mem](https://github.com/lh3/bwa). + BAMs must also be indexed. + +- Family structure definitions file in + [PED format](https://gatk.broadinstitute.org/hc/en-us/articles/360035531972-PED-Pedigree-format). + **Note**: Sex aneuploidies (detected in EvidenceQC) should be entered as sex = 0. + +### Sample Exclusion +We recommend filtering out samples with a high percentage +of improperly paired reads (>10% or an outlier for your data) +as technical outliers prior to running [GatherSampleEvidence](/docs/modules/gse). 
+A high percentage of improperly paired reads may indicate issues +with library prep, degradation, or contamination. Artifactual +improperly paired reads could cause incorrect SV calls, and +these samples have been observed to have longer runtimes and +higher compute costs for [GatherSampleEvidence](/docs/modules/gse). + + +### Sample ID requirements +#### Sample IDs must + +- Be unique within the cohort +- Contain only alphanumeric characters and underscores (no dashes, whitespace, or special characters) + +#### Sample IDs should not + +- Contain only numeric characters +- Be a substring of another sample ID in the same cohort +- Contain any of the following substrings: `chr`, `name`, `DEL`, `DUP`, `CPX`, `CHROM` + +The same requirements apply to family IDs in the PED file, +as well as batch IDs and the cohort ID provided as workflow inputs. + +Sample IDs are provided to GatherSampleEvidence directly and +need not match sample names from the BAM/CRAM headers. +`GetSampleID.wdl` can be used to fetch BAM sample IDs and +also generates a set of alternate IDs that are considered +safe for this pipeline; alternatively, [this script](https://github.com/talkowski-lab/gnomad_sv_v3/blob/master/sample_id/convert_sample_ids.py) +transforms a list of sample IDs to fit these requirements. +Currently, sample IDs can be replaced again in [GatherBatchEvidence](/docs/modules/gbe). + +The following inputs will need to be updated with the transformed sample IDs: + +- Sample ID list for [GatherSampleEvidence](/docs/modules/gse) or [GatherBatchEvidence](/docs/modules/gbe) +- PED file + + +### Generating a reference panel (single-sample mode only) +New reference panels can be generated easily from a single run of the `GATKSVPipelineBatch` workflow. If using a Cromwell server, we recommend copying the outputs to a permanent location by adding the following option to the workflow configuration file: +``` + "final_workflow_outputs_dir" : "gs://my-outputs-bucket", + "use_relative_output_paths": false, +``` +Here is an example of how to generate workflow input jsons from `GATKSVPipelineBatch` workflow metadata: +``` +> cromshell -t60 metadata 38c65ca4-2a07-4805-86b6-214696075fef > metadata.json +> python scripts/inputs/create_test_batch.py \ + --execution-bucket gs://my-exec-bucket \ + --final-workflow-outputs-dir gs://my-outputs-bucket \ + metadata.json \ + > inputs/values/my_ref_panel.json +> # Define your google project id (for Cromwell inputs) and Terra billing project (for workspace inputs) +> echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json +> # Build test files for batched workflows (google cloud project id required) +> python scripts/inputs/build_inputs.py \ + inputs/values \ + inputs/templates/test \ + inputs/build/my_ref_panel/test \ + -a '{ "test_batch" : "ref_panel_1kg", "cloud_env": "google_cloud.my_project" }' +> # Build test files for the single-sample workflow +> python scripts/inputs/build_inputs.py \ + inputs/values \ + inputs/templates/test/GATKSVPipelineSingleSample \ + inputs/build/NA19240/test_my_ref_panel \ + -a '{ "single_sample" : "test_single_sample_NA19240", "ref_panel" : "my_ref_panel" }' +> # Build files for a Terra workspace +> python scripts/inputs/build_inputs.py \ + inputs/values \ + inputs/templates/terra_workspaces/single_sample \ + inputs/build/NA12878/terra_my_ref_panel \ + -a '{ "single_sample" : "test_single_sample_NA12878", "ref_panel" : "my_ref_panel" }' +``` +Note that 
the inputs to `GATKSVPipelineBatch` may be used as resources for the reference panel and therefore should also be in a permanent location. diff --git a/website/docs/gs/overview.md b/website/docs/gs/overview.md new file mode 100644 index 000000000..eb839c99e --- /dev/null +++ b/website/docs/gs/overview.md @@ -0,0 +1,14 @@ +--- +title: Overview +description: Setup and options for running GATK-SV. +sidebar_position: 0 +--- + +# Overview + +GATK-SV is a highly-scalable cloud-native pipeline for structural variation discovery +on Illumina short-read whole-genome sequencing (WGS) data. +The pipeline genotypes structural variations using Docker-based tools, modularized in +different components, and orchestrated using Cromwell. + +The pipeline runs in two modes: [Cohort](/docs/execmodes/cohort) and [single-sample](/docs/execmodes/singlesample). diff --git a/website/docs/gs/quick_start.md b/website/docs/gs/quick_start.md new file mode 100644 index 000000000..3a23c546c --- /dev/null +++ b/website/docs/gs/quick_start.md @@ -0,0 +1,77 @@ +--- +title: Quick Start +description: Run the pipeline on demo data. +sidebar_position: 1 +slug: ./qs +--- + +This page provides steps for running the pipeline using demo data. + +# Quick Start on Cromwell + +This section walks you through the steps of running pipeline using +demo data on a managed Cromwell server. + +### Setup Environment + +- A running instance of a Cromwell server. + +- Install Cromshell and configure it to connect with the Cromwell server you are using. + You may refer to the documentation on [Cromshell README](https://github.com/broadinstitute/cromshell). + +### Build Inputs + +- Example workflow inputs can be found in `/inputs`. + Build using `scripts/inputs/build_default_inputs.sh`, + which generates input jsons in `/inputs/build`. + +- Some workflows require a Google Cloud Project ID to be defined in + a cloud environment parameter group. Workspace builds require a + Terra billing project ID as well. An example is provided at + `/inputs/values/google_cloud.json` but should not be used, + as modifying this file will cause tracked changes in the repository. + Instead, create a copy in the same directory with the format + `google_cloud.my_project.json` and modify as necessary. + + Note that these inputs are required only when certain data are + located in requester pays buckets. If this does not apply, + users may use placeholder values for the cloud configuration + and simply delete the inputs manually. + +### MELT +Important: The example input files contain MELT inputs that are NOT public +(see [Requirements](https://github.com/broadinstitute/gatk-sv#requirements)). These include: + +- `GATKSVPipelineSingleSample.melt_docker` and `GATKSVPipelineBatch.melt_docker` - MELT docker URI +(see [Docker readme](https://github.com/talkowski-lab/gatk-sv-v1/blob/master/dockerfiles/README.md)) +- `GATKSVPipelineSingleSample.ref_std_melt_vcfs` - Standardized MELT VCFs ([GatherBatchEvidence](/docs/modules/gbe)) +The input values are provided only as an example and are not publicly accessible. +- In order to include MELT, these values must be provided by the user. MELT can be + disabled by deleting these inputs and setting `GATKSVPipelineBatch.use_melt` to false. 
+ +### Requester Pays Buckets + +The following parameters must be set when certain input data is in requester pays (RP) buckets: + +`GATKSVPipelineSingleSample.requester_pays_cram` and +`GATKSVPipelineBatch.GatherSampleEvidenceBatch.requester_pays_crams` - +set to `True` if inputs are CRAM format and in an RP bucket, otherwise `False`. + + +### Execution + +```shell +> mkdir gatksv_run && cd gatksv_run +> mkdir wdl && cd wdl +> cp $GATK_SV_ROOT/wdl/*.wdl . +> zip dep.zip *.wdl +> cd .. +> echo '{ "google_project_id": "my-google-project-id", "terra_billing_project_id": "my-terra-billing-project" }' > inputs/values/google_cloud.my_project.json +> bash scripts/inputs/build_default_inputs.sh -d $GATK_SV_ROOT -c google_cloud.my_project +> cp $GATK_SV_ROOT/inputs/build/ref_panel_1kg/test/GATKSVPipelineBatch/GATKSVPipelineBatch.json GATKSVPipelineBatch.my_run.json +> cromshell submit wdl/GATKSVPipelineBatch.wdl GATKSVPipelineBatch.my_run.json cromwell_config.json wdl/dep.zip +``` + +where `cromwell_config.json` is a Cromwell +[workflow options file](https://cromwell.readthedocs.io/en/stable/wf_options/Overview/). +Note users will need to re-populate batch/sample-specific parameters (e.g. BAMs and sample IDs). diff --git a/website/docs/gs/runtime_env.md b/website/docs/gs/runtime_env.md new file mode 100644 index 000000000..8dc54cdb1 --- /dev/null +++ b/website/docs/gs/runtime_env.md @@ -0,0 +1,86 @@ +--- +title: Runtime Environments +description: Describes the supported runtime environments. +sidebar_position: 7 +slug: ./runtime-env +--- + +The GATK-SV pipeline consists of _workflows_ and _reference data_ that +orchestrates the analysis flow of input data. Hence, a successful +execution requires running the _workflows_ on _reference_ and input data. + +:::info Currently supported backends: GCP +GATK-SV has been tested only on the Google Cloud Platform (GCP); +therefore, we are unable to provide specific guidance or support +for other execution platforms including HPC clusters and AWS. +::: + +## Alternative backends + +Contributions from the community to improve portability between backends +will be considered on a case-by-case-basis. We ask contributors to +please adhere to the following guidelines when submitting issues and pull requests: + +1. Code changes must be functionally equivalent on GCP backends, i.e. not result in changed output +2. Increases to cost and runtime on GCP backends should be minimal +3. Avoid adding new inputs and tasks to workflows. Simpler changes + are more likely to be approved, e.g. small in-line changes to scripts or WDL task command sections +4. Avoid introducing new code paths, e.g. conditional statements +5. Additional backend-specific scripts, workflows, tests, and Dockerfiles will not be approved +6. Changes to Dockerfiles may require extensive testing before approval + +We still encourage members of the community to adapt GATK-SV for non-GCP backends +and share code on forked repositories. Here are a some considerations: + +- Refer to Cromwell's [documentation](https://cromwell.readthedocs.io/en/stable/backends/Backends/) + for configuration instructions. + +- The handling and ordering of `glob` commands may differ between platforms. + +- Shell commands that are potentially destructive to input files + (e.g. `rm`, `mv`, `tabix`) can cause unexpected behavior on shared filesystems. + Enabling [copy localization](https://cromwell.readthedocs.io/en/stable/Configuring/#local-filesystem-options) + may help to more closely replicate the behavior on GCP. 
+ +- For clusters that do not support Docker, Singularity is an alternative. + See [Cromwell documentation on Singularity](https://cromwell.readthedocs.io/en/stable/tutorials/Containers/#singularity). + +- The GATK-SV pipeline takes advantage of the massive parallelization possible in the cloud. + Local backends may not have the resources to execute all of the workflows. + Workflows that use fewer resources or that are less parallelized may be more successful. + For instance, some users have been able to run [GatherSampleEvidence](#gather-sample-evidence) on a SLURM cluster. + +# Cromwell + +[Cromwell](https://github.com/broadinstitute/cromwell) is a workflow management system +that takes a workflow (e.g., a workflow written in [Workflow Description Language (WDL)](https://openwdl.org)), +its dependencies and input data, and runs it on a given platform +(e.g., [GCP](https://cromwell.readthedocs.io/en/stable/backends/Google/)). +In order to run a workflow on Cromwell, you need a running instance of +Cromwell that is available in two forms: [Server and stand-alone mode](https://cromwell.readthedocs.io/en/stable/Modes/). + +In general, you may use a managed Cromwell server maintained by your +institute or host a self-managed server, or run Cromwell as a standalone Java application. +The former is ideal for large scale execution in a managed environment, +and the latter is useful for small scale and isolated WDL development. + +:::info +Due to its dependency on cloud-hosted resources and large-scale execution needs, +we currently do not support running the entire GATK-SV pipeline using +Cromwell as a [stand-alone Java application](https://cromwell.readthedocs.io/en/stable/Modes/#run) +::: + +# Cromwell Server + +There are two option to communicate with a running Cromwell server: +[REST API](https://cromwell.readthedocs.io/en/stable/tutorials/ServerMode/), and +[Cromshell](https://github.com/broadinstitute/cromshell) which is a command line tool +to interface with a Cromwell server. We recommend using Cromshell due to its simplicity +of use. This documentation is explained using Cromshell, but the same steps can be +taken using the REST API. + +- **Setup Cromwell**: You may follow [this](https://cromwell.readthedocs.io/en/stable/Modes/) documentation + on setting up a Cromwell server. + +- **Setup Cromshell**: You may follow [this](https://github.com/broadinstitute/cromshell) documentation + on installing and configuring Cromshell to communicate with the Cromwell server. diff --git a/website/docs/intro.md b/website/docs/intro.md index 6793271f0..46d0da8a7 100644 --- a/website/docs/intro.md +++ b/website/docs/intro.md @@ -1,16 +1,32 @@ --- +title: About +description: GATK-SV sidebar_position: 1 --- -# About - -Structural variants are key players in human evolution and -disease, but they are difficult to discover and analyze. -There are many tools out there to discover structural variants; -some algorithms are specialized to detect certain structural -variants or only work with certain SV categories. GATK-SV is -an ensemble method that specializes in bringing together -the best evidence from each tool in a high quality format. -In this article, we will go over the evidence categories -for detecting structural variants and the structural variant -types that we report in GATK-SV. \ No newline at end of file +GATK-SV is a comprehensive, cloud-based ensemble pipeline to capture and annotate all +classes of structural variants (SV) from whole genome sequencing (WGS). 
It can detect +deletions, duplications, multi-allelic copy number variants, balanced inversions, +insertions, translocations, and a diverse spectrum of complex SV. Briefly, GATK-SV +maximizes the sensitivity of SV discovery by harmonizing output from five tools +(Manta, Wham, cnMOPS, GATK-gCNV, MELT). In order to reduce false positives, raw SV +are adjudicated and re-genotyped from read evidence considering all potential +sequencing evidence including anomalous paired-end (PE) reads, split reads (SR) crossing +a breakpoint, normalized read-depth (RD), and B-allele frequencies (BAF). It also +fully resolves 11 classes of complex SV based on their breakpoint signature. GATK-SV +has been designed to be deployed in the Google Cloud Platform via the cromwell +execution engine, which allows massively parallel scaling. Further details about +GATK--SV can be found in [Collins et al. 2020, Nature](https://www.nature.com/articles/s41586-020-2287-8). + + +A high-level description of GATK-SV is available [here](https://gatk.broadinstitute.org/hc/en-us/articles/9022487952155-Structural-variant-SV-discovery). + +### Citation + +Please cite the following publication: + +- [Collins, Brand, et al. 2020. "A structural variation reference for medical and population genetics." Nature 581, 444-451.](https://doi.org/10.1038/s41586-020-2287-8) + +Additional references: + +- [Werling et al. 2018. "An analytical framework for whole-genome sequence association studies and its implications for autism spectrum disorder." Nature genetics 50.5, 727-736.](https://doi.org/10.1038/s41588-018-0107-y) diff --git a/website/docs/modules/_category_.json b/website/docs/modules/_category_.json new file mode 100644 index 000000000..bb1a47f82 --- /dev/null +++ b/website/docs/modules/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Modules", + "position": 6, + "link": { + "type": "generated-index" + } +} diff --git a/website/docs/modules/annotate_vcf.md b/website/docs/modules/annotate_vcf.md new file mode 100644 index 000000000..f4d965b22 --- /dev/null +++ b/website/docs/modules/annotate_vcf.md @@ -0,0 +1,23 @@ +--- +title: AnnotateVCF +description: Annotate VCF (work in progress) +sidebar_position: 13 +slug: av +--- + +Add annotations, such as the inferred function and +allele frequencies of variants, to final VCF. + +Annotations methods include: + +- Functional annotation - annotate SVs with inferred functional + consequence on protein-coding regions, regulatory regions + such as UTR and promoters, and other non-coding elements. + +- Allele Frequency annotation - annotate SVs with their allele + frequencies across all samples, and samples of specific sex, + as well as specific sub-populations. + +- Allele Frequency annotation with external callset - annotate + SVs with the allele frequencies of their overlapping SVs in + another callset, eg. gnomad SV callset. diff --git a/website/docs/modules/cluster_batch.md b/website/docs/modules/cluster_batch.md new file mode 100644 index 000000000..eb3435e24 --- /dev/null +++ b/website/docs/modules/cluster_batch.md @@ -0,0 +1,23 @@ +--- +title: ClusterBatch +description: Cluster Batch +sidebar_position: 5 +slug: cb +--- + +Clusters SV calls across a batch. 
+ +### Prerequisites + +- GatherBatchEvidence + + +### Inputs + +- Standardized call VCFs (GatherBatchEvidence) +- Depth-only (DEL/DUP) calls (GatherBatchEvidence) + +### Outputs + +- Clustered SV VCFs +- Clustered depth-only call VCF diff --git a/website/docs/modules/downstream_filter.md b/website/docs/modules/downstream_filter.md new file mode 100644 index 000000000..f06df455b --- /dev/null +++ b/website/docs/modules/downstream_filter.md @@ -0,0 +1,36 @@ +--- +title: Downstream Filtering +description: Downstream filtering (work in progress) +sidebar_position: 12 +slug: df +--- + +Apply downstream filtering steps to the cleaned VCF to further +control the false discovery rate; all steps are optional +and users should decide based on the specific purpose of +their projects. + +Filtering methods include: + +- minGQ - remove variants based on the genotype quality across + populations. Note: Trio families are required to build the minGQ + filtering model in this step. We provide tables pre-trained with + the 1000 genomes samples at different FDR thresholds for projects + that lack family structures, and they can be found at the paths below. + These tables assume that GQ has a scale of [0,999], so they will + not work with newer VCFs where GQ has a scale of [0,99]. + + ```shell + gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/mingq/1KGP_2504_and_698_with_GIAB.10perc_fdr.PCRMINUS.minGQ.filter_lookup_table.txt + gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/mingq/1KGP_2504_and_698_with_GIAB.1perc_fdr.PCRMINUS.minGQ.filter_lookup_table.txt + gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/mingq/1KGP_2504_and_698_with_GIAB.5perc_fdr.PCRMINUS.minGQ.filter_lookup_table.txt + ``` + +- BatchEffect - remove variants that show significant discrepancies + in allele frequencies across batches + +- FilterOutlierSamplesPostMinGQ - remove outlier samples with unusually + high or low number of SVs + +- FilterCleanupQualRecalibration - sanitize filter columns and + recalibrate variant QUAL scores for easier interpretation diff --git a/website/docs/modules/evidence_qc.md b/website/docs/modules/evidence_qc.md new file mode 100644 index 000000000..d6127d2ad --- /dev/null +++ b/website/docs/modules/evidence_qc.md @@ -0,0 +1,67 @@ +--- +title: EvidenceQC +description: Evidence QC +sidebar_position: 2 +slug: eqc +--- + +Runs ploidy estimation, dosage scoring, and optionally VCF QC. +The results from this module can be used for QC and batching. + +For large cohorts, this workflow can be run on arbitrary cohort +partitions of up to about 500 samples. Afterwards, we recommend +using the results to divide samples into smaller batches (~100-500 samples) +with ~1:1 male:female ratio. Refer to the [Batching](/docs/run/batching) section +for further guidance on creating batches. + +We also recommend using sex assignments generated from the ploidy +estimates and incorporating them into the PED file, with sex = 0 for sex aneuploidies. 
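+
+As a concrete (hypothetical) illustration, the PED file uses the standard six-column
+pedigree format (family ID, sample ID, paternal ID, maternal ID, sex, phenotype), so a
+sample with a sex aneuploidy would simply carry a 0 in the sex column:
+
+```
+FAM001  sample_01  0  0  1  0
+FAM002  sample_02  0  0  0  0
+```
+
+Here `sample_02` is the sample reassigned sex = 0; the family and sample IDs are
+placeholders, not values produced by the pipeline.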
+ +### Prerequisites + +- [Gather Sample Evidence](./gse) + +### Inputs + +- Read count files (GatherSampleEvidence) +- (Optional) SV call VCFs (GatherSampleEvidence) + +### Outputs + +- Per-sample dosage scores with plots +- Median coverage per sample +- Ploidy estimates, sex assignments, with plots +- (Optional) Outlier samples detected by call counts + +## Preliminary Sample QC + +The purpose of sample filtering at this stage after EvidenceQC is to +prevent very poor quality samples from interfering with the results for +the rest of the callset. In general, samples that are borderline are +okay to leave in, but you should choose filtering thresholds to suit +the needs of your cohort and study. There will be future opportunities +(as part of FilterBatch) for filtering before the joint genotyping +stage if necessary. Here are a few of the basic QC checks that we recommend: + +- Look at the X and Y ploidy plots, and check that sex assignments + match your expectations. If there are discrepancies, check for + sample swaps and update your PED file before proceeding. + +- Look at the dosage score (WGD) distribution and check that + it is centered around 0 (the distribution of WGD for PCR- + samples is expected to be slightly lower than 0, and the distribution + of WGD for PCR+ samples is expected to be slightly greater than 0. + Refer to the gnomAD-SV paper for more information on WGD score). + Optionally filter outliers. + +- Look at the low outliers for each SV caller (samples with + much lower than typical numbers of SV calls per contig for + each caller). An empty low outlier file means there were + no outliers below the median and no filtering is necessary. + Check that no samples had zero calls. + +- Look at the high outliers for each SV caller and optionally + filter outliers; samples with many more SV calls than average may be poor quality. + +- Remove samples with autosomal aneuploidies based on + the per-batch binned coverage plots of each chromosome. diff --git a/website/docs/modules/filter_batch.md b/website/docs/modules/filter_batch.md new file mode 100644 index 000000000..307c1acdd --- /dev/null +++ b/website/docs/modules/filter_batch.md @@ -0,0 +1,40 @@ +--- +title: FilterBatch +description: Filter Batch +sidebar_position: 7 +slug: fb +--- + +Filters poor quality variants and filters outlier samples. +This workflow can be run all at once with the WDL at wdl/FilterBatch.wdl, +or it can be run in three steps to enable tuning of outlier +filtration cutoffs. The three subworkflows are: + +1. FilterBatchSites: Per-batch variant filtration + +2. PlotSVCountsPerSample: Visualize SV counts per + sample per type to help choose an IQR cutoff for + outlier filtering, and preview outlier samples for a given cutoff + +3. FilterBatchSamples: Per-batch outlier sample filtration; + provide an appropriate outlier_cutoff_nIQR based on the + SV count plots and outlier previews from step 2. Note + that not removing high outliers can result in increased + compute cost and a higher false positive rate in later steps. + +### Prerequisites + +- Generate Batch Metrics + +### Inputs + +- Batch PED file +- Metrics file (GenerateBatchMetrics) +- Clustered SV and depth-only call VCFs (ClusterBatch) + +### Outputs + +- Filtered SV (non-depth-only a.k.a. 
"PESR") VCF with outlier samples excluded +- Filtered depth-only call VCF with outlier samples excluded +- Random forest cutoffs file +- PED file with outlier samples excluded \ No newline at end of file diff --git a/website/docs/modules/gather_batch_evidence.md b/website/docs/modules/gather_batch_evidence.md new file mode 100644 index 000000000..d6de3948f --- /dev/null +++ b/website/docs/modules/gather_batch_evidence.md @@ -0,0 +1,32 @@ +--- +title: GatherBatchEvidence +description: Gather Batch Evidence +sidebar_position: 4 +slug: gbe +--- + +Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample +raw evidence into a batch. See above for more information on batching. + +### Prerequisites + +- GatherSampleEvidence +- (Recommended) EvidenceQC +- gCNV training. + +### Inputs +- PED file (updated with EvidenceQC sex assignments, including sex = 0 + for sex aneuploidies. Calls will not be made on sex chromosomes + when sex = 0 in order to avoid generating many confusing calls + or upsetting normalized copy numbers for the batch.) +- Read count, BAF, PE, SD, and SR files (GatherSampleEvidence) +- Caller VCFs (GatherSampleEvidence) +- Contig ploidy model and gCNV model files (gCNV training) + +### Outputs + +- Combined read count matrix, SR, PE, and BAF files +- Standardized call VCFs +- Depth-only (DEL/DUP) calls +- Per-sample median coverage estimates +- (Optional) Evidence QC plots diff --git a/website/docs/modules/gather_sample_evidence.md b/website/docs/modules/gather_sample_evidence.md new file mode 100644 index 000000000..918a1b27d --- /dev/null +++ b/website/docs/modules/gather_sample_evidence.md @@ -0,0 +1,25 @@ +--- +title: GatherSampleEvidence +description: Gather Sample Evidence +sidebar_position: 1 +slug: gse +--- + +Runs raw evidence collection on each sample with the following SV callers: +Manta, Wham, and/or MELT. For guidance on pre-filtering prior to GatherSampleEvidence, +refer to the Sample Exclusion section. + +Note: a list of sample IDs must be provided. Refer to the sample ID +requirements for specifications of allowable sample IDs. +IDs that do not meet these requirements may cause errors. + +### Inputs + +- Per-sample BAM or CRAM files aligned to hg38. Index files (.bai) must be provided if using BAMs. + +### Outputs + +- Caller VCFs (Manta, MELT, and/or Wham) +- Binned read counts file +- Split reads (SR) file +- Discordant read pairs (PE) file diff --git a/website/docs/modules/generate_batch_metrics.md b/website/docs/modules/generate_batch_metrics.md new file mode 100644 index 000000000..3f202e39b --- /dev/null +++ b/website/docs/modules/generate_batch_metrics.md @@ -0,0 +1,23 @@ +--- +title: GenerateBatchMetrics +description: Generate Batch Metrics +sidebar_position: 6 +slug: gbm +--- + +Generates variant metrics for filtering. + +### Prerequisites + +- Cluster batch + +### Inputs + +- Combined read count matrix, SR, PE, and BAF files (GatherBatchEvidence) +- Per-sample median coverage estimates (GatherBatchEvidence) +- Clustered SV VCFs (ClusterBatch) +- Clustered depth-only call VCF (ClusterBatch) + +### Outputs + +- Metrics file diff --git a/website/docs/modules/genotype_batch.md b/website/docs/modules/genotype_batch.md new file mode 100644 index 000000000..062aa9098 --- /dev/null +++ b/website/docs/modules/genotype_batch.md @@ -0,0 +1,27 @@ +--- +title: GenotypeBatch +description: Genotype Batch +sidebar_position: 9 +slug: gb +--- + +Genotypes a batch of samples across unfiltered variants combined across all batches. 
+
+### Prerequisites
+- Filter batch
+- Merge Batch Sites
+
+### Inputs
+
+- Batch PESR and depth VCFs (FilterBatch)
+- Cohort PESR and depth VCFs (MergeBatchSites)
+- Batch read count, PE, and SR files (GatherBatchEvidence)
+
+### Outputs
+
+- Filtered SV (non-depth-only a.k.a. "PESR") VCF with outlier samples excluded
+- Filtered depth-only call VCF with outlier samples excluded
+- PED file with outlier samples excluded
+- List of SR pass variants
+- List of SR fail variants
+- (Optional) Depth re-genotyping intervals list
diff --git a/website/docs/modules/index.md b/website/docs/modules/index.md
new file mode 100644
index 000000000..2ffea615f
--- /dev/null
+++ b/website/docs/modules/index.md
@@ -0,0 +1,38 @@
+---
+title: Overview
+description: Overview of the constituent components
+sidebar_position: 0
+---
+
+The pipeline is written in [Workflow Description Language (WDL)](https://openwdl.org)
+and consists of multiple modules to be executed in the following order.
+
+- **GatherSampleEvidence** SV evidence collection, including calls from a configurable set of
+  algorithms (Manta, MELT, and Wham), read depth (RD), split read positions (SR),
+  and discordant pair positions (PE).
+
+- **EvidenceQC** Dosage bias scoring and ploidy estimation.
+
+- **GatherBatchEvidence** Copy number variant calling using
+  `cn.MOPS` and `GATK gCNV`; B-allele frequency (BAF) generation;
+  call and evidence aggregation.
+
+- **ClusterBatch** Variant clustering
+
+- **GenerateBatchMetrics** Variant filtering metric generation
+
+- **FilterBatch** Variant filtering; outlier exclusion
+
+- **GenotypeBatch** Genotyping
+
+- **MakeCohortVcf** Cross-batch integration; complex variant resolution and re-genotyping; VCF cleanup
+
+- **Module 07 (in development)** Downstream filtering, including minGQ, batch effect checks,
+  outlier sample removal, and final recalibration;
+
+- **AnnotateVCF** Annotations, including functional annotation,
+  allele frequency (AF) annotation, and AF annotation with external population callsets;
+
+- **Module 09 (in development)** Visualization, including scripts that generate IGV screenshots and RD plots.
+
+- Additional modules to be added: de novo and mosaic scripts
diff --git a/website/docs/modules/make_cohort_vcf.md b/website/docs/modules/make_cohort_vcf.md
new file mode 100644
index 000000000..d16e22acf
--- /dev/null
+++ b/website/docs/modules/make_cohort_vcf.md
@@ -0,0 +1,30 @@
+---
+title: MakeCohortVcf
+description: Make Cohort VCF
+sidebar_position: 11
+slug: cvcf
+---
+
+Combines variants across multiple batches, resolves complex variants,
+re-genotypes, and performs final VCF clean-up.
+ +### Prerequisites + +- GenotypeBatch +- (Optional) RegenotypeCNVs + +### Inputs + +- RD, PE and SR file URIs (GatherBatchEvidence) +- Batch filtered PED file URIs (FilterBatch) +- Genotyped PESR VCF URIs (GenotypeBatch) +- Genotyped depth VCF URIs (GenotypeBatch or RegenotypeCNVs) +- SR pass variant file URIs (GenotypeBatch) +- SR fail variant file URIs (GenotypeBatch) +- Genotyping cutoff file URIs (GenotypeBatch) +- Batch IDs +- Sample ID list URIs + +### Outputs + +- Finalized "cleaned" VCF and QC plots \ No newline at end of file diff --git a/website/docs/modules/merge_batch_sites.md b/website/docs/modules/merge_batch_sites.md new file mode 100644 index 000000000..6bfe43789 --- /dev/null +++ b/website/docs/modules/merge_batch_sites.md @@ -0,0 +1,20 @@ +--- +title: MergeBatchSites +description: Merge Batch Sites +sidebar_position: 8 +slug: msites +--- + +Combines filtered variants across batches. The WDL can be found at: /wdl/MergeBatchSites.wdl. + +### Prerequisites +- Filter Batch + +### Inputs + +- List of filtered PESR VCFs (FilterBatch) +- List of filtered depth VCFs (FilterBatch) + +### Outputs + +- Combined cohort PESR and depth VCFs diff --git a/website/docs/modules/regenotype_cnvs.md b/website/docs/modules/regenotype_cnvs.md new file mode 100644 index 000000000..cb5356b7e --- /dev/null +++ b/website/docs/modules/regenotype_cnvs.md @@ -0,0 +1,24 @@ +--- +title: ReGenotypeCNVs +description: Regenotype CNVs +sidebar_position: 10 +slug: rgcnvs +--- + +Re-genotypes probable mosaic variants across multiple batches. + +### Prerequisites +- Genotype batch + +### Inputs + +- Per-sample median coverage estimates (GatherBatchEvidence) +- Pre-genotyping depth VCFs (FilterBatch) +- Batch PED files (FilterBatch) +- Cohort depth VCF (MergeBatchSites) +- Genotyped depth VCFs (GenotypeBatch) +- Genotyped depth RD cutoffs file (GenotypeBatch) + +### Outputs + +- Re-genotyped depth VCFs. diff --git a/website/docs/modules/train_gcnv.md b/website/docs/modules/train_gcnv.md new file mode 100644 index 000000000..7bbd8e934 --- /dev/null +++ b/website/docs/modules/train_gcnv.md @@ -0,0 +1,41 @@ +--- +title: TrainGCNV +description: Train gCNV +sidebar_position: 3 +slug: gcnv +--- + +Trains a gCNV model for use in GatherBatchEvidence. +The WDL can be found at /wdl/TrainGCNV.wdl. + +Both the cohort and single-sample modes use the +GATK gCNV depth calling pipeline, which requires a +trained model as input. The samples used for training +should be technically homogeneous and similar to the +samples to be processed (i.e. same sample type, +library prep protocol, sequencer, sequencing center, etc.). +The samples to be processed may comprise all or a subset +of the training set. For small, relatively homogenous cohorts, +a single gCNV model is usually sufficient. If a cohort +contains multiple data sources, we recommend training a separate +model for each batch or group of batches with similar dosage +score (WGD). The model may be trained on all or a subset of +the samples to which it will be applied; a reasonable default +is 100 randomly-selected samples from the batch (the random +selection can be done as part of the workflow by specifying +a number of samples to the n_samples_subsample input +parameter in /wdl/TrainGCNV.wdl). 
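+
+For example, a minimal Cromwell inputs snippet for this subsampling might look like the
+following (assuming the workflow is invoked under the name `TrainGCNV`; adjust the key
+prefix to match how you run the WDL):
+
+```json
+{
+  "TrainGCNV.n_samples_subsample": 100
+}
+```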
+ +### Prerequisites + +- GatherSampleEvidence +- (Recommended) EvidenceQC + +### Inputs + +- Read count files (GatherSampleEvidence) + +### Outputs + +- Contig ploidy model tarball +- gCNV model tarballs \ No newline at end of file diff --git a/website/docs/modules/visualization.md b/website/docs/modules/visualization.md new file mode 100644 index 000000000..c6c71bf65 --- /dev/null +++ b/website/docs/modules/visualization.md @@ -0,0 +1,14 @@ +--- +title: Visualization +description: Visualization (work in progress) +sidebar_position: 14 +slug: viz +--- + +Visualize SVs with IGV screenshots and read depth plots. + +Visualization methods include: + +- RD Visualization - generate RD plots across all samples, ideal for visualizing large CNVs. +- IGV Visualization - generate IGV plots of each SV for individual sample, ideal for visualizing de novo small SVs. +- Module09.visualize.wdl - generate RD plots and IGV plots, and combine them for easy review. diff --git a/website/docs/run/_category_.json b/website/docs/run/_category_.json new file mode 100644 index 000000000..eb46f4d4c --- /dev/null +++ b/website/docs/run/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Run", + "position": 4, + "link": { + "type": "generated-index" + } +} diff --git a/website/docs/run/batching.md b/website/docs/run/batching.md new file mode 100644 index 000000000..271ba8a72 --- /dev/null +++ b/website/docs/run/batching.md @@ -0,0 +1,22 @@ +--- +title: Batching +description: Batching +slug: batching +--- + +For larger cohorts, samples should be split up into batches of about 100-500 +samples with similar characteristics. We recommend batching based on overall +coverage and dosage score (WGD), which can be generated in [EvidenceQC](/docs/modules/eqc). +An example batching process is outlined below: + +1. Divide the cohort into PCR+ and PCR- samples +2. Partition the samples by median coverage from [EvidenceQC](/docs/modules/eqc), + grouping samples with similar median coverage together. The end goal is to + divide the cohort into roughly equal-sized batches of about 100-500 samples; + if your partitions based on coverage are larger or uneven, you can partition + the cohort further in the next step to obtain the final batches. +3. Optionally, divide the samples further by dosage score (WGD) from + [EvidenceQC](/docs/modules/eqc), grouping samples with similar WGD score + together, to obtain roughly equal-sized batches of about 100-500 samples +4. Maintain a roughly equal sex balance within each batch, based on sex + assignments from [EvidenceQC](/docs/modules/eqc) diff --git a/website/docs/troubleshooting/_category_.json b/website/docs/troubleshooting/_category_.json new file mode 100644 index 000000000..9bcfbef40 --- /dev/null +++ b/website/docs/troubleshooting/_category_.json @@ -0,0 +1,7 @@ +{ + "label": "Troubleshooting", + "position": 5, + "link": { + "type": "generated-index" + } +} diff --git a/website/docs/troubleshooting/faq.md b/website/docs/troubleshooting/faq.md new file mode 100644 index 000000000..1cd431028 --- /dev/null +++ b/website/docs/troubleshooting/faq.md @@ -0,0 +1,37 @@ +--- +title: FAQ +slug: faq +--- + +### VM runs out of memory or disk + +- Default pipeline settings are tuned for batches of 100 samples. + Larger batches or cohorts may require additional VM resources. + Most runtime attributes can be modified through + the RuntimeAttr inputs. 
These are formatted like this in the json: + + ```json + "MyWorkflow.runtime_attr_override": { + "disk_gb": 100, + "mem_gb": 16 + }, + ``` + + Note that a subset of the struct attributes can be specified. + See `wdl/Structs.wdl` for available attributes. + +### Calculated read length causes error in MELT workflow + +Example error message from `GatherSampleEvidence.MELT.GetWgsMetrics`: + +> Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: +> The requested index 701766 is out of counter bounds. +> Possible cause of exception can be wrong READ_LENGTH +> parameter (much smaller than actual read length) + + +This error message was observed for a sample with an average +read length of 117, but for which half the reads were of length +90 and half were of length 151. As a workaround, override the +calculated read length by providing a `read_length` input of 151 +(or the expected read length for the sample in question) to `GatherSampleEvidence`. \ No newline at end of file diff --git a/website/src/components/HomepageFeatures/index.js b/website/src/components/HomepageFeatures/index.js index 9a4e82c3a..9c56bb32b 100644 --- a/website/src/components/HomepageFeatures/index.js +++ b/website/src/components/HomepageFeatures/index.js @@ -8,7 +8,7 @@ const FeatureList = [ Svg: require('@site/static/img/undraw_docusaurus_mountain.svg').default, description: ( <> - ... + Built and optimized for the Google Cloud Platform ), }, @@ -17,7 +17,9 @@ const FeatureList = [ Svg: require('@site/static/img/undraw_docusaurus_tree.svg').default, description: ( <> - ... + Used to produce high-quality SV call sets for large + scale sequencing initiatives such as the Genome + Aggregation Project (gnomAD) ), }, @@ -26,7 +28,8 @@ const FeatureList = [ Svg: require('@site/static/img/undraw_docusaurus_react.svg').default, description: ( <> - ... + Analyzes SV calls from multiple algorithms and evidence + signatures to achieve high sensitivity and precision ), }, diff --git a/website/src/pages/index.js b/website/src/pages/index.js index 7c711a71f..f0bc11a5d 100644 --- a/website/src/pages/index.js +++ b/website/src/pages/index.js @@ -18,7 +18,7 @@ function HomepageHeader() { - Quick Start - 5min ⏱️ + Quick Start From 0d6712848850d8075fb8087d614686f064953674 Mon Sep 17 00:00:00 2001 From: cwhelan Date: Thu, 2 Mar 2023 10:38:07 -0500 Subject: [PATCH 19/19] New RdTest mode for very large variants (#499) New genotyping mode for very large variants Sites over a given size are genotyped by taking the medians of an even subsampling of windows across the event. Also refactored error handling and extracted methods for readability. 
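In rough outline, the new mode summarizes a very large event by an even grid of windows and genotypes from the per-window, per-sample median depth rather than loading every coverage bin. A minimal, illustrative sketch of that idea follows (function and argument names here are invented for clarity; the actual implementation lives in `loadData` below and additionally drops excluded bins):

```r
# Sketch: summarize coverage over a very large CNV by evenly spaced windows.
# cov is a data.frame with columns chr, start, end, then one numeric column per sample.
subsample_window_medians <- function(cov, start, end, n_points = 500, window = 2000) {
  half <- window / 2
  spacing <- trunc((end - start) / n_points)
  centers <- seq(start + half, end - half, by = spacing)
  do.call(rbind, lapply(centers, function(p) {
    w <- cov[cov$end > (p - half) & cov$start < (p + half), , drop = FALSE]
    if (nrow(w) == 0) return(NULL)  # no coverage bins overlap this window
    # One summarized "bin" per window: the median depth of the bins it overlaps.
    cbind(data.frame(chr = w$chr[1], start = p - half, end = p + half),
          as.data.frame(lapply(w[, -(1:3), drop = FALSE], median)))
  }))
}
```

The resulting matrix then flows through the same rebinning and normalization steps as an ordinary-sized event.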
--- src/RdTest/RdTest.R | 348 ++++++++++++++++++++++++++++---------------- 1 file changed, 222 insertions(+), 126 deletions(-) diff --git a/src/RdTest/RdTest.R b/src/RdTest/RdTest.R index 1838cfa17..5557e03de 100755 --- a/src/RdTest/RdTest.R +++ b/src/RdTest/RdTest.R @@ -12,7 +12,15 @@ #--------------------------- #Print traceback on error -options(error = traceback) +options(warn = 2) +options(error = function() { + sink(stderr()) + on.exit(sink(NULL)) + traceback(3, max.lines = 1L) + if (!interactive()) { + q(status = 1) + } +}) #Loads required packages; installs if necessary RPackages <- c("optparse", "plyr", "MASS", "zoo","methods","metap", "e1071", "fpc", "BSDA", "DAAG", "pwr", "reshape", "perm", "hash") @@ -47,7 +55,7 @@ list <- structure(NA, class = "result") #Command line options -option_list = list( +option_list <- list( make_option(c("-b", "--bed"), type="character", default=NULL, help="Bed file of CNVs to check. No header. Locus ID as fourth column. SampleIDs of interest comma delimited as fifth column. CNVtype (DEL,DUP) as the sixth column", metavar="character"), make_option(c("-c", "--coveragefile"), type="character", default=NULL, @@ -88,18 +96,24 @@ option_list = list( help="Optional:Plot JPG visualization of copy state (requires -j TRUE if want to plot kmeans) . Default:FALSE", metavar="logical"), make_option(c("-s", "--sizefilter"), type="numeric", default=1000000, help="Optional:Restrict to large CNV to inner specified size Default:1000000", metavar="numeric"), + make_option(c("-x", "--verylargevariantsize"), type="numeric", default=2500000, + help="Optional:For CNVs over this size load entire coverage range with median window strategy Default:2500000", metavar="numeric"), + make_option("--verylargevariantpoints", type="numeric", default=500, + help="Optional:Number of windows to use for very large variant genotyping Default:500", metavar="numeric"), + make_option("--verylargevariantwindows", type="numeric", default=2000, + help="Optional:Size of windows to use for very large variant genotyping Default:2000", metavar="numeric"), make_option(c("-u", "--quartetDenovo"), type="logical", default=FALSE, help="Proband,Father,Mother, & Sib de novo analysis", metavar="logical"), make_option(c("-z", "--mosaicsep"), type="logical", default=FALSE, help="Optional:Change sep calculation to a maxium rather than medium for determing mosaic variants", metavar="logical"), - make_option(c("-l", "--Blacklist"), type="character", default=NULL, - help="Optional:Single column file with blacklist of samples to remove", metavar="character"), - make_option(c("-w", "--Whitelist"), type="character", default=NULL, - help="Optional:Single column file with whitelist of samples to include", metavar="character") -); + make_option(c("-l", "--SampleExcludeList"), type="character", default=NULL, + help="Optional:Single column file with list of samples to exclude", metavar="character"), + make_option(c("-w", "--SampleIncludeList"), type="character", default=NULL, + help="Optional:Single column file with list of samples to include", metavar="character") +) -opt_parser = OptionParser(option_list = option_list) -opt = parse_args(opt_parser) +opt_parser <- OptionParser(option_list = option_list) +opt <- parse_args(opt_parser) ##QC check, see file inputs exist and are formated correctly and edit if neccessary## @@ -196,53 +210,176 @@ rebin <- function(df, compression) { return(as.data.frame(cbind(Chr, Start, End, newvals))) } +fillGapsInCoverageMatrixWithZeroCountBins <- function (cov1, BinSize, chromosome) +{ + 
##Fill in any gaps between coverage bins with zero-count rows## + if (nrow(cov1) > 1) { + gapLengths <- sapply(2:nrow(cov1), function(i) { cov1$start[i] - cov1$end[i-1]}) + if (any(gapLengths) > 0) { + gapStarts <- cov1$end[which(gapLengths > 0)] + gapEnds <- cov1$start[which(gapLengths > 0)+1] + zeroBinStarts <- unlist(lapply(1:length(gapStarts), function(i) { seq(gapStarts[i], gapEnds[i]-1, by=BinSize) })) + zeroBinEnds <- unlist(lapply(1:length(gapEnds), + function(i) { + c( if (gapStarts[i] + BinSize < gapEnds[i]) { seq(gapStarts[i] + BinSize, gapEnds[i]-1, by=BinSize) }, gapEnds[i]) + })) + column_start = matrix(zeroBinStarts, ncol = 1) + column_end = matrix(zeroBinEnds, ncol = 1) + ncov_col = ncol(cov1) + null_model <- + cbind(chromosome, column_start, column_end, matrix(rep(0, times = nrow(column_start) * + (ncov_col - 3)), ncol = ncov_col - 3)) + colnames(null_model) <- colnames(cov1) + + cov1 <- rbind(cov1, null_model) + + ##Use sapply to convert files to numeric only more than one column in cov1 matrix. If not matrix will already be numeric## + if (nrow(cov1) > 1) { + cov1[2:length(names(cov1))] <- lapply(cov1[2:length(names(cov1))], as.numeric) + } else {cov1<-data.frame(t(sapply(cov1,as.numeric)),check.names=FALSE)} + cov1 <- cov1[order(cov1[, 2]), ] + } + } + return(cov1) +} + + +applySampleIncludeAndExcludeLists <- function(cov1, allnorm, SampleExcludeListFile, SampleIncludelistFile) +{ + #Samples Filter + if ( !is.null(SampleExcludeListFile) ) { + samplesExcludeList <- readLines(SampleExcludeListFile) + IDsSamplesExcludelistButNotCoverage <- + samplesExcludeList[!(samplesExcludeList %in% names(cov1))] + if (length(IDsSamplesExcludelistButNotCoverage) > 0) + { + cat ( + " WARNING: IDs in samplesExcludeList but not coverage:", IDsSamplesExcludelistButNotCoverage + ) + } + ##Filter samples based of specified exclude list here## + cov1 <- cov1[,!(names(cov1) %in% samplesExcludeList)] + allnorm <- + allnorm[,!(names(allnorm) %in% samplesExcludeList)] + } + + ##Allow include list## + if (!is.null(SampleIncludelistFile)) { + samplesIncludeList <- readLines(SampleIncludelistFile) + IDsSamplesIncludelistButNotCoverage <- + samplesIncludeList[!(samplesIncludeList %in% names(cov1))] + if (length(IDsSamplesIncludelistButNotCoverage) > 0) + { + cat ( + " WARNING: IDs in samplesIncludeList but not coverage:", + IDsSamplesIncludelistButNotCoverage + ) + } + ##make sure to still include first three lines### + cov1 <- cov1[,names(cov1) %in% c("chr","start","end",samplesIncludeList)] + allnorm <- + allnorm[, (names(allnorm) %in% samplesIncludeList)] + } + return(list(cov1, allnorm)) +} #Reads coverage of specific queried region and compresses to a reasonable number of bins to create a region coverage matrix #sampleIDs is comma specficed list of samples## -loadData <- function(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,bins,poorbincov=NULL) +removeExcludedBinCovBins <- function(chr, cov1, end, poorbincov, start) { + if (!is.null(poorbincov)) { + intervalfile = poorbincov + ##take off 10% on each of cnv for more accurate check for depth## + start10 <- round(start + ((end - start) * 0.10)) + end10 <- round(end - ((end - start) * 0.10)) + cov_exclude <- cov1[which(cov1[, 3] <= start10 | cov1[, 2] >= end10),] + + ##pull outcoord that fail## + file.length <- tryCatch(read.table(pipe(paste("tabix -h ", intervalfile, " ", chr, ":", start10, "-", end10, sep = "")), sep = "\t"), error = function(e) NULL) + + passing_int <- c(paste(cov_exclude[, 1], "_", cov_exclude[, 2], "_", 
cov_exclude[, 3], sep = ""), as.character(file.length[, 4])) + + #don't include poor region filter but still shave 10%### + passing_int_noregion <- c(paste(cov_exclude[, 1], "_", cov_exclude[, 2], "_", cov_exclude[, 3], sep = "")) + + ##remove failing bins from coverage file## + cov2 <- cov1[-which(paste(cov1[, 1], "_", cov1[, 2], "_", cov1[, 3], sep = "") %in% passing_int),] + cov3 <- cov1[-which(paste(cov1[, 1], "_", cov1[, 2], "_", cov1[, 3], sep = "") %in% passing_int_noregion),] + + ##must have at least 10 bins after filtering or exclude## + if (nrow(cov2) > 9) { + cov1 <- cov2 + } else if (nrow(cov3) > 9) { + cov1 <- cov3 + } + } + return(cov1) +} + +loadData <- function(chr, start, end, coveragefile, medianfile, bins, verylargevariantsize, + vlRegionPoints, vlWindow, SampleExcludeList, SampleIncludeList, poorbincov=NULL, raw_cov=NULL) { - #Take the coverage matrix header and tabix query the region in the .gz coverage matrix - cov1 <-tryCatch(read.table(pipe(paste("tabix -h ",coveragefile," ", chr, ":", start, "-", end, " | sed 's/^#//'|sed 's/Start/start/g'|sed 's/Chr/chr/g'|sed 's/End/end/g'", sep = "")),sep = "\t", header = TRUE, check.names = FALSE), error=function(e) NULL) - #Load plotting values if median coverage file generated by bincov## - allnorm <- read.table(medianfile, header = TRUE, check.names = FALSE) - ##remove when start or end pull in extra tabix line## - cov1<-cov1[cov1$start!=end,] - cov1<-cov1[cov1$end!=start,] + if (is.null(raw_cov)) { + if (end - start > verylargevariantsize) { + print(paste0("using very large event subsampling approach for ", chr,":",start,"-",end)) + regionPoints <- vlRegionPoints + window <- vlWindow / 2 + pointSpacing <- trunc((end - start) / regionPoints) + points <- seq(start + window, end - window, by=pointSpacing) + pointBed <- data.frame(chr=chr, start=points - window, end=points+window+1) + #pointFile <- tempfile(pattern=cnvID) + pointFile <- "regions.bed" + write.table(pointBed, file=pointFile, quote=FALSE, sep="\t", row.names=FALSE, col.names=FALSE) + cov1 <- tryCatch( + read.table(pipe( + paste("bedtools merge -i ", pointFile, + " | tabix -h ",coveragefile," -R - | sed 's/^#//'|sed 's/Start/start/g'|sed 's/Chr/chr/g'|sed 's/End/end/g'", sep = "")),sep = "\t", header = TRUE, check.names = FALSE), error=function(e) NULL) + + + cov1 <- removeExcludedBinCovBins(chr, cov1, end, poorbincov, start) + + cov1 <- ldply(seq(1,regionPoints), function(i) { + windowCov <- cov1[cov1$end > pointBed[i,"start"] & cov1$start < pointBed[i,"end"],] + if (nrow(windowCov) > 0) { + covMeds <- data.frame(lapply(windowCov[,4:ncol(windowCov)], median), check.names=FALSE) + medianWindowCov <- cbind(data.frame(chr=chr, start=pointBed[i, "start"], end=pointBed[i, "end"]), covMeds) + return(medianWindowCov) + } else { + return(NULL) + } + }) + + } else { + #Take the coverage matrix header and tabix query the region in the .gz coverage matrix + cov1 <-read.table(pipe(paste("tabix -h ",coveragefile," ", chr, ":", start, "-", end, " | sed 's/^#//'|sed 's/Start/start/g'|sed 's/Chr/chr/g'|sed 's/End/end/g'", sep = "")),sep = "\t", header = TRUE, check.names = FALSE) + ##remove when start or end pull in extra tabix line## + cov1<-cov1[cov1$start!=end,] + cov1<-cov1[cov1$end!=start,] + + #Find window bin size + BinSize <- median(cov1$end - cov1$start) + + cov1 <- fillGapsInCoverageMatrixWithZeroCountBins(cov1, BinSize, chr) + + } + + raw_coverage <- cov1 + } else { + # reuse previously loaded raw_cov as cov1 matrix + cov1 <- raw_cov + raw_coverage <- raw_cov + } + + 
#Check if no data if (nrow(cov1) < 1) { - return("Failure") - } - #Find window bin size - BinSize <- median(cov1$end - cov1$start) - ##Fill in any gaps between coverage bins with zero-count rows## - if (nrow(cov1) > 1) { - gapLengths <- sapply(2:nrow(cov1), function(i) { cov1$start[i] - cov1$end[i-1]}) - if (any(gapLengths) > 0) { - gapStarts <- cov1$end[which(gapLengths > 0)] - gapEnds <- cov1$start[which(gapLengths > 0)+1] - zeroBinStarts <- unlist(lapply(1:length(gapStarts), function(i) { seq(gapStarts[i], gapEnds[i]-1, by=BinSize) })) - zeroBinEnds <- unlist(lapply(1:length(gapEnds), - function(i) { - c( if (gapStarts[i] + BinSize < gapEnds[i]) { seq(gapStarts[i] + BinSize, gapEnds[i]-1, by=BinSize) }, gapEnds[i]) - })) - column_start = matrix(zeroBinStarts, ncol = 1) - column_end = matrix(zeroBinEnds, ncol = 1) - ncov_col = ncol(cov1) - null_model <- - cbind(chr, column_start, column_end, matrix(rep(0, times = nrow(column_start) * - (ncov_col - 3)), ncol = ncov_col - 3)) - colnames(null_model) <- colnames(cov1) - - cov1 <- rbind(cov1, null_model) - - ##Use sapply to convert files to numeric only more than one column in cov1 matrix. If not matrix will already be numeric## - if (nrow(cov1) > 1) { - cov1 <- data.frame(sapply(cov1, as.numeric), check.names = FALSE) - } else {cov1<-data.frame(t(sapply(cov1,as.numeric)),check.names=FALSE)} - cov1 <- cov1[order(cov1[, 2]), ] - } + return(list(cnv_matrix="Failure", raw_cov=raw_coverage)) } + + #Load plotting values if median coverage file generated by bincov## + allnorm <- read.table(medianfile, header = TRUE, check.names = FALSE) + + #Round down the number of used bins events for smaller events (e.g at 100 bp bins can't have 10 bins if event is less than 1kb) startAdjToInnerBinStart <- if (any(cov1$start >= start)) { cov1$start[which(cov1$start >= start)[1]] } else { min(cov1$start) } endAdjToInnerBinEnd <- if (any(cov1$end <= end)) { cov1$end[max(which(cov1$end <= end))] } else { max(cov1$end) } @@ -270,9 +407,9 @@ loadData <- function(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,b # Number of bins that need to be removed RemainderForRemoval <- ##Need to account for round error by trunc so add the decimal#### - trunc((( - UnadjustedBins - trunc(UnadjustedBins) - ) * bins / 2) + 0.000000001) + trunc((( + UnadjustedBins - trunc(UnadjustedBins) + ) * bins / 2) + 0.000000001) BinsToRemoveFromFront <- round_any(RemainderForRemoval, 1, floor) BinsToRemoveFromBack <- @@ -293,45 +430,15 @@ loadData <- function(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,b #Cut bins down to those required for compressed clean size based on Rstart and Rend## cov1<-cov1[which(cov1[,3]>Rstart & cov1[,2] 0) - { - cat ( - " WARNING: IDs in samplesBlacklist but not coverage:", - IDsSamplesBlacklistButNotCoverage - ) - } - ##Filter samples based of specified blacklist here## - cov1 <- cov1[,!(names(cov1) %in% samplesBlacklist)] - allnorm <- - allnorm[,!(names(allnorm) %in% samplesBlacklist)] - } - - ##Allow whitelist## - if (!is.null(opt$Whitelist)) { - samplesWhitelist <- readLines(opt$Whitelist) - IDsSamplesWhitelistButNotCoverage <- - samplesWhitelist[!(samplesWhitelist %in% names(cov1))] - if (length(IDsSamplesWhitelistButNotCoverage) > 0) - { - cat ( - " WARNING: IDs in samplesWhitelist but not coverage:", - IDsSamplesWhitelistButNotCoverage - ) - } - ##make sure to still include first three lines### - cov1 <- cov1[,names(cov1) %in% c("chr","start","end",samplesWhitelist)] - allnorm <- - allnorm[, (names(allnorm) %in% samplesWhitelist)] - } 
+ sampleFilterResult <- applySampleIncludeAndExcludeLists(cov1, allnorm, SampleExcludeList, SampleIncludeList) + cov1 <- sampleFilterResult[[1]] + allnorm <- sampleFilterResult[[2]] + if (ncol(cov1) < 4) { stop (" WARNING: All samples excluded by filtering") - } + } + #Approximates rebinned per-sample medians (approximated for speed & memory) allnorm[which(allnorm == 0)] <- 1 allnorm <- compression * allnorm @@ -340,31 +447,8 @@ loadData <- function(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,b cov1[cov1 == 0] <- 1 ##restrict bins to those with unique mapping## - if (!is.null(poorbincov)) { - intervalfile=poorbincov - ##take off 10% on each of cnv for more accurate check for depth## - start10<-round(start+((end-start)*0.10)) - end10<-round(end-((end-start)*0.10)) - cov_exclude<-cov1[which(cov1[,3]<=start10 | cov1[,2]>=end10),] - - ##pull outcoord that fail## - file.length<-tryCatch(read.table(pipe(paste("tabix -h ",intervalfile ," ", chr, ":", start10, "-", end10, sep = "")),sep = "\t"), error=function(e) NULL) + cov1 <- removeExcludedBinCovBins(chr, cov1, end, poorbincov, start) - passing_int<-c(paste(cov_exclude[,1],"_",cov_exclude[,2],"_",cov_exclude[,3],sep=""),as.character(file.length[,4])) - - #don't include poor region filter but still shave 10%### - passing_int_noregion<-c(paste(cov_exclude[,1],"_",cov_exclude[,2],"_",cov_exclude[,3],sep="")) - - ##remove failing bins from coverage file## - cov2<-cov1[-which(paste(cov1[,1],"_",cov1[,2],"_",cov1[,3],sep="") %in% passing_int),] - cov3<-cov1[-which(paste(cov1[,1],"_",cov1[,2],"_",cov1[,3],sep="") %in% passing_int_noregion),] - ##must have at least 10 bins after filtering or exclude## - if (nrow(cov2) >9) { - cov1<-cov2 - } else if (nrow(cov3) >9) { - cov1<-cov3 - } - } #Rebins values if (compression > 1) { @@ -393,7 +477,7 @@ loadData <- function(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,b } else { cnv_matrix <- as.matrix(res1) } - return(cnv_matrix) + return(list(cnv_matrix=cnv_matrix, raw_cov=raw_coverage)) } #Loads specified sample set in genotyping matrix based on the specified cnv type (del=1,dup=3) and unspecified samples as cn=2 @@ -406,7 +490,7 @@ specified_cnv <- function(cnv_matrix, sampleIDs, cnvID, chr, start, end, cnvtype samplenames <- colnames(as.matrix(genotype_matrix)) columnswithsamp <- which(colnames(genotype_matrix) %in% unlist(strsplit(as.character(sampleIDs),split=","))) if (length(columnswithsamp)==0) { - ##"WARNING: No samples in coverage matrix for comparision check black/whitelist"## + ##"WARNING: No samples in coverage matrix for comparision check exclude/include lists"## return ("No_Samples") } @@ -1014,8 +1098,9 @@ runRdTest<-function(bed) assign(names,unname(unlist(opt[names]))) } #Speed up large cnvs by taking inner range of laregest desired size - if (end - start > sizefilter) - { + if (end - start > verylargevariantsize) { + cat(paste(chr,":",start,"-",end,":Using new large event subsampling\n",sep="")) + } else if (end - start > sizefilter) { cat(paste(chr,":",start,"-",end,":Large size so subsampling in middle\n",sep="")) center=(start + end) / 2 start = round(center - (sizefilter/2)) @@ -1031,22 +1116,31 @@ runRdTest<-function(bed) ##Get Intesity Data## if (exists("poorbincov")) { - cnv_matrix<-loadData(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,bins,poorbincov) + loadResult <-loadData(chr, start, end, coveragefile,medianfile,bins, + verylargevariantsize, verylargevariantpoints, verylargevariantwindows, opt$SampleExcludeList, + 
opt$SampleIncludeList,poorbincov) + cnv_matrix <- loadResult[["cnv_matrix"]] + raw_cov <- loadResult[["raw_cov"]] } else { - cnv_matrix<-loadData(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,bins) + loadResult<-loadData(chr, start, end, coveragefile,medianfile,bins, + verylargevariantsize, verylargevariantpoints, verylargevariantwindows, opt$SampleExcludeList, + opt$SampleIncludeList) + cnv_matrix <- loadResult[["cnv_matrix"]] + raw_cov <- loadResult[["raw_cov"]] + } if (cnv_matrix[1]=="Failure") { ##assign genotype if no coverage## - if (opt$rungenotype == TRUE && !is.null(opt$Whitelist)) { - samplesWhitelist <- readLines(opt$Whitelist) + if (opt$rungenotype == TRUE && !is.null(opt$SampleIncludeList)) { + samplesIncludeList <- readLines(opt$SampleIncludeList) ##make dots to indicate missing genotype or GQ## - dotlist<- samplesWhitelist + dotlist<- samplesIncludeList dotlist[1:length(dotlist)]<-"." - ##write out cnv medians for each sample (requires whitelist)## + ##write out cnv medians for each sample (requires include list)## if(!file.exists(paste(outFolder,outputname,".median_geno",sep=""))) { ##write header## - write.table(matrix(c("chr","start","end","cnvID",samplesWhitelist),nrow=1),paste(outFolder, outputname, ".median_geno", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") + write.table(matrix(c("chr","start","end","cnvID",samplesIncludeList),nrow=1),paste(outFolder, outputname, ".median_geno", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") } write.table(matrix(c(chr, start, end, cnvID,dotlist),nrow=1),paste(outFolder, outputname, ".median_geno", sep = ""), quote = FALSE,col.names = FALSE, row.names = FALSE,append=TRUE,sep= "\t") @@ -1054,7 +1148,7 @@ runRdTest<-function(bed) ##write GQ## if(!file.exists(paste(outFolder,outputname,".gq",sep=""))) { ##write header## - write.table(matrix(c("chr","start","end","cnvID",samplesWhitelist),nrow=1),paste(outFolder, outputname, ".gq", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") + write.table(matrix(c("chr","start","end","cnvID",samplesIncludeList),nrow=1),paste(outFolder, outputname, ".gq", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") } write.table(matrix(c(chr, start, end, cnvID,dotlist),nrow=1),paste(outFolder, outputname, ".gq", sep = ""), quote = FALSE,col.names = FALSE, row.names = FALSE,append=TRUE,sep= "\t") @@ -1071,7 +1165,7 @@ runRdTest<-function(bed) if(!file.exists(paste(outFolder,outputname,".geno",sep=""))) { ##write header## - write.table(matrix(c("chr","start","end","cnvID",samplesWhitelist),nrow=1),paste(outFolder, outputname, ".geno", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") + write.table(matrix(c("chr","start","end","cnvID",samplesIncludeList),nrow=1),paste(outFolder, outputname, ".geno", sep = ""),quote=FALSE,row.names=FALSE,col.names=FALSE,sep= "\t") } write.table(matrix(c(chr, start, end, cnvID,dotlist),nrow=1),paste(outFolder, outputname, ".geno", sep = ""), quote = FALSE,col.names = FALSE, row.names = FALSE,append=TRUE,sep= "\t") @@ -1079,7 +1173,7 @@ runRdTest<-function(bed) return(c(chr,start,end,cnvID,sampleOrigIDs,cnvtypeOrigIDs,"coverage_failure","coverage_failure","coverage_failure","coverage_failure","coverage_failure","coverage_failure")) } - ##remove black or white list samples from sampleIDs### + ##remove excluded or included samples from sampleIDs lists### idsforsearch<-rownames(cnv_matrix) samplestokeep<-match(unlist(strsplit(sampleIDs,",")),idsforsearch) 
sampleIDs<-idsforsearch[na.omit(samplestokeep)] @@ -1101,7 +1195,9 @@ runRdTest<-function(bed) ##genotype and write to file## if (opt$rungenotype == TRUE) { ##Compress x-axis to 10 bins so it is easier to view### - plot_cnvmatrix<-loadData(chr, start, end, cnvID, sampleIDs,coveragefile,medianfile,bins=10) + plot_cnvmatrix<-loadData(chr, start, end, coveragefile, medianfile, + verylargevariantsize, verylargevariantpoints, verylargevariantwindows, opt$SampleExcludeList, + opt$SampleIncludeList,bins=10,raw_cov=raw_cov)[["cnv_matrix"]] genotype(cnv_matrix,genotype_matrix,refgeno,chr,start,end,cnvID,sampleIDs,cnvtype,outFolder,outputname,plot_cnvmatrix) }