Commit f175e61
Merge remote-tracking branch 'upstream/main' into workflow_bug2
VJalili committed Jun 27, 2023
2 parents ddd4ce3 + 0be9cf8 commit f175e61
Showing 126 changed files with 6,049 additions and 5,242 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pytest.yaml
@@ -11,9 +11,9 @@ jobs:
    name: Linting
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: 3.8
      - name: Install dependencies
62 changes: 23 additions & 39 deletions .github/workflows/sv_pipeline_docker.yml
@@ -30,7 +30,7 @@ jobs:
      image_tag: ${{ steps.image_tag.outputs.IMAGE_TAG }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # By default, this checks out only the current commit;
          # however, since a diff between the current commit and
@@ -79,8 +79,8 @@ jobs:
echo "::debug::HEAD_SHA: $HEAD_SHA"
# Avail the determined commit SHAs to other steps.
echo "::set-output name=BASE_SHA::$BASE_SHA"
echo "::set-output name=HEAD_SHA::$HEAD_SHA"
echo "BASE_SHA=$BASE_SHA" >> $GITHUB_OUTPUT
echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_OUTPUT
      - name: Compose Image Tag
        id: image_tag
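The change above follows GitHub's deprecation of the `::set-output` workflow command in favor of the `$GITHUB_OUTPUT` environment file. A minimal sketch of the migration pattern (the step id and variable name are illustrative, not taken from this workflow):

```bash
# Deprecated form, written to stdout:
#   echo "::set-output name=MY_VAR::some-value"
# Current form: append KEY=VALUE to the file GitHub exposes via $GITHUB_OUTPUT.
echo "MY_VAR=some-value" >> "$GITHUB_OUTPUT"
# A later step can then read ${{ steps.<step_id>.outputs.MY_VAR }}.
```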
@@ -106,7 +106,7 @@ jobs:
          IMAGE_TAG=$DATE-$RELEASE_TAG-${COMMIT_SHA::8}
          echo "::debug::Image tag: $IMAGE_TAG"
-          echo "::set-output name=IMAGE_TAG::$IMAGE_TAG"
+          echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_OUTPUT
  build_job:
    runs-on: ubuntu-20.04
@@ -118,7 +118,7 @@ jobs:
        python-version: ['3.8']
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # See the comment on build_args_job.
          fetch-depth: 0
@@ -128,11 +128,6 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}

-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install termcolor
      - name: Run build_docker.py
        run: |
          cd ./scripts/docker/
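The rest of this step is collapsed in this view. As a rough sketch only, assembled from the flags that appear later in this diff (the SHA values and tag are placeholders), a local invocation of the script might look like:

```bash
cd ./scripts/docker/
python build_docker.py \
    --base-git-commit 1111111 \
    --current-git-commit 2222222 \
    --image-tag my-test-tag
```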
@@ -160,22 +155,24 @@ jobs:
      DOCKERS_GCP: "./inputs/values/dockers.json"
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # See the comment on build_args_job.
          fetch-depth: 0
          # Authenticates git using the bot's access token.
          token: ${{ secrets.BOT_PAT }}

      - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install termcolor
+      - name: Azure login
+        uses: azure/docker-login@v1
+        with:
+          login-server: ${{ secrets.AZ_CR }}
+          username: ${{ secrets.AZ_USERNAME }}
+          password: ${{ secrets.AZ_PASSWORD }}

      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@…
@@ -206,39 +203,21 @@ jobs:
            sudo mv "$tmp" /etc/docker/daemon.json
            sudo systemctl restart docker.service
-      - name: Build and Publish Docker Images to GCR
+      - name: Build and Publish Docker Images to ACR & GCR
        id: build_and_publish
        run: |
          python ./scripts/docker/build_docker.py \
            --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \
            --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \
            --docker-repo us.gcr.io/${{ secrets.GCP_PROJECT_ID }}/gatk-sv \
            --image-tag ${{ needs.build_args_job.outputs.image_tag }} \
-            --input-json $DOCKERS_GCP \
-            --output-json $DOCKERS_GCP
+            --input-json $DOCKERS_AZURE $DOCKERS_GCP \
+            --output-json $DOCKERS_AZURE $DOCKERS_GCP \
+            --disable-git-protect
          CHANGED=$(git diff --quiet $DOCKERS_GCP || echo True)
          echo "::set-output name=CHANGED::$CHANGED"
-      - name: Azure login
-        uses: azure/docker-login@v1
-        with:
-          login-server: ${{ secrets.AZ_CR }}
-          username: ${{ secrets.AZ_USERNAME }}
-          password: ${{ secrets.AZ_PASSWORD }}
-
-      - name: Build and Publish Docker Images to ACR
-        run: |
-          python ./scripts/docker/build_docker.py \
-            --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \
-            --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \
-            --docker-repo ${{ secrets.AZ_CR }} \
-            --image-tag ${{ needs.build_args_job.outputs.image_tag }} \
-            --input-json $DOCKERS_AZURE \
-            --output-json $DOCKERS_AZURE \
-            --prune-after-each-image \
-            --disable-git-protect
      - name: Commit Changes to dockers_*.json
        if: steps.build_and_publish.outputs.CHANGED
        run: |
@@ -247,4 +226,9 @@ jobs:
          git config --global user.email '[email protected]'
          git commit $DOCKERS_AZURE $DOCKERS_GCP -m "Update docker images list, triggered by "${COMMIT_SHA::8}
          git pull --rebase origin main
-          git push
+          # In the following, force-push is required when the above rebase updates the branch;
+          # otherwise, the push will be rejected with the following error:
+          # > Updates were rejected because the tip of your current branch is behind its remote counterpart.
+          # See this thread for details: https://stackoverflow.com/q/39399804
+          git push -f
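A possible refinement, not part of this commit: `git push --force-with-lease` also recovers from the rebase, but refuses to overwrite remote commits it has not seen, which is safer than a bare `-f` if anything else pushes to the branch:

```bash
git pull --rebase origin main
git push --force-with-lease
```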
2 changes: 1 addition & 1 deletion .github/workflows/testwdls.yaml
@@ -24,7 +24,7 @@ jobs:
        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

19 changes: 9 additions & 10 deletions README.md
@@ -40,6 +40,10 @@ A structural variation discovery pipeline for Illumina short-read whole-genome s
* [Cromwell](https://github.com/broadinstitute/cromwell) (v36 or higher). A dedicated server is highly recommended.
* or [Terra](https://terra.bio/) (note preconfigured GATK-SV workflows are not yet available for this platform)
* Recommended: [MELT](https://melt.igs.umaryland.edu/). Due to licensing restrictions, we cannot provide a public docker image or reference panel VCFs for this algorithm.
+* Recommended: [Manta](https://github.com/Illumina/manta). Calls SVs and indels from mapped paired-end reads, based on split-read and discordant read-pair evidence.
+* Recommended: [Wham](https://github.com/zeeev/wham). Used to predict SV breakpoints after integrating all evidence.
+* Recommended: [cn.MOPS](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3351174/). Used to detect CNVs from variations in read depth using a mixture of Poisson models.
+* Recommended: [gatk-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152). Detects germline CNVs from variations in read depth.
* Recommended: [cromshell](https://github.com/broadinstitute/cromshell) for interacting with a dedicated Cromwell server.
* Recommended: [WOMtool](https://cromwell.readthedocs.io/en/stable/WOMtool/) for validating WDL/json files.

@@ -123,11 +127,6 @@ apply, users may use placeholder values for the cloud configuration and simply d

The input values are provided only as an example and are not publicly accessible. In order to include MELT, these values must be provided by the user. MELT can be disabled by deleting these inputs and setting `GATKSVPipelineBatch.use_melt` to `false`.

-#### Requester pays buckets
-**Important**: The following parameters must be set when certain input data is in requester pays (RP) buckets:
-
-* `GATKSVPipelineSingleSample.requester_pays_cram` and `GATKSVPipelineBatch.GatherSampleEvidenceBatch.requester_pays_crams` - set to `True` if inputs are CRAM format and in an RP bucket, otherwise `False`.
-
#### Execution
We recommend running the pipeline on a dedicated [Cromwell](https://github.com/broadinstitute/cromwell) server with a [cromshell](https://github.com/broadinstitute/cromshell) client. A batch run can be started with the following commands:
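The commands themselves are collapsed in this view; the following is an illustrative sketch only, with placeholder file names rather than the collapsed originals:

```bash
# Validate the workflow, then submit it to the Cromwell server.
java -jar womtool.jar validate wdl/GATKSVPipelineBatch.wdl
cromshell submit wdl/GATKSVPipelineBatch.wdl /path/to/GATKSVPipelineBatch.json
cromshell status    # poll the status of the submitted workflow
```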

@@ -202,7 +201,7 @@ For larger cohorts, samples should be split up into batches of about 100-500 sam
`GATKSVPipelineSingleSample.wdl` runs the pipeline on a single sample using a fixed reference panel. An example run with a reference panel containing 156 samples from the [NYGC 1000G Terra workspace](https://app.terra.bio/#workspaces/anvil-datastorage/1000G-high-coverage-2019) can be found in `inputs/build/NA12878/test` after [building inputs](#building-inputs).

## <a name="gcnv-training-overview">gCNV Training</a>
-Both the cohort and single-sample modes use the GATK gCNV depth calling pipeline, which requires a [trained model](#gcnv-training) as input. The samples used for training should be technically homogeneous and similar to the samples to be processed (i.e. same sample type, library prep protocol, sequencer, sequencing center, etc.). The samples to be processed may comprise all or a subset of the training set. For small, relatively homogenous cohorts, a single gCNV model is usually sufficient. If a cohort contains multiple data sources, we recommend training a separate model for each [batch](#batching) or group of batches with similar dosage score (WGD). The model may be trained on all or a subset of the samples to which it will be applied; a reasonable default is 100 randomly-selected samples from the batch (the random selection can be done as part of the workflow by specifying a number of samples to the `n_samples_subsample` input parameter in `/wdl/TrainGCNV.wdl`).
+Both the cohort and single-sample modes use the [GATK-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152) depth calling pipeline, which requires a [trained model](#gcnv-training) as input. The samples used for training should be technically homogeneous and similar to the samples to be processed (i.e. same sample type, library prep protocol, sequencer, sequencing center, etc.). The samples to be processed may comprise all or a subset of the training set. For small, relatively homogeneous cohorts, a single gCNV model is usually sufficient. If a cohort contains multiple data sources, we recommend training a separate model for each [batch](#batching) or group of batches with similar dosage score (WGD). The model may be trained on all or a subset of the samples to which it will be applied; a reasonable default is 100 randomly-selected samples from the batch (the random selection can be done as part of the workflow by specifying a number of samples to the `n_samples_subsample` input parameter in `/wdl/TrainGCNV.wdl`).
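For illustration, the subsampling mentioned above can be requested through the workflow inputs. A minimal sketch, assuming a `TrainGCNV.` input prefix and placeholder file names (neither is taken from this README):

```bash
# Train the gCNV model on 100 randomly selected samples from the batch.
cat > train_gcnv.inputs.json <<'EOF'
{
  "TrainGCNV.n_samples_subsample": 100
}
EOF
cromshell submit wdl/TrainGCNV.wdl train_gcnv.inputs.json
```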

## <a name="reference-panel-generation">Generating a reference panel</a>
New reference panels can be generated easily from a single run of the `GATKSVPipelineBatch` workflow. If using a Cromwell server, we recommend copying the outputs to a permanent location by adding the following option to the workflow configuration file:
@@ -247,7 +246,7 @@ The following sections briefly describe each module and highlight inter-depende
## <a name="gather-sample-evidence">GatherSampleEvidence</a>
*Formerly Module00a*

-Runs raw evidence collection on each sample with the following SV callers: Manta, Wham, and/or MELT. For guidance on pre-filtering prior to `GatherSampleEvidence`, refer to the [Sample Exclusion](#sample-exclusion) section.
+Runs raw evidence collection on each sample with the following SV callers: [Manta](https://github.com/Illumina/manta), [Wham](https://github.com/zeeev/wham), and/or [MELT](https://melt.igs.umaryland.edu/). For guidance on pre-filtering prior to `GatherSampleEvidence`, refer to the [Sample Exclusion](#sample-exclusion) section.

Note: a list of sample IDs must be provided. Refer to the [sample ID requirements](#sampleids) for specifications of allowable sample IDs. IDs that do not meet these requirements may cause errors.

@@ -292,7 +291,7 @@ The purpose of sample filtering at this stage after EvidenceQC is to prevent ver


## <a name="gcnv-training">TrainGCNV</a>
-Trains a gCNV model for use in [GatherBatchEvidence](#gather-batch-evidence). The WDL can be found at `/wdl/TrainGCNV.wdl`. See the [gCNV training overview](#gcnv-training-overview) for more information.
+Trains a [gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152) model for use in [GatherBatchEvidence](#gather-batch-evidence). The WDL can be found at `/wdl/TrainGCNV.wdl`. See the [gCNV training overview](#gcnv-training-overview) for more information.

#### Prerequisites:
* [GatherSampleEvidence](#gather-sample-evidence)
@@ -309,7 +308,7 @@ Trains a gCNV model for use in [GatherBatchEvidence](#gather-batch-evidence). Th
## <a name="gather-batch-evidence">GatherBatchEvidence</a>
*Formerly Module00c*

-Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching.
+Runs CNV callers ([cn.MOPS](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3351174/), [GATK-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152)) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching.

#### Prerequisites:
* [GatherSampleEvidence](#gather-sample-evidence)
@@ -492,7 +491,7 @@ gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/mingq/1KGP_2
Add annotations, such as the inferred function and allele frequencies of variants, to the final VCF.

Annotation methods include:
-* Functional annotation - annotate SVs with inferred functional consequence on protein-coding regions, regulatory regions such as UTRs and promoters, and other non-coding elements.
+* Functional annotation - the GATK tool [SVAnnotate](https://gatk.broadinstitute.org/hc/en-us/articles/13832752531355-SVAnnotate) is used to annotate SVs with inferred functional consequence on protein-coding regions, regulatory regions such as UTRs and promoters, and other non-coding elements.
* Allele frequency annotation - annotate SVs with their allele frequencies across all samples, as well as within samples of a specific sex or sub-population.
* Allele frequency annotation with an external callset - annotate SVs with the allele frequencies of their overlapping SVs in another callset, e.g. the gnomAD SV callset.
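As a hedged sketch of invoking the functional-annotation step directly (the argument names are assumptions based on the SVAnnotate documentation linked above, and the file names are placeholders):

```bash
gatk SVAnnotate \
    -V cleaned.vcf.gz \
    --protein-coding-gtf gencode.v39.annotation.gtf.gz \
    -O annotated.vcf.gz
```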

5 changes: 3 additions & 2 deletions dockerfiles/melt/Dockerfile
@@ -1,4 +1,5 @@
-FROM ubuntu:18.04
+ARG SVBASE_IMAGE=sv-base:latest
+FROM $SVBASE_IMAGE

# Dependencies, including BOWTIE_2 and OpenJRE
ARG BOWTIE2_RELEASE=2.3.4.3
@@ -43,7 +44,7 @@ ARG MELT_RELEASE=2.0.5
ARG MELT_DIR="/MELT/"
RUN mkdir -p ${MELT_DIR}
WORKDIR ${MELT_DIR}
-ADD MELTv${MELT_RELEASE}_patch.tar.gz .
+ADD dockerfiles/melt/MELTv${MELT_RELEASE}_patch.tar.gz .
RUN echo "export PATH=${MELT_DIR}/MELTv${MELT_RELEASE}_patch/:$PATH" > /etc/profile.d/MELT.sh
ENV PATH="${MELT_DIR}/MELTv${MELT_RELEASE}_patch/:$PATH"
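With the base image parameterized and the `ADD` path now expressed relative to the repository root, the image is presumably built with the repository root as the Docker build context. An illustrative, unofficial invocation (image names are placeholders):

```bash
# Run from the repository root so dockerfiles/melt/... resolves inside the build context.
docker build \
    -f dockerfiles/melt/Dockerfile \
    --build-arg SVBASE_IMAGE=sv-base:latest \
    -t melt:test \
    .
```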

9 changes: 9 additions & 0 deletions dockerfiles/str/Dockerfile
@@ -11,6 +11,7 @@ ARG HTSLIB_VER=1.15.1
ARG BCFTOOLS_VER=1.15.1
ARG SAMTOOLS_VER=1.15.1
ARG EH_VER=5.0.0
+ARG REVIEWER_VER=0.2.7

RUN apt-get update && apt-get install --no-install-recommends -qqy \
    curl \
@@ -46,13 +47,21 @@ RUN pip install -q numpy==1.23.4
RUN pip install -q Cython==0.29.32
RUN pip install -q pandas==1.5.0

+# Install ExpansionHunter
RUN cd /opt/ && mkdir eh && cd eh/ && \
    wget -O eh.tar.gz https://github.com/Illumina/ExpansionHunter/releases/download/v$EH_VER/ExpansionHunter-v$EH_VER-linux_x86_64.tar.gz && \
    mkdir eh && tar -xf eh.tar.gz -C eh --strip-components=1 && \
    mv eh/bin/ExpansionHunter . && \
    rm eh.tar.gz && rm -r eh && cd ..
ENV PATH="/opt/eh/:$PATH"

+# Install REViewer
+RUN cd /opt/ && mkdir reviewer && cd reviewer && \
+    wget -O reviewer.gz https://github.com/Illumina/REViewer/releases/download/v$REVIEWER_VER/REViewer-v$REVIEWER_VER-linux_x86_64.gz && \
+    gunzip reviewer.gz && \
+    chmod +x reviewer
+ENV PATH="/opt/reviewer/:$PATH"

# Install htslib, bcftools, & samtools.
RUN wget -O htslib.tar.bz2 https://github.com/samtools/htslib/releases/download/$HTSLIB_VER/htslib-$HTSLIB_VER.tar.bz2 && \
    mkdir htslib && tar -xvjf htslib.tar.bz2 -C htslib --strip-components=1 && rm htslib.tar.bz2 && \
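For context, a usage sketch of the two tools installed above; file and locus names are placeholders, and the flags follow Illumina's documentation for ExpansionHunter v5 and REViewer:

```bash
# Genotype STR loci from an aligned, indexed BAM.
ExpansionHunter \
    --reads sample.bam \
    --reference GRCh38.fa \
    --variant-catalog variant_catalog.json \
    --output-prefix sample_eh

# Visualize read support at one locus using ExpansionHunter's realigned reads.
reviewer \
    --reads sample_eh_realigned.bam \
    --vcf sample_eh.vcf \
    --reference GRCh38.fa \
    --catalog variant_catalog.json \
    --locus HTT \
    --output-prefix sample_reviewer
```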
3 changes: 2 additions & 1 deletion dockerfiles/sv-pipeline-virtual-env/Dockerfile
@@ -79,7 +79,7 @@ RUN export NEW_PACKAGES=$(diff_of_lists.sh "$RUN_DEPS" $APT_REQUIRED_PACKAGES) &

# install R packages
ARG R_PACKAGES="assertthat beeswarm BH BSDA caret cli crayon DAAG data.table devtools digest dplyr e1071 fansi fpc \
-    generics gert glue HardyWeinberg hash latticeExtra magrittr MASS Matrix metap mnormt nlme nloptr nnet \
+    generics gert glue HardyWeinberg hash latticeExtra magrittr Matrix metap mnormt nlme nloptr nnet \
    numDeriv perm pillar pkgconfig plogr plyr purrr pwr R6 RColorBrewer Rcpp reshape reshape2 rlang ROCR \
    rpart stringi stringr survival tibble tidyr tidyselect utf8 vioplot withr zoo"
ARG BIOCONDUCTOR_PKGS="SNPRelate multtest"
@@ -88,6 +88,7 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED
    apt-get -qqy update --fix-missing && \
    apt-get -qqy install --no-install-recommends $BUILD_DEPS $(fix_spaces.sh $APT_REQUIRED_PACKAGES) && \
    install_bioconductor_packages.R $BIOCONDUCTOR_PKGS && \
+    install_deprecated_R_package.sh "https://cran.r-project.org/src/contrib/Archive/MASS/MASS_7.3-58.tar.gz" && \
    install_R_packages.R $R_PACKAGES && \
    apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \
    apt-get -qqy autoremove --purge && \
5 changes: 3 additions & 2 deletions dockerfiles/wham/Dockerfile
@@ -1,4 +1,5 @@
-FROM ubuntu:18.04
+ARG SAMTOOLS_CLOUD_IMAGE=samtools-cloud:latest
+FROM $SAMTOOLS_CLOUD_IMAGE

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qqy update --fix-missing && \
@@ -18,4 +19,4 @@ RUN apt-get -qqy update --fix-missing && \
    /usr/share/man/?? \
    /usr/share/man/??_*

-ADD dockerfiles/wham/whamg /bin/
+COPY dockerfiles/wham/whamg /bin/