Commit f175e61
Merge remote-tracking branch 'upstream/main' into workflow_bug2
VJalili committed Jun 27, 2023
2 parents ddd4ce3 + 0be9cf8 commit f175e61
Showing 126 changed files with 6,049 additions and 5,242 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/pytest.yaml
@@ -11,9 +11,9 @@ jobs:
    name: Linting
    runs-on: ubuntu-latest
    steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v3
      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: 3.8
      - name: Install dependencies
62 changes: 23 additions & 39 deletions .github/workflows/sv_pipeline_docker.yml
@@ -30,7 +30,7 @@ jobs:
      image_tag: ${{ steps.image_tag.outputs.IMAGE_TAG }}
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # By default, this checks out only the current commit;
          # however, since a diff between the current commit and
@@ -79,8 +79,8 @@ jobs:
echo "::debug::HEAD_SHA: $HEAD_SHA"
# Avail the determined commit SHAs to other steps.
echo "::set-output name=BASE_SHA::$BASE_SHA"
echo "::set-output name=HEAD_SHA::$HEAD_SHA"
echo "BASE_SHA=$BASE_SHA" >> $GITHUB_OUTPUT
echo "HEAD_SHA=$HEAD_SHA" >> $GITHUB_OUTPUT
      - name: Compose Image Tag
        id: image_tag
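The change above follows GitHub's deprecation of the `::set-output` workflow command in favor of the `$GITHUB_OUTPUT` environment file. A minimal sketch of the migration pattern (the step id and variable name are illustrative, not taken from this workflow):

```bash
# Deprecated form, written to stdout:
#   echo "::set-output name=MY_VAR::some-value"
# Current form: append KEY=VALUE to the file GitHub exposes via $GITHUB_OUTPUT.
echo "MY_VAR=some-value" >> "$GITHUB_OUTPUT"
# A later step can then read ${{ steps.<step_id>.outputs.MY_VAR }}.
```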
@@ -106,7 +106,7 @@ jobs:
          IMAGE_TAG=$DATE-$RELEASE_TAG-${COMMIT_SHA::8}
          echo "::debug::Image tag: $IMAGE_TAG"
-          echo "::set-output name=IMAGE_TAG::$IMAGE_TAG"
+          echo "IMAGE_TAG=$IMAGE_TAG" >> $GITHUB_OUTPUT
  build_job:
    runs-on: ubuntu-20.04
@@ -118,7 +118,7 @@ jobs:
        python-version: ['3.8']
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # See the comment on build_args_job.
          fetch-depth: 0
@@ -128,11 +128,6 @@ jobs:
        with:
          python-version: ${{ matrix.python-version }}

-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install termcolor
      - name: Run build_docker.py
        run: |
          cd ./scripts/docker/
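The rest of this step is collapsed in this view. As a rough sketch only, assembled from the flags that appear later in this diff (the SHA values and tag are placeholders), a local invocation of the script might look like:

```bash
cd ./scripts/docker/
python build_docker.py \
    --base-git-commit 1111111 \
    --current-git-commit 2222222 \
    --image-tag my-test-tag
```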
@@ -160,22 +155,24 @@ jobs:
      DOCKERS_GCP: "./inputs/values/dockers.json"
    steps:
      - name: Checkout code
-        uses: actions/checkout@v2
+        uses: actions/checkout@v3
        with:
          # See the comment on build_args_job.
          fetch-depth: 0
          # Authenticates git using the bot's access token.
          token: ${{ secrets.BOT_PAT }}

      - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install termcolor
+      - name: Azure login
+        uses: azure/docker-login@v1
+        with:
+          login-server: ${{ secrets.AZ_CR }}
+          username: ${{ secrets.AZ_USERNAME }}
+          password: ${{ secrets.AZ_PASSWORD }}

      - name: Setup gcloud CLI
        uses: google-github-actions/setup-gcloud@…
@@ -206,39 +203,21 @@ jobs:
            sudo mv "$tmp" /etc/docker/daemon.json
            sudo systemctl restart docker.service
-      - name: Build and Publish Docker Images to GCR
+      - name: Build and Publish Docker Images to ACR & GCR
        id: build_and_publish
        run: |
          python ./scripts/docker/build_docker.py \
            --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \
            --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \
            --docker-repo us.gcr.io/${{ secrets.GCP_PROJECT_ID }}/gatk-sv \
            --image-tag ${{ needs.build_args_job.outputs.image_tag }} \
-            --input-json $DOCKERS_GCP \
-            --output-json $DOCKERS_GCP
+            --input-json $DOCKERS_AZURE $DOCKERS_GCP \
+            --output-json $DOCKERS_AZURE $DOCKERS_GCP \
+            --disable-git-protect
          CHANGED=$(git diff --quiet $DOCKERS_GCP || echo True)
          echo "::set-output name=CHANGED::$CHANGED"
-      - name: Azure login
-        uses: azure/docker-login@v1
-        with:
-          login-server: ${{ secrets.AZ_CR }}
-          username: ${{ secrets.AZ_USERNAME }}
-          password: ${{ secrets.AZ_PASSWORD }}
-
-      - name: Build and Publish Docker Images to ACR
-        run: |
-          python ./scripts/docker/build_docker.py \
-            --base-git-commit ${{ needs.build_args_job.outputs.base_sha }} \
-            --current-git-commit ${{ needs.build_args_job.outputs.head_sha }} \
-            --docker-repo ${{ secrets.AZ_CR }} \
-            --image-tag ${{ needs.build_args_job.outputs.image_tag }} \
-            --input-json $DOCKERS_AZURE \
-            --output-json $DOCKERS_AZURE \
-            --prune-after-each-image \
-            --disable-git-protect
      - name: Commit Changes to dockers_*.json
        if: steps.build_and_publish.outputs.CHANGED
        run: |
@@ -247,4 +226,9 @@ jobs:
          git config --global user.email '[email protected]'
          git commit $DOCKERS_AZURE $DOCKERS_GCP -m "Update docker images list, triggered by "${COMMIT_SHA::8}
          git pull --rebase origin main
-          git push
+          # In the following, force-push is required when the above rebase updates the branch;
+          # otherwise, the push will be rejected with the following error:
+          # > Updates were rejected because the tip of your current branch is behind its remote counterpart.
+          # See this thread for details: https://stackoverflow.com/q/39399804
+          git push -f
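A possible refinement, not part of this commit: `git push --force-with-lease` also recovers from the rebase, but refuses to overwrite remote commits it has not seen, which is safer than a bare `-f` if anything else pushes to the branch:

```bash
git pull --rebase origin main
git push --force-with-lease
```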
2 changes: 1 addition & 1 deletion .github/workflows/testwdls.yaml
@@ -24,7 +24,7 @@ jobs:
        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v2
+        uses: actions/setup-python@v3
        with:
          python-version: ${{ matrix.python-version }}

19 changes: 9 additions & 10 deletions README.md
@@ -40,6 +40,10 @@ A structural variation discovery pipeline for Illumina short-read whole-genome s
* [Cromwell](https://github.com/broadinstitute/cromwell) (v36 or higher). A dedicated server is highly recommended.
* or [Terra](https://terra.bio/) (note preconfigured GATK-SV workflows are not yet available for this platform)
* Recommended: [MELT](https://melt.igs.umaryland.edu/). Due to licensing restrictions, we cannot provide a public docker image or reference panel VCFs for this algorithm.
+* Recommended: [Manta](https://github.com/Illumina/manta). Calls SVs and indels from mapped paired-end reads, based on split-read and discordant read-pair evidence.
+* Recommended: [Wham](https://github.com/zeeev/wham). Used to predict SV breakpoints after integrating all evidence.
+* Recommended: [cn.MOPS](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3351174/). Used to detect CNVs from variations in read depth using a mixture of Poisson models.
+* Recommended: [gatk-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152). Detects germline CNVs from variations in read depth.
* Recommended: [cromshell](https://github.com/broadinstitute/cromshell) for interacting with a dedicated Cromwell server.
* Recommended: [WOMtool](https://cromwell.readthedocs.io/en/stable/WOMtool/) for validating WDL/json files.

@@ -123,11 +127,6 @@ apply, users may use placeholder values for the cloud configuration and simply d

The input values are provided only as an example and are not publicly accessible. In order to include MELT, these values must be provided by the user. MELT can be disabled by deleting these inputs and setting `GATKSVPipelineBatch.use_melt` to `false`.

-#### Requester pays buckets
-**Important**: The following parameters must be set when certain input data is in requester pays (RP) buckets:
-
-* `GATKSVPipelineSingleSample.requester_pays_cram` and `GATKSVPipelineBatch.GatherSampleEvidenceBatch.requester_pays_crams` - set to `True` if inputs are CRAM format and in an RP bucket, otherwise `False`.
-
#### Execution
We recommend running the pipeline on a dedicated [Cromwell](https://github.com/broadinstitute/cromwell) server with a [cromshell](https://github.com/broadinstitute/cromshell) client. A batch run can be started with the following commands:
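The commands themselves are collapsed in this view; the following is an illustrative sketch only, with placeholder file names rather than the collapsed originals:

```bash
# Validate the workflow, then submit it to the Cromwell server.
java -jar womtool.jar validate wdl/GATKSVPipelineBatch.wdl
cromshell submit wdl/GATKSVPipelineBatch.wdl /path/to/GATKSVPipelineBatch.json
cromshell status    # poll the status of the submitted workflow
```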

@@ -202,7 +201,7 @@ For larger cohorts, samples should be split up into batches of about 100-500 sam
`GATKSVPipelineSingleSample.wdl` runs the pipeline on a single sample using a fixed reference panel. An example run with a reference panel containing 156 samples from the [NYGC 1000G Terra workspace](https://app.terra.bio/#workspaces/anvil-datastorage/1000G-high-coverage-2019) can be found in `inputs/build/NA12878/test` after [building inputs](#building-inputs).

## <a name="gcnv-training-overview">gCNV Training</a>
-Both the cohort and single-sample modes use the GATK gCNV depth calling pipeline, which requires a [trained model](#gcnv-training) as input. The samples used for training should be technically homogeneous and similar to the samples to be processed (i.e. same sample type, library prep protocol, sequencer, sequencing center, etc.). The samples to be processed may comprise all or a subset of the training set. For small, relatively homogenous cohorts, a single gCNV model is usually sufficient. If a cohort contains multiple data sources, we recommend training a separate model for each [batch](#batching) or group of batches with similar dosage score (WGD). The model may be trained on all or a subset of the samples to which it will be applied; a reasonable default is 100 randomly-selected samples from the batch (the random selection can be done as part of the workflow by specifying a number of samples to the `n_samples_subsample` input parameter in `/wdl/TrainGCNV.wdl`).
+Both the cohort and single-sample modes use the [GATK-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152) depth calling pipeline, which requires a [trained model](#gcnv-training) as input. The samples used for training should be technically homogeneous and similar to the samples to be processed (i.e. same sample type, library prep protocol, sequencer, sequencing center, etc.). The samples to be processed may comprise all or a subset of the training set. For small, relatively homogeneous cohorts, a single gCNV model is usually sufficient. If a cohort contains multiple data sources, we recommend training a separate model for each [batch](#batching) or group of batches with similar dosage score (WGD). The model may be trained on all or a subset of the samples to which it will be applied; a reasonable default is 100 randomly-selected samples from the batch (the random selection can be done as part of the workflow by specifying a number of samples to the `n_samples_subsample` input parameter in `/wdl/TrainGCNV.wdl`).
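For illustration, the subsampling mentioned above can be requested through the workflow inputs. A minimal sketch, assuming a `TrainGCNV.` input prefix and placeholder file names (neither is taken from this README):

```bash
# Train the gCNV model on 100 randomly selected samples from the batch.
cat > train_gcnv.inputs.json <<'EOF'
{
  "TrainGCNV.n_samples_subsample": 100
}
EOF
cromshell submit wdl/TrainGCNV.wdl train_gcnv.inputs.json
```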

## <a name="reference-panel-generation">Generating a reference panel</a>
New reference panels can be generated easily from a single run of the `GATKSVPipelineBatch` workflow. If using a Cromwell server, we recommend copying the outputs to a permanent location by adding the following option to the workflow configuration file:
@@ -247,7 +246,7 @@ The following sections briefly describe each module and highlight inter-depende
## <a name="gather-sample-evidence">GatherSampleEvidence</a>
*Formerly Module00a*

-Runs raw evidence collection on each sample with the following SV callers: Manta, Wham, and/or MELT. For guidance on pre-filtering prior to `GatherSampleEvidence`, refer to the [Sample Exclusion](#sample-exclusion) section.
+Runs raw evidence collection on each sample with the following SV callers: [Manta](https://github.com/Illumina/manta), [Wham](https://github.com/zeeev/wham), and/or [MELT](https://melt.igs.umaryland.edu/). For guidance on pre-filtering prior to `GatherSampleEvidence`, refer to the [Sample Exclusion](#sample-exclusion) section.

Note: a list of sample IDs must be provided. Refer to the [sample ID requirements](#sampleids) for specifications of allowable sample IDs. IDs that do not meet these requirements may cause errors.

@@ -292,7 +291,7 @@ The purpose of sample filtering at this stage after EvidenceQC is to prevent ver


## <a name="gcnv-training">TrainGCNV</a>
-Trains a gCNV model for use in [GatherBatchEvidence](#gather-batch-evidence). The WDL can be found at `/wdl/TrainGCNV.wdl`. See the [gCNV training overview](#gcnv-training-overview) for more information.
+Trains a [gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152) model for use in [GatherBatchEvidence](#gather-batch-evidence). The WDL can be found at `/wdl/TrainGCNV.wdl`. See the [gCNV training overview](#gcnv-training-overview) for more information.

#### Prerequisites:
* [GatherSampleEvidence](#gather-sample-evidence)
@@ -309,7 +308,7 @@ Trains a gCNV model for use in [GatherBatchEvidence](#gather-batch-evidence). Th
## <a name="gather-batch-evidence">GatherBatchEvidence</a>
*Formerly Module00c*

-Runs CNV callers (cnMOPs, GATK gCNV) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching.
+Runs CNV callers ([cn.MOPS](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3351174/), [GATK-gCNV](https://gatk.broadinstitute.org/hc/en-us/articles/360035531152)) and combines single-sample raw evidence into a batch. See [above](#cohort-mode) for more information on batching.

#### Prerequisites:
* [GatherSampleEvidence](#gather-sample-evidence)
@@ -492,7 +491,7 @@ gs://gatk-sv-resources-public/hg38/v0/sv-resources/ref-panel/1KG/v2/mingq/1KGP_2
Add annotations, such as the inferred function and allele frequencies of variants, to the final VCF.

Annotation methods include:
-* Functional annotation - annotate SVs with inferred functional consequence on protein-coding regions, regulatory regions such as UTRs and promoters, and other non-coding elements.
+* Functional annotation - the GATK tool [SVAnnotate](https://gatk.broadinstitute.org/hc/en-us/articles/13832752531355-SVAnnotate) is used to annotate SVs with inferred functional consequence on protein-coding regions, regulatory regions such as UTRs and promoters, and other non-coding elements.
* Allele frequency annotation - annotate SVs with their allele frequencies across all samples, as well as within samples of a specific sex or sub-population.
* Allele frequency annotation with an external callset - annotate SVs with the allele frequencies of their overlapping SVs in another callset, e.g. the gnomAD SV callset.
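As a hedged sketch of invoking the functional-annotation step directly (the argument names are assumptions based on the SVAnnotate documentation linked above, and the file names are placeholders):

```bash
gatk SVAnnotate \
    -V cleaned.vcf.gz \
    --protein-coding-gtf gencode.v39.annotation.gtf.gz \
    -O annotated.vcf.gz
```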

5 changes: 3 additions & 2 deletions dockerfiles/melt/Dockerfile
@@ -1,4 +1,5 @@
-FROM ubuntu:18.04
+ARG SVBASE_IMAGE=sv-base:latest
+FROM $SVBASE_IMAGE

# Dependencies, including BOWTIE_2 and OpenJRE
ARG BOWTIE2_RELEASE=2.3.4.3
@@ -43,7 +44,7 @@ ARG MELT_RELEASE=2.0.5
ARG MELT_DIR="/MELT/"
RUN mkdir -p ${MELT_DIR}
WORKDIR ${MELT_DIR}
-ADD MELTv${MELT_RELEASE}_patch.tar.gz .
+ADD dockerfiles/melt/MELTv${MELT_RELEASE}_patch.tar.gz .
RUN echo "export PATH=${MELT_DIR}/MELTv${MELT_RELEASE}_patch/:$PATH" > /etc/profile.d/MELT.sh
ENV PATH="${MELT_DIR}/MELTv${MELT_RELEASE}_patch/:$PATH"
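With the base image parameterized and the `ADD` path now expressed relative to the repository root, the image is presumably built with the repository root as the Docker build context. An illustrative, unofficial invocation (image names are placeholders):

```bash
# Run from the repository root so dockerfiles/melt/... resolves inside the build context.
docker build \
    -f dockerfiles/melt/Dockerfile \
    --build-arg SVBASE_IMAGE=sv-base:latest \
    -t melt:test \
    .
```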

9 changes: 9 additions & 0 deletions dockerfiles/str/Dockerfile
@@ -11,6 +11,7 @@ ARG HTSLIB_VER=1.15.1
ARG BCFTOOLS_VER=1.15.1
ARG SAMTOOLS_VER=1.15.1
ARG EH_VER=5.0.0
+ARG REVIEWER_VER=0.2.7

RUN apt-get update && apt-get install --no-install-recommends -qqy \
    curl \
@@ -46,13 +47,21 @@ RUN pip install -q numpy==1.23.4
RUN pip install -q Cython==0.29.32
RUN pip install -q pandas==1.5.0

+# Install ExpansionHunter
RUN cd /opt/ && mkdir eh && cd eh/ && \
    wget -O eh.tar.gz https://github.com/Illumina/ExpansionHunter/releases/download/v$EH_VER/ExpansionHunter-v$EH_VER-linux_x86_64.tar.gz && \
    mkdir eh && tar -xf eh.tar.gz -C eh --strip-components=1 && \
    mv eh/bin/ExpansionHunter . && \
    rm eh.tar.gz && rm -r eh && cd ..
ENV PATH="/opt/eh/:$PATH"

+# Install REViewer
+RUN cd /opt/ && mkdir reviewer && cd reviewer && \
+    wget -O reviewer.gz https://github.com/Illumina/REViewer/releases/download/v$REVIEWER_VER/REViewer-v$REVIEWER_VER-linux_x86_64.gz && \
+    gunzip reviewer.gz && \
+    chmod +x reviewer
+ENV PATH="/opt/reviewer/:$PATH"

# Install htslib, bcftools, & samtools.
RUN wget -O htslib.tar.bz2 https://github.com/samtools/htslib/releases/download/$HTSLIB_VER/htslib-$HTSLIB_VER.tar.bz2 && \
    mkdir htslib && tar -xvjf htslib.tar.bz2 -C htslib --strip-components=1 && rm htslib.tar.bz2 && \
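For context, a usage sketch of the two tools installed above; file and locus names are placeholders, and the flags follow Illumina's documentation for ExpansionHunter v5 and REViewer:

```bash
# Genotype STR loci from an aligned, indexed BAM.
ExpansionHunter \
    --reads sample.bam \
    --reference GRCh38.fa \
    --variant-catalog variant_catalog.json \
    --output-prefix sample_eh

# Visualize read support at one locus using ExpansionHunter's realigned reads.
reviewer \
    --reads sample_eh_realigned.bam \
    --vcf sample_eh.vcf \
    --reference GRCh38.fa \
    --catalog variant_catalog.json \
    --locus HTT \
    --output-prefix sample_reviewer
```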
3 changes: 2 additions & 1 deletion dockerfiles/sv-pipeline-virtual-env/Dockerfile
@@ -79,7 +79,7 @@ RUN export NEW_PACKAGES=$(diff_of_lists.sh "$RUN_DEPS" $APT_REQUIRED_PACKAGES) &

# install R packages
ARG R_PACKAGES="assertthat beeswarm BH BSDA caret cli crayon DAAG data.table devtools digest dplyr e1071 fansi fpc \
-    generics gert glue HardyWeinberg hash latticeExtra magrittr MASS Matrix metap mnormt nlme nloptr nnet \
+    generics gert glue HardyWeinberg hash latticeExtra magrittr Matrix metap mnormt nlme nloptr nnet \
    numDeriv perm pillar pkgconfig plogr plyr purrr pwr R6 RColorBrewer Rcpp reshape reshape2 rlang ROCR \
    rpart stringi stringr survival tibble tidyr tidyselect utf8 vioplot withr zoo"
ARG BIOCONDUCTOR_PKGS="SNPRelate multtest"
@@ -88,6 +88,7 @@ RUN export APT_TRANSIENT_PACKAGES=$(diff_of_lists.sh "$BUILD_DEPS" $APT_REQUIRED
    apt-get -qqy update --fix-missing && \
    apt-get -qqy install --no-install-recommends $BUILD_DEPS $(fix_spaces.sh $APT_REQUIRED_PACKAGES) && \
    install_bioconductor_packages.R $BIOCONDUCTOR_PKGS && \
+    install_deprecated_R_package.sh "https://cran.r-project.org/src/contrib/Archive/MASS/MASS_7.3-58.tar.gz" && \
    install_R_packages.R $R_PACKAGES && \
    apt-get -qqy remove --purge $APT_TRANSIENT_PACKAGES && \
    apt-get -qqy autoremove --purge && \
5 changes: 3 additions & 2 deletions dockerfiles/wham/Dockerfile
@@ -1,4 +1,5 @@
-FROM ubuntu:18.04
+ARG SAMTOOLS_CLOUD_IMAGE=samtools-cloud:latest
+FROM $SAMTOOLS_CLOUD_IMAGE

ARG DEBIAN_FRONTEND=noninteractive
RUN apt-get -qqy update --fix-missing && \
@@ -18,4 +19,4 @@ RUN apt-get -qqy update --fix-missing && \
    /usr/share/man/?? \
    /usr/share/man/??_*

-ADD dockerfiles/wham/whamg /bin/
+COPY dockerfiles/wham/whamg /bin/