diff --git a/CHANGELOG.md b/CHANGELOG.md
index cb5160a..bc93c82 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,5 @@
# Changelog
-All notable changes to the pipeline-name pipeline.
+All notable changes to the StableLift pipeline.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
@@ -8,58 +8,13 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
---
## [Unreleased]
-### Added
-- Add `sample_id` extraction from BAM
-- Add template input YAMLs
-- Add pipeline-Nexflow-config as submodule and redirect set_resources_allocation
-- Add pipeline-Nextflow-module as submodule
-- Additional out of memory exit code
-- Pipeline release action
-- Template for NFTest testing results in PR template
-- Enable dependabot
-- Add example PlantUML image to README
-- Add workflow to build documentation
-- Add workflows to run Nextflow configuration tests
-
-### Changed
-- Switch resource limit checks to external scripts
-- Update links in on-prem Confluence to point to cloud-based Confluence
-- Fix `CODEOWNERS` file
-- Use `schema.check_path` for `workDir` validation
-- Add `Discussions` and `Contributors` to the Table of Contents in `README.md`
-- Update from DSL1 to DSL2
-- Standardize config structure
-- Restructure repo so main script is main.nf
-- Reorganize contributors and metadata
-- Reorganize PR template so description is at top
-- Update automatic node detection to allow for F2 detection
-- Update Issue Template
-- Standardize input/output/parameter structure in README
-- Avoid modification of input parameter `output_dir`
-- Create default docker container registry parameter for tools
-- Use `methods.setup_process_afterscript()` to capture log files
-
----
-## [1.0.0] - YYYY-MM-DD
### Added
-- For new features.
-- Added item 1.
-### Changed
-- For changes in existing functionality.
-- Changed item 1.
-
-### Deprecated
-- For soon-to-be removed features.
+- Add workflow for SNV callers (Mutect2, HaplotypeCaller, Strelka2, Muse2, SomaticSniper)
+- Add workflow for SV caller (Delly2)
+- Add pipeline diagram
-### Removed
-- For now removed features.
-- Removed item 1.
-
-### Fixed
-- For any bug fixes.
-- Fixed item 1.
+### Changed
-### Security
-- In case of vulnerabilities.
+- Sort VCF after liftover in SV branch
diff --git a/Dockerfile b/Dockerfile
index 3b95730..959ac76 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,19 +1,60 @@
-ARG R_VERSION=4.3.1
+ARG R_VERSION="4.3.1"
+
+ARG LIBBZ2_VERSION="1.0.8-*"
+ARG LIBCURL_VERSION="7.81.0-*"
+ARG LIBLZMA_VERSION="5.2.5-*"
+ARG LIBXML2_VERSION="2.9.13+dfsg-*"
+ARG PYTHON_VERSION="3.10.6-*"
+ARG ZLIB_VERSION="1:1.2.11.dfsg-*"
+ARG RLIBDIR="/usr/local/stablelift-R"
FROM rocker/r-ver:${R_VERSION} AS build
+ARG LIBBZ2_VERSION
+ARG LIBCURL_VERSION
+ARG LIBLZMA_VERSION
+ARG LIBXML2_VERSION
+ARG ZLIB_VERSION
+
+# Install build-time dependencies
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+ libbz2-dev=${LIBBZ2_VERSION} \
+ libcurl4-openssl-dev=${LIBCURL_VERSION} \
+ liblzma-dev=${LIBLZMA_VERSION} \
+ libxml2-dev=${LIBXML2_VERSION} \
+ zlib1g-dev=${ZLIB_VERSION} \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+ARG BIOC_VERSION="3.18"
+ENV BIOC_VERSION=${BIOC_VERSION}
+
+ARG RLIBDIR
+ENV RENV_PATHS_CACHE ${RLIBDIR}/.cache
+
+RUN mkdir -p ${RENV_PATHS_CACHE}
+
+WORKDIR ${RLIBDIR}
+
COPY docker/install-stablelift.R /tmp
RUN Rscript /tmp/install-stablelift.R
+# renv prints to stdout, so we need to change directories
+WORKDIR /
+RUN echo ".libPaths( c( .libPaths(), \"/usr/local/stablelift-R/renv/library/R-4.3/$(Rscript -e "cat(unname(unlist(R.version['platform'])))")\" ) )" >> /usr/local/lib/R/etc/Rprofile.site
+
FROM rocker/r-ver:${R_VERSION}
-# Overwrite the site library with just the desired packages. By default rocker
-# only bundles docopt and littler in that directory.
-COPY --from=build /tmp/userlib /usr/local/lib/R/site-library
+ARG RLIBDIR
+COPY --from=build ${RLIBDIR} ${RLIBDIR}
+COPY --from=build \
+ /usr/local/lib/R/etc/Rprofile.site \
+ /usr/local/lib/R/etc/Rprofile.site
# Install python (required for argparse). The version is not important, but
# let's pin it for stability.
-ARG PYTHON_VERSION=3.10.6-1~22.04
+ARG PYTHON_VERSION
RUN apt-get update \
&& apt-get install -y --no-install-recommends \
diff --git a/README.md b/README.md
index 6d1ed5c..263196b 100644
--- a/README.md
+++ b/README.md
@@ -39,9 +39,7 @@ If you are using the UCLA Azure cluster, please use the [submission script](http
## Flow Diagram
-A directed acyclic graph of your pipeline. The [PlantUML](https://plantuml.com/) code defining this diagram is version-controlled in the [docs/](./docs/) folder, and a [GitHub Action](https://github.com/uclahs-cds/tool-PlantUML-action) automatically regenerates the SVG image when that file is changed.
-
-![Pipeline Graph](./docs/pipeline-flow.svg)
+![Pipeline Graph](./docs/pipeline.mmd.svg)
---
diff --git a/config/default.config b/config/default.config
index 0ee67bd..05eb6f7 100644
--- a/config/default.config
+++ b/config/default.config
@@ -22,7 +22,7 @@ params {
gatk_version = '4.2.4.1'
pipeval_version = '5.0.0-rc.3'
samtools_version = '1.20'
- stablelift_version = 'branch-nwiltsie-bootstrap' // FIXME
+ stablelift_version = 'branch-nwiltsie-regroup-modules' // FIXME
docker_image_bcftools = "${-> params.docker_container_registry}/bcftools-score:${params.bcftools_version}"
docker_image_bedtools = "${-> params.docker_container_registry}/bedtools:${params.bedtools_version}"
diff --git a/config/methods.config b/config/methods.config
index 7cbf874..05cf337 100644
--- a/config/methods.config
+++ b/config/methods.config
@@ -9,7 +9,7 @@ methods {
set_output_dir = {
def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", TimeZone.getTimeZone('UTC'))
- params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}"
+ params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}/StableLift-${manifest.version}"
params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}"
}
diff --git a/config/schema.yaml b/config/schema.yaml
index 7d2645b..76b8a2f 100644
--- a/config/schema.yaml
+++ b/config/schema.yaml
@@ -3,6 +3,17 @@ sample_id:
type: 'String'
required: true
help: 'sample id supplied from input yaml'
+variant_caller:
+ type: 'String'
+ required: true
+ help: 'Tool used to call structural or somatic variants'
+ choices:
+ - Mutect2
+ - HaplotypeCaller
+ - Strelka2
+ - Muse2
+ - SomaticSniper
+ - Delly2
save_intermediate_files:
type: 'Bool'
required: true
diff --git a/config/template.config b/config/template.config
index 06c8741..e30e086 100644
--- a/config/template.config
+++ b/config/template.config
@@ -36,7 +36,14 @@ params {
chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain"
// FIXME How to describe this file?
- repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed"
+ repeat_bed = "/hot/ref/database/RepeatMasker-3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed"
+
+ // SV files
+ // FIXME Should this be bundled?
+ header_contigs = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt"
+
+ // FIXME Should this be bundled?
+ gnomad_rds = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds"
}
// Setup the pipeline config. DO NOT REMOVE THIS LINE!
diff --git a/docker/install-stablelift.R b/docker/install-stablelift.R
index b20a08d..ef50338 100644
--- a/docker/install-stablelift.R
+++ b/docker/install-stablelift.R
@@ -1,26 +1,22 @@
-# Install the remotes package to the library
-install.packages('remotes', lib = .Library)
+install.packages('renv', lib = .Library)
-# Make a temporary directory to hold all of the installed packages
-localdir <- '/tmp/userlib'
-dir.create(localdir)
+options(
+ renv.settings.bioconductor.version = Sys.getenv('BIOC_VERSION')
+)
-dependencies <- c(
- 'ROCR' = '1.0-11',
- 'argparse' = '2.2.2',
- 'caret' = '6.0-94',
- 'data.table' = '1.14.8',
- 'doParallel' = '1.0.17',
- 'foreach' = '1.5.2',
- 'ranger' = '0.15.1',
- 'vcfR' = '1.14.0'
+renv::init(
+ bare = TRUE,
+ bioconductor = Sys.getenv('BIOC_VERSION')
)
-# Unfortunately, this will install the dependencies multiple times
-for (name in names(dependencies)) {
- remotes::install_version(
- name,
- unname(dependencies[name]),
- lib = localdir
- )
-}
+renv::install(c(
+ 'ROCR@1.0-11',
+ 'argparse@2.2.2',
+ 'caret@6.0-94',
+ 'data.table@1.14.8',
+ 'doParallel@1.0.17',
+ 'foreach@1.5.2',
+ 'ranger@0.15.1',
+ 'vcfR@1.14.0',
+ 'bioc::rtracklayer@1.62.0'
+))
diff --git a/docs/pipeline.mmd b/docs/pipeline.mmd
new file mode 100644
index 0000000..f0bba92
--- /dev/null
+++ b/docs/pipeline.mmd
@@ -0,0 +1,79 @@
+%%{init: {"flowchart": {"htmlLabels": false}} }%%
+
+flowchart TD
+
+ classDef input fill:#ffffb3
+ classDef output fill:#b3de69
+ classDef gatk fill:#bebada
+ classDef bcftools fill:#fdb462
+ classDef R fill:#8dd3c7
+ classDef linux fill:#fb8072
+
+ subgraph legend ["`**Legend**`"]
+ direction RL
+ subgraph nodes ["`**Nodes**`"]
+ input[["Input File"]]:::input
+ input_node(["Parameterized Input"]):::input
+ output[["Output file"]]:::output
+ end
+
+ subgraph processes ["`**Processes**`"]
+ gatk_docker[GATK]:::gatk
+ bcftools_docker[bcftools]:::bcftools
+ r_docker[Rscript]:::R
+ linux_docker[Generic Linux]:::linux
+ end
+ end
+
+ legend
+ ~~~ input_vcf[["Input VCF"]]:::input
+ --> pipeval:::linux
+ --> sv_vs_snv{{Variant Type?}}
+
+ sv_vs_snv ------> r_liftover
+ header_contigs .-> r_liftover
+ chain_file2 ..-> r_liftover
+ gnomad_rds .-> r_extract_sv
+
+ subgraph SV ["`**SV**`"]
+ %% Other input files
+ header_contigs([header_contigs]):::input
+ chain_file2([chain_file]):::input
+ gnomad_rds([gnomad_rds]):::input
+
+ r_liftover[liftover-Delly2-vcf.R]:::R
+ ---> r_extract_sv[extract-VCF-features-SV.R]:::R
+
+ end
+
+ chain_file .-> bcftools_liftover
+ sv_vs_snv --> bcftools_liftover
+
+ subgraph SNV ["`**SNV**`"]
+ funcotator_sources([funcotator_sources]):::input
+ chain_file([chain_file]):::input
+ repeat_bed([repeat_bed]):::input
+
+ bcftools_liftover[bcftools +liftover]:::bcftools
+ ---> gatk_func[gatk Funcotator]:::gatk
+ --> bcftools_annotate["`bcftools annotate*RepeatMasker*`"]:::bcftools
+ --> bcftools_annotate2["`bcftools annotate*Trinucleotide*`"]:::bcftools
+ --> r_extract_snv[extract-VCF-features.R]:::R
+ end
+
+ funcotator_sources .-> gatk_func
+ repeat_bed .-> bcftools_annotate
+
+ joinpaths{ }
+ r_extract_snv --> joinpaths
+ r_extract_sv --> joinpaths
+ joinpaths ---> r_predict_stability
+
+ subgraph Predict Stability ["` **Predict Stability**`"]
+ r_predict_stability[predict-liftover-stability.R]:::R
+ --> bcftools_annotate3["`bcftools annotate*Stability*`"]:::bcftools
+
+ rf_model([rf_model]):::input .-> r_predict_stability
+ end
+
+ bcftools_annotate3 --> output_vcfs[["Output VCFs"]]:::output
diff --git a/docs/pipeline.mmd.svg b/docs/pipeline.mmd.svg
new file mode 100644
index 0000000..714aef7
--- /dev/null
+++ b/docs/pipeline.mmd.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/main.nf b/main.nf
index 1e45a85..abac9ca 100644
--- a/main.nf
+++ b/main.nf
@@ -10,10 +10,9 @@ include { run_validate_PipeVal_with_metadata } from './external/pipeline-Nextflo
]
)
-include { run_liftover_BCFtools } from './module/liftover.nf'
-include { run_Funcotator_GATK } from './module/funcotator.nf'
-include { workflow_apply_annotations } from './module/annotations.nf'
-include { workflow_extract_features} from './module/extract_features.nf'
+include { workflow_extract_sv_annotations } from './module/sv_workflow.nf'
+include { workflow_extract_snv_annotations } from './module/snv_workflow.nf'
+include { workflow_predict_stability } from './module/predict_stability.nf'
// Log info here
log.info """\
@@ -139,28 +138,45 @@ workflow {
.map { filename, metadata -> [metadata[0].sample_id, metadata[0] + [(metadata[1]): filename]] }
.groupTuple()
.map { it[1].inject([:]) { result, i -> result + i } }
- .set { validated_vcf_with_index }
+ .tap { validated_vcf_with_index }
+ .map { [it.sample_id, it.vcf, it.index] }
+ .set { validated_vcf_tuple }
// The values of validated_vcf_with_index are maps with keys vcf, index, and sample_id.
- run_liftover_BCFtools(
- validated_vcf_with_index.map { [it.sample_id, it.vcf, it.index] },
- input_ch_src_sequence,
- input_ch_dest_sequence,
- Channel.value(params.chain_file)
- )
+ // The values of validated_vcf_tuple are tuples of (sample_id, vcf, index).
+
+ if (params.variant_caller == "Delly2") {
+ // Take the SV branch
+ workflow_extract_sv_annotations(
+ validated_vcf_tuple,
+ Channel.value(params.header_contigs),
+ Channel.value(params.gnomad_rds),
+ Channel.value(params.chain_file),
+ Channel.value(params.variant_caller)
+ )
- run_Funcotator_GATK(
- run_liftover_BCFtools.out.liftover_vcf_with_index,
- input_ch_dest_sequence,
- Channel.value(params.funcotator_data.data_source)
- )
+ workflow_extract_sv_annotations.out.liftover_vcf.set { liftover_vcf }
+ workflow_extract_sv_annotations.out.r_annotations.set { r_annotations }
+
+ } else {
+ // Take the SNV branch
+ workflow_extract_snv_annotations(
+ validated_vcf_tuple,
+ input_ch_src_sequence,
+ input_ch_dest_sequence,
+ Channel.value(params.chain_file),
+ Channel.value(params.variant_caller)
+ )
- workflow_apply_annotations(
- run_Funcotator_GATK.out.funcotator_vcf,
- input_ch_dest_sequence
- )
+ workflow_extract_snv_annotations.out.liftover_vcf.set { liftover_vcf }
+ workflow_extract_snv_annotations.out.r_annotations.set { r_annotations }
+ }
- workflow_extract_features(
- workflow_apply_annotations.out.annotated_vcf
+ // Predict stability and apply annotate lifted VCF
+ workflow_predict_stability(
+ liftover_vcf,
+ r_annotations,
+ Channel.value(params.rf_model),
+ Channel.value(params.variant_caller)
)
}
diff --git a/module/funcotator.nf b/module/funcotator.nf
deleted file mode 100644
index 1917ae0..0000000
--- a/module/funcotator.nf
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
-* Module/process description here
-*
-* @input
-* @params
-* @output
-*/
-
-include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
-
-process run_Funcotator_GATK {
- container params.docker_image_gatk
-
- publishDir path: "${intermediate_filepath}",
- pattern: "output.vcf.gz",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}.vcf.gz" }
-
- input:
- tuple val(sample_id),
- path(vcf, stageAs: 'inputs/*'),
- path(index, stageAs: 'inputs/*')
- tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict)
- path (funcotator_sources)
-
- output:
- tuple val(sample_id), path('output.vcf.gz'), emit: funcotator_vcf
-
- script:
- intermediate_filepath = "${params.output_dir_base}/GATK-${params.gatk_version}/intermediate/${task.process}"
-
- slug = "Funcotator-${sample_id}-${dest_fasta_id}"
-
- """
- gatk Funcotator \
- --variant "${vcf}" \
- --reference "${dest_fasta_ref}" \
- --ref-version "${dest_fasta_id}" \
- --data-sources-path "${funcotator_sources}" \
- --output-file-format VCF \
- --output "output.vcf.gz"
- """
-}
diff --git a/module/liftover.nf b/module/liftover.nf
deleted file mode 100644
index 5e8f09c..0000000
--- a/module/liftover.nf
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
-* Module/process description here
-*
-* @input
-* @params
-* @output
-*/
-
-// include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf'
-
-process run_liftover_BCFtools {
- container params.docker_image_bcftools
-
- publishDir path: "${intermediate_path}",
- pattern: "reject.vcf.gz",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}-reject.vcf.gz" }
-
- publishDir path: "${intermediate_path}",
- pattern: "liftover.vcf.gz",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}.vcf.gz" }
-
- publishDir path: "${intermediate_path}",
- pattern: "liftover.vcf.gz.tbi",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}.vcf.gz.tbi" }
-
- input:
- tuple val(sample_id), path(vcf), path(index)
- tuple val(src_fasta_id), path(src_fasta_ref), path(src_fasta_fai), path(src_fasta_dict)
- tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict)
- path (chain_file)
-
- output:
- tuple val(sample_id), path('liftover.vcf.gz'), path('liftover.vcf.gz.tbi'), emit: liftover_vcf_with_index
-
- script:
- // FIXME Use a more standard path
- intermediate_path = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}"
-
- slug = "LiftOver-${sample_id}-${src_fasta_id}-to-${dest_fasta_id}"
-
- """
- bcftools +liftover \
- --output-type u \
- "${vcf}" \
- -- \
- --src-fasta-ref "${src_fasta_ref}" \
- --fasta-ref "${dest_fasta_ref}" \
- --chain "${chain_file}" \
- --reject-type z \
- --reject "reject.vcf.gz" | \
- bcftools view \
- --output-type u \
- -e 'REF="." | ALT="."' | \
- bcftools sort \
- --output-type z \
- --write-index=tbi \
- --output "liftover.vcf.gz"
- """
-}
diff --git a/module/extract_features.nf b/module/predict_stability.nf
similarity index 56%
rename from module/extract_features.nf
rename to module/predict_stability.nf
index d5b385f..1ae6987 100644
--- a/module/extract_features.nf
+++ b/module/predict_stability.nf
@@ -1,58 +1,34 @@
-include { compress_and_index_HTSlib } from './annotations.nf'
-
-process extract_VCF_features_StableLift {
- container params.docker_image_stablelift
- containerOptions "-v ${moduleDir}:${moduleDir}"
-
- publishDir path: "${intermediate_filepath}",
- pattern: "features.Rds",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}.${file(it).getExtension()}" }
-
- input:
- tuple val(sample_id), path(vcf)
-
- output:
- tuple val(sample_id), path('features.Rds'), emit: r_annotations
-
- script:
- intermediate_filepath = "${params.output_dir_base}/stablelift-${params.stablelift_version}/intermediate/${task.process}"
-
- slug = "stablelift-${sample_id}"
-
- """
- Rscript "${moduleDir}/scripts/extract-vcf-features.R" \
- --input-vcf "${vcf}" \
- --variant-caller ${params.variant_caller} \
- --output-rds "features.Rds"
- """
-}
+include { compress_and_index_HTSlib } from './utils.nf'
process predict_stability_StableLift {
container params.docker_image_stablelift
containerOptions "-v ${moduleDir}:${moduleDir}"
publishDir path: "${params.output_dir_base}/output",
- pattern: "${output_file_name}",
- mode: "copy"
+ pattern: "stability.tsv",
+ mode: "copy",
+ saveAs: { "StableLift-${sample_id}-${variant_caller}.tsv" }
input:
tuple val(sample_id), path(features_rds)
path(rf_model)
+ val(variant_caller)
output:
- tuple val(sample_id), path(output_file_name), emit: stability_tsv
+ tuple val(sample_id), path("stability.tsv"), emit: stability_tsv
script:
- output_file_name = "stablelift-${sample_id}.tsv"
-
"""
Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \
--features-dt "${features_rds}" \
--rf-model "${rf_model}" \
- --variant-caller "${params.variant_caller}" \
- --output-tsv "${output_file_name}"
+ --variant-caller "${variant_caller}" \
+ --output-tsv "stability.tsv"
+ """
+
+ stub:
+ """
+ touch "stability.tsv"
"""
}
@@ -60,8 +36,9 @@ process run_apply_stability_annotations {
container params.docker_image_bcftools
publishDir path: "${params.output_dir_base}/output",
- pattern: "*.vcf.gz{,.tbi}",
- mode: "copy"
+ pattern: "{stability,filtered}.vcf.gz{,.tbi}",
+ mode: "copy",
+ saveAs: { "${sample_id}-${it}" }
input:
tuple val(sample_id),
@@ -72,21 +49,19 @@ process run_apply_stability_annotations {
output:
tuple val(sample_id),
- path(stability_vcf),
- path(stability_vcf_tbi),
+ path("stability.vcf.gz"),
+ path("stability.vcf.gz.tbi"),
emit: stability_vcf_with_index
tuple val(sample_id),
- path(filtered_vcf),
- path(filtered_vcf_tbi),
+ path("filtered.vcf.gz"),
+ path("filtered.vcf.gz.tbi"),
emit: filtered_vcf_with_index
script:
- slug = "${sample_id}_LiftOver"
-
- stability_vcf = "${slug}_stability.vcf.gz"
+ stability_vcf = "stability.vcf.gz"
stability_vcf_tbi = "${stability_vcf}.tbi"
- filtered_vcf = "${slug}_filtered.vcf.gz"
+ filtered_vcf = "filtered.vcf.gz"
filtered_vcf_tbi = "${filtered_vcf}.tbi"
"""
@@ -107,23 +82,29 @@ process run_apply_stability_annotations {
bcftools index -t "${filtered_vcf}"
"""
+
+ stub:
+ """
+ touch "stability.vcf.gz"
+ touch "stability.vcf.gz.tbi"
+ touch "filtered.vcf.gz"
+ touch "filtered.vcf.gz.tbi"
+ """
}
-workflow workflow_extract_features {
+workflow workflow_predict_stability {
take:
vcf_with_sample_id
+ r_annotations
+ rf_model
+ variant_caller
main:
- if (params.variant_caller == "HaplotypeCaller") {
- error "HaplotypeCaller is not supported yet"
- } else {
- extract_VCF_features_StableLift(vcf_with_sample_id)
- extract_VCF_features_StableLift.out.r_annotations.set { ch_annotations }
- }
predict_stability_StableLift(
- ch_annotations,
- Channel.value(params.rf_model)
+ r_annotations,
+ rf_model,
+ variant_caller
)
compress_and_index_HTSlib(
diff --git a/module/scripts/extract-vcf-features-SV.R b/module/scripts/extract-vcf-features-SV.R
new file mode 100644
index 0000000..9927a7e
--- /dev/null
+++ b/module/scripts/extract-vcf-features-SV.R
@@ -0,0 +1,181 @@
+#!/usr/bin/env Rscript
+# extract-vcf-features-SV.R
+####################################################################################################
+#
+# Extract features from vcf
+# Intersect and annotate with gnomAD-SV vcf
+#
+####################################################################################################
+
+suppressPackageStartupMessages({
+ library(vcfR);
+ library(data.table);
+ library(argparse);
+ library(GenomicRanges);
+ });
+
+###################################################################################################
+# Input
+###################################################################################################
+# Define command line arguments
+parser <- ArgumentParser();
+parser$add_argument('--variant-caller', type = 'character', help = '');
+parser$add_argument('--input-vcf', type = 'character', help = 'Delly2 vcf');
+parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model');
+parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD Rds file');
+args <- parser$parse_args();
+
+# Save command line arguments
+for (arg in names(args)) {
+ assign(gsub('_', '.', arg), args[[arg]]);
+ }
+
+# Set parameters for interactive runs
+if (interactive()) {
+ variant.caller <- 'Delly2';
+ input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz';
+ gnomad.rds <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds';
+ }
+
+###################################################################################################
+# Functions
+###################################################################################################
+vcf.info.to.dt <- function(vcf.info) {
+ # Split each string by semicolon and convert to a list of key-value pairs
+ vcf.info <- strsplit(vcf.info, ';');
+ vcf.info <- lapply(vcf.info, function(x) {
+ x <- strsplit(x, '=');
+ as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1)));
+ })
+
+ # Combine the list of key-value pairs into a data table
+ rbindlist(vcf.info, fill = TRUE);
+ }
+
+calculate.VAF <- function(GT.row) {
+ total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2;
+ alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2;
+ return(alt / total);
+ }
+
+get.overlap <- function(start1, end1, start2, end2) {
+ max.length <- pmax((end1 - start1), (end2 - start2));
+ overlap.length <- pmin(end1, end2) - pmax(start1, start2);
+ return(overlap.length / max.length);
+ }
+
+find.SV.match <- function(this.ID, input, reference, overlap, offset) {
+ # Match SV type and CHR
+ this.variant <- input[ID == this.ID];
+ reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM];
+
+ if (this.variant$SVTYPE == 'BND') {
+ # reference[, OFFSET := abs(POS - this.variant$POS) + abs(POS2 - this.variant$POS2)];
+ reference[, OFFSET := abs(POS - this.variant$POS)];
+ reference <- reference[OFFSET < offset & CHR2 == this.variant$CHR2][order(OFFSET)];
+ } else {
+ reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)];
+ reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)];
+ }
+
+ return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference)));
+ }
+
+###################################################################################################
+# Load files
+###################################################################################################
+input.vcf <- read.vcfR(input.vcf);
+features.dt.gnomad <- readRDS(gnomad.rds);
+
+###################################################################################################
+# Data preprocessing
+###################################################################################################
+# Convert variant information into dt
+input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']);
+input.fix <- as.data.table(input.vcf@fix);
+features.dt <- cbind(input.fix[, -c('INFO')], input.info);
+
+# Format columns
+features.dt[, CONSENSUS := NULL];
+numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2');
+features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns];
+
+# Extract and aggregate per sample GT fields
+gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV');
+for (field in gt.fields) {
+ features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)];
+ }
+features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)];
+features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1];
+features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))];
+
+###################################################################################################
+# Intersect variants with gnomAD SVs
+###################################################################################################
+start.time <- Sys.time();
+
+# features.dt <- features.dt[1:100];
+features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))];
+
+gnomad.features <- c('ID', 'AF', 'POPMAX_AF', 'NCR');
+features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID');
+
+cat(format(Sys.time() - start.time, nsmall = 2), '\n');
+
+###################################################################################################
+# Format features for RF
+###################################################################################################
+continuous.features <- c(
+ 'POS',
+ 'QUAL',
+ 'END',
+ 'PE',
+ 'MAPQ',
+ 'CIPOS',
+ 'SRMAPQ',
+ 'HOMLEN',
+ 'SR',
+ 'SRQ',
+ 'CE',
+ 'RDRATIO',
+ 'SVLEN',
+ 'GQ',
+ 'RC',
+ 'RDCN',
+ 'DR',
+ 'DV',
+ 'RR',
+ 'RV',
+ 'gnomad.matches',
+ 'AF',
+ 'POPMAX_AF',
+ 'NCR'
+ );
+
+categorical.features <- c(
+ 'CHROM',
+ 'SVTYPE',
+ 'CT'
+ );
+
+# Extract features and format
+continuous.features <- continuous.features[continuous.features %in% names(features.dt)];
+categorical.features <- categorical.features[categorical.features %in% names(features.dt)];
+all.features <- c(continuous.features, categorical.features, 'ID');
+
+features.dt <- features.dt[, ..all.features];
+features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features];
+features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features];
+features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features];
+features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features];
+names(features.dt) <- make.names(names(features.dt));
+
+# Remove rows with NA
+features.dt.rows <- nrow(features.dt);
+features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ];
+cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n');
+
+###################################################################################################
+# Save features.dt for input to RF
+###################################################################################################
+saveRDS(features.dt, output.rds);
diff --git a/module/scripts/liftover-Delly2-vcf.R b/module/scripts/liftover-Delly2-vcf.R
new file mode 100644
index 0000000..d473091
--- /dev/null
+++ b/module/scripts/liftover-Delly2-vcf.R
@@ -0,0 +1,157 @@
+#!/usr/bin/env Rscript
+# liftover-Delly2-vcf.R
+###################################################################################################
+#
+#
+#
+###################################################################################################
+
+suppressPackageStartupMessages({
+ library(vcfR);
+ library(data.table);
+ library(argparse);
+ library(rtracklayer);
+ });
+
+###################################################################################################
+# Input
+###################################################################################################
+# Define command line arguments
+parser <- ArgumentParser();
+parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 Delly2 vcf');
+parser$add_argument('--header-contigs', type = 'character', help = 'Directory with vcf subsets');
+parser$add_argument('--chain-file', type = 'character', help = 'hg19ToHg38.over.chain file');
+parser$add_argument('--output', type = 'character', help = 'Where to write lifted vcf');
+args <- parser$parse_args();
+
+# Save command line arguments
+for (arg in names(args)) {
+ assign(gsub('_', '.', arg), args[[arg]]);
+ }
+
+# Set parameters for interactive runs
+if (interactive()) {
+ # input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz';
+ input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_call-sSV_delly_bcftools-merge_somatic-only.vcf.gz';
+ header.contigs <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt';
+ chain.file <- '/hot/resource/genomics/liftover_chain_files/hg19ToHg38.over.chain';
+ output <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz';
+ }
+
+###################################################################################################
+# Functions
+###################################################################################################
+vcf.fix.to.dt <- function(vcf.fix) {
+ vcf.fix <- as.data.table(vcf.fix);
+ vcf.info <- vcf.info.to.dt(vcf.fix$INFO);
+ cbind(vcf.fix[, -'INFO'], vcf.info);
+ }
+
+# vcfR::getINFO() to data.table
+vcf.info.to.dt <- function(vcf.info) {
+ vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x));
+ feature.names <- unique(unlist(lapply(vcf.info, names)));
+ vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names)));
+ setNames(as.data.table(vcf.info), feature.names);
+ }
+
+# Split vcf info field to list
+vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) {
+ list.out <- strsplit(vcf.info, split = ';');
+ list.out <- lapply(list.out, function(x) strsplit(x, split = '='));
+ labels <- sapply(list.out[[1]], function(x) x[[1]]);
+ values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]);
+ names(values) <- labels;
+ if (is.null(keep.columns)) return(values);
+ values <- values[labels %in% keep.columns];
+ return(values);
+ }
+
+###################################################################################################
+# Load files
+###################################################################################################
+input.vcf.path <- input.vcf;
+input.vcf <- read.vcfR(input.vcf);
+header.contigs <- scan(header.contigs, character());
+liftover.chain <- import.chain(chain.file);
+
+###################################################################################################
+# Data preprocessing
+###################################################################################################
+if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix))));
+# if (any(duplicated(input.vcf@fix[, 'ID']))) fix.dt[, ID := paste0(substr(ID, 1, 3), sprintf('%08d', seq_len(nrow(fix.dt))))];
+fix.dt <- as.data.table(input.vcf@fix);
+gt.dt <- as.data.table(input.vcf@gt);
+
+fix.dt <- vcf.fix.to.dt(fix.dt);
+fix.dt[, CHROM := paste0('chr', CHROM)];
+fix.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))];
+
+fix.dt <- fix.dt[, -c('CONSENSUS')];
+numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'SVLEN', 'POS2');
+character.columns <- names(fix.dt)[!names(fix.dt) %in% numeric.columns];
+fix.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns];
+fix.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns];
+
+###################################################################################################
+# Liftover
+###################################################################################################
+# Create GRanges object
+granges.37 <- makeGRangesFromDataFrame(
+ df = fix.dt,
+ seqnames.field = 'CHROM',
+ start.field = 'POS',
+ end.field = 'END',
+ keep.extra.columns = TRUE
+ );
+
+# Liftover using chain file
+granges.38 <- unlist(liftOver(granges.37, liftover.chain));
+granges.38.dt <- as.data.table(granges.38);
+
+# Create GRanges object using CHROM, CHR2, and POS2 from fix.dt
+granges.37.BND <- makeGRangesFromDataFrame(
+ df = fix.dt[SVTYPE == 'BND', ],
+ seqnames.field = 'CHR2',
+ start.field = 'POS2',
+ end.field = 'POS2',
+ keep.extra.columns = TRUE
+ );
+granges.38.BND <- as.data.table(unlist(liftOver(granges.37.BND, liftover.chain)));
+
+# Remove multiple mappings
+granges.38.dt <- granges.38.dt[!duplicated(ID)];
+granges.38.BND <- granges.38.BND[!duplicated(ID)];
+common <- intersect(granges.38.dt$ID, granges.38.BND$ID);
+
+granges.38.dt[ID %in% common, c('CHR2', 'POS2') := granges.38.BND[ID %in% common, .(seqnames, start)]];
+
+pass.liftover <- as.data.table(input.vcf@fix)$ID %in% granges.38.dt$ID;
+fix.lifted <- as.data.table(input.vcf@fix)[pass.liftover];
+gt.dt <- gt.dt[pass.liftover];
+for (i in seq_len(nrow(fix.lifted))) {
+ this.ID <- fix.lifted[i, ID];
+ this.INFO <- vcf.info.string.to.list(fix.lifted[i, INFO]);
+ this.INFO[['END']] <- granges.38.dt[i, end];
+ if (this.INFO[['SVTYPE']] == 'BND') {
+ this.INFO[['CHR2']] <- granges.38.dt[i, CHR2];
+ this.INFO[['POS2']] <- granges.38.dt[i, POS2];
+ }
+ this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '='));
+ this.INFO <- paste(this.INFO, collapse = ';');
+ this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO);
+ this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO);
+ this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO);
+ fix.lifted[i, c('CHROM', 'POS', 'INFO') := granges.38.dt[ID == ..this.ID, .(seqnames, start, ..this.INFO)]];
+ }
+
+###################################################################################################
+# Write output vcf
+###################################################################################################
+output.vcf <- input.vcf;
+output.vcf@fix <- as.matrix(fix.lifted);
+output.vcf@gt <- as.matrix(gt.dt);
+output.vcf@meta <- output.vcf@meta[!grepl('^##(contig|reference)', output.vcf@meta)];
+output.vcf@meta <- c(output.vcf@meta, header.contigs);
+
+write.vcf(output.vcf, output);
diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-liftover-stability.R
index 12892b9..d8f5dd2 100644
--- a/module/scripts/predict-liftover-stability.R
+++ b/module/scripts/predict-liftover-stability.R
@@ -33,6 +33,27 @@ for (arg in names(args)) {
assign(gsub('_', '.', arg), args[[arg]]);
}
+####################################################################################################
+# Functions
+####################################################################################################
+
+# Sort datatable by chr then position
+sort.genomic.dt <- function(x, chr = 'CHROM', pos = 'POS') {
+ setDT(x);
+ x[, eval(chr) := gsub('chr', '', get(chr))];
+ x[, eval(chr) := gsub('X', '23', get(chr))];
+ x[, eval(chr) := gsub('Y', '24', get(chr))];
+ x[, eval(chr) := as.numeric(get(chr))];
+
+ setorderv(x, c(chr, pos), c(1, 1));
+
+ x[, eval(chr) := gsub('23', 'X', get(chr))];
+ x[, eval(chr) := gsub('24', 'Y', get(chr))];
+ x[, eval(chr) := paste0('chr', get(chr))];
+
+ return(x);
+ }
+
###################################################################################################
# Load data
###################################################################################################
@@ -99,14 +120,12 @@ stability <- predict(rf.model, data = features.dt);
performance.f <- performance(rf.model$prediction, measure = 'f');
index <- which.max(unlist(performance.f@y.values));
threshold <- unlist(performance.f@x.values)[index];
-# f.score <- unlist(performance.f@y.values)[index];
performance <- performance(rf.model$prediction, 'sens', 'spec');
sensitivity <- unlist(performance@y.values)[index];
specificity <- unlist(performance@x.values)[index];
-# cat(sprintf('Max F1-score = %.3f\n', f.score));
# Convert to stability units
threshold.stability <- 1 - threshold;
cat(sprintf('Threshold = %.3f\n', threshold.stability));
@@ -126,4 +145,5 @@ annotation.dt <- data.table(
STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4),
STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE')
);
-fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = TRUE);
+sort.genomic.dt(annotation.dt);
+fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE);
diff --git a/module/annotations.nf b/module/snv_annotations.nf
similarity index 62%
rename from module/annotations.nf
rename to module/snv_annotations.nf
index 2295c79..78ea248 100644
--- a/module/annotations.nf
+++ b/module/snv_annotations.nf
@@ -1,11 +1,49 @@
+include { compress_and_index_HTSlib } from './utils.nf'
+
+process run_Funcotator_GATK {
+ container params.docker_image_gatk
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "output.vcf.gz",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "Funcotator-${sample_id}-${dest_fasta_id}.vcf.gz" }
+
+ input:
+ tuple val(sample_id),
+ path(vcf, stageAs: 'inputs/*'),
+ path(index, stageAs: 'inputs/*')
+ tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict)
+ path (funcotator_sources)
+
+ output:
+ tuple val(sample_id), path('output.vcf.gz'), emit: funcotator_vcf
+
+ script:
+ """
+ gatk Funcotator \
+ --variant "${vcf}" \
+ --reference "${dest_fasta_ref}" \
+ --ref-version "${dest_fasta_id}" \
+ --data-sources-path "${funcotator_sources}" \
+ --output-file-format VCF \
+ --output "output.vcf.gz"
+ """
+
+ stub:
+ """
+ touch "output.vcf.gz"
+ """
+}
+
process annotate_RepeatMasker_BCFtools {
container params.docker_image_bcftools
- publishDir path: "${intermediate_filepath}",
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
pattern: "output.vcf.gz",
mode: "copy",
enabled: params.save_intermediate_files,
- saveAs: { "${slug}.vcf.gz" }
+ saveAs: { "RepeatMasker-${sample_id}.vcf.gz" }
input:
tuple val(sample_id), path(vcf, stageAs: 'inputs/*')
@@ -15,9 +53,6 @@ process annotate_RepeatMasker_BCFtools {
tuple val(sample_id), path('output.vcf.gz'), emit: repeatmasker_vcf
script:
- intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}"
-
- slug = "RepeatMasker-${sample_id}"
"""
bcftools annotate \
@@ -27,22 +62,21 @@ process annotate_RepeatMasker_BCFtools {
-o "output.vcf.gz" \
"${vcf}"
"""
+
+ stub:
+ """
+ touch "output.vcf.gz"
+ """
}
process extract_TrinucleotideContext_BEDTools {
container params.docker_image_bedtools
- publishDir path: "${intermediate_filepath}",
- pattern: "output.bed",
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "output.{bed,tsv}",
mode: "copy",
enabled: params.save_intermediate_files,
- saveAs: { "${slug}.bed" }
-
- publishDir path: "${intermediate_filepath}",
- pattern: "output.tsv",
- mode: "copy",
- enabled: params.save_intermediate_files,
- saveAs: { "${slug}-full.tsv" }
+ saveAs: { "Trinucleotide-${sample_id}-${dest_fasta_id}.${file(it).getExtension()}" }
input:
tuple val(sample_id), path(vcf)
@@ -53,10 +87,6 @@ process extract_TrinucleotideContext_BEDTools {
tuple val(sample_id), path('output.bed'), emit: trinucleotide_bed
script:
- intermediate_filepath = "${params.output_dir_base}/BEDtools-${params.bedtools_version}/intermediate/${task.process}"
-
- slug = "Trinucleotide-${sample_id}"
-
"""
zcat ${vcf} \
| grep -v "^#" \
@@ -75,47 +105,22 @@ process extract_TrinucleotideContext_BEDTools {
<(cut -f 2 "partial.tsv") \
> "output.tsv"
"""
-}
-
-process compress_and_index_HTSlib {
- container params.docker_image_samtools
-
- publishDir path: "${intermediate_filepath}",
- pattern: "output.tsv.gz{,.tbi}",
- mode: "copy",
- enabled: params.save_intermediate_files
-
- input:
- tuple val(sample_id), path(tsv)
-
- output:
- tuple val(sample_id), path('output.tsv.gz'), path('output.tsv.gz.tbi'), emit: compressed_tsv_with_index
-
- script:
- intermediate_filepath = "${params.output_dir_base}/SAMtools-${params.samtools_version}/intermediate/${task.process}"
-
- slug = "Trinucleotide-${sample_id}"
+ stub:
"""
- bgzip ${tsv} --output output.tsv.gz
-
- tabix \
- --sequence 1 \
- --begin 2 \
- --end 2 \
- output.tsv.gz
+ touch "output.tsv"
+ touch "output.bed"
"""
}
-
process annotate_trinucleotide_BCFtools {
container params.docker_image_bcftools
- publishDir path: "${intermediate_filepath}",
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
pattern: "output.vcf.gz",
mode: "copy",
enabled: params.save_intermediate_files,
- saveAs: { "${slug}.vcf.gz" }
+ saveAs: { "Trinucleotide-annotated-${sample_id}.vcf.gz" }
input:
tuple val(sample_id),
@@ -127,10 +132,6 @@ process annotate_trinucleotide_BCFtools {
tuple val(sample_id), path('output.vcf.gz'), emit: trinucleotide_vcf
script:
- intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}"
-
- slug = "Trinucleotide-${sample_id}"
-
"""
bcftools annotate \
--annotations ${tsv} \
@@ -139,18 +140,28 @@ process annotate_trinucleotide_BCFtools {
--output output.vcf.gz \
${vcf}
"""
-}
-
+ stub:
+ """
+ touch "output.vcf.gz"
+ """
+}
-workflow workflow_apply_annotations {
+workflow workflow_apply_snv_annotations {
take:
vcf_with_sample_id
dest_fasta_data
main:
- annotate_RepeatMasker_BCFtools(
+
+ run_Funcotator_GATK(
vcf_with_sample_id,
+ dest_fasta_data,
+ Channel.value(params.funcotator_data.data_source)
+ )
+
+ annotate_RepeatMasker_BCFtools(
+ run_Funcotator_GATK.out.funcotator_vcf,
Channel.value(params.repeat_bed)
)
diff --git a/module/snv_workflow.nf b/module/snv_workflow.nf
new file mode 100644
index 0000000..f04de2d
--- /dev/null
+++ b/module/snv_workflow.nf
@@ -0,0 +1,113 @@
+include { workflow_apply_snv_annotations } from './snv_annotations.nf'
+
+process run_liftover_BCFtools {
+ container params.docker_image_bcftools
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "{reject,liftover}.vcf.gz{,.tbi}",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { filename -> "LiftOver-${sample_id}-${src_fasta_id}-to-${dest_fasta_id}-${filename}" }
+
+ input:
+ tuple val(sample_id), path(vcf), path(index)
+ tuple val(src_fasta_id), path(src_fasta_ref), path(src_fasta_fai), path(src_fasta_dict)
+ tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict)
+ path (chain_file)
+
+ output:
+ tuple val(sample_id), path('liftover.vcf.gz'), path('liftover.vcf.gz.tbi'), emit: liftover_vcf_with_index
+
+ script:
+ """
+ bcftools +liftover \
+ --output-type u \
+ "${vcf}" \
+ -- \
+ --src-fasta-ref "${src_fasta_ref}" \
+ --fasta-ref "${dest_fasta_ref}" \
+ --chain "${chain_file}" \
+ --reject-type z \
+ --reject "reject.vcf.gz" | \
+ bcftools view \
+ --output-type u \
+ -e 'REF="." | ALT="."' | \
+ bcftools sort \
+ --output-type z \
+ --write-index=tbi \
+ --output "liftover.vcf.gz"
+ """
+
+ stub:
+ """
+ touch "liftover.vcf.gz"
+ touch "liftover.vcf.gz.tbi"
+ touch "reject.vcf.gz"
+ """
+}
+
+process extract_VCF_features_StableLift {
+ container params.docker_image_stablelift
+ containerOptions "-v ${moduleDir}:${moduleDir}"
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "features.Rds",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "StableLift-${sample_id}.Rds" }
+
+ input:
+ tuple val(sample_id), path(vcf)
+
+ output:
+ tuple val(sample_id), path('features.Rds'), emit: r_annotations
+
+ script:
+ """
+ Rscript "${moduleDir}/scripts/extract-vcf-features.R" \
+ --input-vcf "${vcf}" \
+ --variant-caller ${params.variant_caller} \
+ --output-rds "features.Rds"
+ """
+
+ stub:
+ """
+ touch "features.Rds"
+ """
+}
+
+workflow workflow_extract_snv_annotations {
+ take:
+ vcf_with_sample_id
+ src_sequence
+ dest_sequence
+ chain_file
+ variant_caller
+
+ main:
+
+ // Step 1: Liftover
+ run_liftover_BCFtools(
+ vcf_with_sample_id,
+ src_sequence,
+ dest_sequence,
+ chain_file
+ )
+
+ // Step 2: Annotate
+ workflow_apply_snv_annotations(
+ run_liftover_BCFtools.out.liftover_vcf_with_index,
+ dest_sequence
+ )
+
+ // Step 3: Extract features
+ // FIXME Parallelize HaplotypeCaller
+ extract_VCF_features_StableLift(
+ workflow_apply_snv_annotations.out.annotated_vcf
+ )
+
+ emit:
+ liftover_vcf = workflow_apply_snv_annotations.out.annotated_vcf
+ r_annotations = extract_VCF_features_StableLift.out.r_annotations
+}
+
diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf
new file mode 100644
index 0000000..bdf2621
--- /dev/null
+++ b/module/sv_workflow.nf
@@ -0,0 +1,127 @@
+
+process liftover_SV_StableLift{
+ container params.docker_image_stablelift
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "liftover.vcf.gz",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "LiftOver-${sample_id}.vcf.gz" }
+
+ input:
+ tuple val(sample_id),
+ path(vcf, stageAs: 'inputs/*'),
+ path(index, stageAs: 'inputs/*')
+ path (header_contigs)
+ path (chain_file)
+
+ output:
+ tuple val(sample_id), path('liftover.vcf.gz'), emit: liftover_vcf
+
+ script:
+ """
+ Rscript "${moduleDir}/scripts/liftover-Delly2-vcf.R" \
+ --input-vcf "${vcf}" \
+ --header-contigs "${header_contigs}" \
+ --chain-file "${chain_file}" \
+ --output "liftover.vcf.gz"
+ """
+
+ stub:
+ """
+ touch "liftover.vcf.gz"
+ """
+}
+
+process run_sort_BCFtools {
+ container params.docker_image_bcftools
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "sorted.vcf.gz",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "BCFtools-sorted-${sample_id}.vcf.gz" }
+
+ input:
+ tuple val(sample_id), path(vcf, stageAs: 'inputs/*')
+
+ output:
+ tuple val(sample_id), path('sorted.vcf.gz'), emit: sorted_vcf
+
+ script:
+ """
+ bcftools sort \
+ --output-type z \
+ --output sorted.vcf.gz \
+ ${vcf}
+ """
+
+ stub:
+ """
+ touch sorted.vcf.gz
+ """
+}
+
+process annotate_gnomAD_StableLift {
+ container params.docker_image_stablelift
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "annotations.Rds",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "LiftOver-${sample_id}-${variant_caller}.Rds" }
+
+ input:
+ tuple val(sample_id), path(vcf, stageAs: 'inputs/*')
+ path (gnomad_rds)
+ val (variant_caller)
+
+ output:
+ tuple val(sample_id), path('annotations.Rds'), emit: r_annotations
+
+ script:
+ """
+ Rscript "${moduleDir}/scripts/extract-vcf-features-SV.R" \
+ --variant-caller "${variant_caller}" \
+ --input-vcf "${vcf}" \
+ --output-rds "annotations.Rds" \
+ --gnomad-rds ${gnomad_rds}
+ """
+
+ stub:
+ """
+ touch "annotations.Rds"
+ """
+}
+
+workflow workflow_extract_sv_annotations {
+ take:
+ vcf_with_sample_id
+ header_contigs
+ gnomad_rds
+ chain_file
+ variant_caller
+
+ main:
+
+ // Step 1: Liftover
+ liftover_SV_StableLift(
+ vcf_with_sample_id,
+ header_contigs,
+ chain_file
+ )
+ run_sort_BCFtools(
+ liftover_SV_StableLift.out.liftover_vcf
+ )
+
+ // Step 2: Extract features
+ annotate_gnomAD_StableLift(
+ run_sort_BCFtools.out.sorted_vcf,
+ gnomad_rds,
+ variant_caller
+ )
+
+ emit:
+ liftover_vcf = run_sort_BCFtools.out.sorted_vcf
+ r_annotations = annotate_gnomAD_StableLift.out.r_annotations
+}
diff --git a/module/utils.nf b/module/utils.nf
new file mode 100644
index 0000000..1c8efdf
--- /dev/null
+++ b/module/utils.nf
@@ -0,0 +1,36 @@
+
+process compress_and_index_HTSlib {
+ container params.docker_image_samtools
+
+ publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
+ pattern: "output.tsv.gz{,.tbi}",
+ mode: "copy",
+ enabled: params.save_intermediate_files,
+ saveAs: { "${sample_id}${file(it).getName().replace(file(it).getSimpleName(), "")}" }
+
+ input:
+ tuple val(sample_id), path(tsv)
+
+ output:
+ tuple val(sample_id),
+ path('output.tsv.gz'),
+ path('output.tsv.gz.tbi'),
+ emit: compressed_tsv_with_index
+
+ script:
+ """
+ bgzip ${tsv} --output output.tsv.gz
+
+ tabix \
+ --sequence 1 \
+ --begin 2 \
+ --end 2 \
+ output.tsv.gz
+ """
+
+ stub:
+ """
+ touch "output.tsv.gz"
+ touch "output.tsv.gz.tbi"
+ """
+}
diff --git a/nftest.yml b/nftest.yml
index 32644f5..6cbca4f 100644
--- a/nftest.yml
+++ b/nftest.yml
@@ -3,11 +3,19 @@ global:
temp_dir: test/work
remove_temp: true
clean_logs: true
- nf_config: test/nftest.config
+ nf_config: test/common.config
cases:
- - name: Example test
+ - name: SNV
nf_script: ./main.nf
- params_file: test/nftest.yaml
+ nf_config: test/snv.config
+ params_file: test/snv.yaml
+ skip: false
+ verbose: true
+
+ - name: SV
+ nf_script: ./main.nf
+ nf_config: test/sv.config
+ params_file: test/sv.yaml
skip: false
verbose: true
diff --git a/test/nftest.config b/test/common.config
similarity index 59%
rename from test/nftest.config
rename to test/common.config
index 2c9cbff..e3d037e 100644
--- a/test/nftest.config
+++ b/test/common.config
@@ -1,16 +1,8 @@
-includeConfig "${projectDir}/config/default.config"
-includeConfig "${projectDir}/config/methods.config"
-includeConfig "${projectDir}/nextflow.config"
-
params {
- // Choices: ["Mutect2", "HaplotypeCaller"]
- variant_caller = "Mutect2"
save_intermediate_files = true
ucla_cds = false
- rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSNV/stableLift/train_CPCG-40QC_Mutect2/RF-train_Mutect2_ntree2000_nodesize5_classratio0.Rds"
-
// Reference files
funcotator_data {
data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s"
@@ -30,8 +22,5 @@ params {
// references
chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain"
- repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed"
+ repeat_bed = "/hot/ref/database/RepeatMasker-3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed"
}
-
-// Setup the pipeline config. DO NOT REMOVE THIS LINE!
-methods.setup()
diff --git a/test/snv.config b/test/snv.config
new file mode 100644
index 0000000..a87fc52
--- /dev/null
+++ b/test/snv.config
@@ -0,0 +1,15 @@
+includeConfig "${projectDir}/config/default.config"
+includeConfig "${projectDir}/config/methods.config"
+includeConfig "${projectDir}/nextflow.config"
+
+includeConfig "${projectDir}/test/common.config"
+
+params {
+ // Choices: ["Mutect2", "HaplotypeCaller"]
+ variant_caller = "Mutect2"
+
+ rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSNV/stableLift/train_CPCG-40QC_Mutect2/RF-train_Mutect2_ntree2000_nodesize5_classratio0.Rds"
+}
+
+// Setup the pipeline config. DO NOT REMOVE THIS LINE!
+methods.setup()
diff --git a/test/nftest.yaml b/test/snv.yaml
similarity index 100%
rename from test/nftest.yaml
rename to test/snv.yaml
diff --git a/test/sv.config b/test/sv.config
new file mode 100644
index 0000000..ded1405
--- /dev/null
+++ b/test/sv.config
@@ -0,0 +1,21 @@
+includeConfig "${projectDir}/config/default.config"
+includeConfig "${projectDir}/config/methods.config"
+includeConfig "${projectDir}/nextflow.config"
+
+includeConfig "${projectDir}/test/common.config"
+
+params {
+ // Choices: ["Mutect2", "HaplotypeCaller"]
+ variant_caller = "Delly2"
+
+ rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2/RF-train_Delly2_ntree2000_nodesize20_classratio0.Rds"
+
+ // FIXME Should this be bundled?
+ header_contigs = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt"
+
+ // FIXME Should this be bundled?
+ gnomad_rds = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds"
+}
+
+// Setup the pipeline config. DO NOT REMOVE THIS LINE!
+methods.setup()
diff --git a/test/sv.yaml b/test/sv.yaml
new file mode 100644
index 0000000..75dedd9
--- /dev/null
+++ b/test/sv.yaml
@@ -0,0 +1,4 @@
+---
+sample_id: ExampleID
+input:
+ vcf: /hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/validation/TCGA-SARC_WGS/GRCh37/Delly2/TCGA-SARC_WGS_Delly2_regenotype-gSV_bcftools-merge_filter-germline.vcf.gz