diff --git a/CHANGELOG.md b/CHANGELOG.md index cb5160a..bc93c82 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,5 @@ # Changelog -All notable changes to the pipeline-name pipeline. +All notable changes to the StableLift pipeline. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). @@ -8,58 +8,13 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- ## [Unreleased] -### Added -- Add `sample_id` extraction from BAM -- Add template input YAMLs -- Add pipeline-Nexflow-config as submodule and redirect set_resources_allocation -- Add pipeline-Nextflow-module as submodule -- Additional out of memory exit code -- Pipeline release action -- Template for NFTest testing results in PR template -- Enable dependabot -- Add example PlantUML image to README -- Add workflow to build documentation -- Add workflows to run Nextflow configuration tests - -### Changed -- Switch resource limit checks to external scripts -- Update links in on-prem Confluence to point to cloud-based Confluence -- Fix `CODEOWNERS` file -- Use `schema.check_path` for `workDir` validation -- Add `Discussions` and `Contributors` to the Table of Contents in `README.md` -- Update from DSL1 to DSL2 -- Standardize config structure -- Restructure repo so main script is main.nf -- Reorganize contributors and metadata -- Reorganize PR template so description is at top -- Update automatic node detection to allow for F2 detection -- Update Issue Template -- Standardize input/output/parameter structure in README -- Avoid modification of input parameter `output_dir` -- Create default docker container registry parameter for tools -- Use `methods.setup_process_afterscript()` to capture log files - ---- -## [1.0.0] - YYYY-MM-DD ### Added -- For new features. -- Added item 1. -### Changed -- For changes in existing functionality. -- Changed item 1. - -### Deprecated -- For soon-to-be removed features. +- Add workflow for SNV callers (Mutect2, HaplotypeCaller, Strelka2, Muse2, SomaticSniper) +- Add workflow for SV caller (Delly2) +- Add pipeline diagram -### Removed -- For now removed features. -- Removed item 1. - -### Fixed -- For any bug fixes. -- Fixed item 1. +### Changed -### Security -- In case of vulnerabilities. +- Sort VCF after liftover in SV branch diff --git a/Dockerfile b/Dockerfile index 3b95730..959ac76 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,19 +1,60 @@ -ARG R_VERSION=4.3.1 +ARG R_VERSION="4.3.1" + +ARG LIBBZ2_VERSION="1.0.8-*" +ARG LIBCURL_VERSION="7.81.0-*" +ARG LIBLZMA_VERSION="5.2.5-*" +ARG LIBXML2_VERSION="2.9.13+dfsg-*" +ARG PYTHON_VERSION="3.10.6-*" +ARG ZLIB_VERSION="1:1.2.11.dfsg-*" +ARG RLIBDIR="/usr/local/stablelift-R" FROM rocker/r-ver:${R_VERSION} AS build +ARG LIBBZ2_VERSION +ARG LIBCURL_VERSION +ARG LIBLZMA_VERSION +ARG LIBXML2_VERSION +ARG ZLIB_VERSION + +# Install build-time dependencies +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + libbz2-dev=${LIBBZ2_VERSION} \ + libcurl4-openssl-dev=${LIBCURL_VERSION} \ + liblzma-dev=${LIBLZMA_VERSION} \ + libxml2-dev=${LIBXML2_VERSION} \ + zlib1g-dev=${ZLIB_VERSION} \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +ARG BIOC_VERSION="3.18" +ENV BIOC_VERSION=${BIOC_VERSION} + +ARG RLIBDIR +ENV RENV_PATHS_CACHE ${RLIBDIR}/.cache + +RUN mkdir -p ${RENV_PATHS_CACHE} + +WORKDIR ${RLIBDIR} + COPY docker/install-stablelift.R /tmp RUN Rscript /tmp/install-stablelift.R +# renv prints to stdout, so we need to change directories +WORKDIR / +RUN echo ".libPaths( c( .libPaths(), \"/usr/local/stablelift-R/renv/library/R-4.3/$(Rscript -e "cat(unname(unlist(R.version['platform'])))")\" ) )" >> /usr/local/lib/R/etc/Rprofile.site + FROM rocker/r-ver:${R_VERSION} -# Overwrite the site library with just the desired packages. By default rocker -# only bundles docopt and littler in that directory. -COPY --from=build /tmp/userlib /usr/local/lib/R/site-library +ARG RLIBDIR +COPY --from=build ${RLIBDIR} ${RLIBDIR} +COPY --from=build \ + /usr/local/lib/R/etc/Rprofile.site \ + /usr/local/lib/R/etc/Rprofile.site # Install python (required for argparse). The version is not important, but # let's pin it for stability. -ARG PYTHON_VERSION=3.10.6-1~22.04 +ARG PYTHON_VERSION RUN apt-get update \ && apt-get install -y --no-install-recommends \ diff --git a/README.md b/README.md index 6d1ed5c..263196b 100644 --- a/README.md +++ b/README.md @@ -39,9 +39,7 @@ If you are using the UCLA Azure cluster, please use the [submission script](http ## Flow Diagram -A directed acyclic graph of your pipeline. The [PlantUML](https://plantuml.com/) code defining this diagram is version-controlled in the [docs/](./docs/) folder, and a [GitHub Action](https://github.com/uclahs-cds/tool-PlantUML-action) automatically regenerates the SVG image when that file is changed. - -![Pipeline Graph](./docs/pipeline-flow.svg) +![Pipeline Graph](./docs/pipeline.mmd.svg) --- diff --git a/config/default.config b/config/default.config index 0ee67bd..05eb6f7 100644 --- a/config/default.config +++ b/config/default.config @@ -22,7 +22,7 @@ params { gatk_version = '4.2.4.1' pipeval_version = '5.0.0-rc.3' samtools_version = '1.20' - stablelift_version = 'branch-nwiltsie-bootstrap' // FIXME + stablelift_version = 'branch-nwiltsie-regroup-modules' // FIXME docker_image_bcftools = "${-> params.docker_container_registry}/bcftools-score:${params.bcftools_version}" docker_image_bedtools = "${-> params.docker_container_registry}/bedtools:${params.bedtools_version}" diff --git a/config/methods.config b/config/methods.config index 7cbf874..05cf337 100644 --- a/config/methods.config +++ b/config/methods.config @@ -9,7 +9,7 @@ methods { set_output_dir = { def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", TimeZone.getTimeZone('UTC')) - params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}" + params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}/StableLift-${manifest.version}" params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}" } diff --git a/config/schema.yaml b/config/schema.yaml index 7d2645b..76b8a2f 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -3,6 +3,17 @@ sample_id: type: 'String' required: true help: 'sample id supplied from input yaml' +variant_caller: + type: 'String' + required: true + help: 'Tool used to call structural or somatic variants' + choices: + - Mutect2 + - HaplotypeCaller + - Strelka2 + - Muse2 + - SomaticSniper + - Delly2 save_intermediate_files: type: 'Bool' required: true diff --git a/config/template.config b/config/template.config index 06c8741..e30e086 100644 --- a/config/template.config +++ b/config/template.config @@ -36,7 +36,14 @@ params { chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain" // FIXME How to describe this file? - repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed" + repeat_bed = "/hot/ref/database/RepeatMasker-3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed" + + // SV files + // FIXME Should this be bundled? + header_contigs = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt" + + // FIXME Should this be bundled? + gnomad_rds = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds" } // Setup the pipeline config. DO NOT REMOVE THIS LINE! diff --git a/docker/install-stablelift.R b/docker/install-stablelift.R index b20a08d..ef50338 100644 --- a/docker/install-stablelift.R +++ b/docker/install-stablelift.R @@ -1,26 +1,22 @@ -# Install the remotes package to the library -install.packages('remotes', lib = .Library) +install.packages('renv', lib = .Library) -# Make a temporary directory to hold all of the installed packages -localdir <- '/tmp/userlib' -dir.create(localdir) +options( + renv.settings.bioconductor.version = Sys.getenv('BIOC_VERSION') +) -dependencies <- c( - 'ROCR' = '1.0-11', - 'argparse' = '2.2.2', - 'caret' = '6.0-94', - 'data.table' = '1.14.8', - 'doParallel' = '1.0.17', - 'foreach' = '1.5.2', - 'ranger' = '0.15.1', - 'vcfR' = '1.14.0' +renv::init( + bare = TRUE, + bioconductor = Sys.getenv('BIOC_VERSION') ) -# Unfortunately, this will install the dependencies multiple times -for (name in names(dependencies)) { - remotes::install_version( - name, - unname(dependencies[name]), - lib = localdir - ) -} +renv::install(c( + 'ROCR@1.0-11', + 'argparse@2.2.2', + 'caret@6.0-94', + 'data.table@1.14.8', + 'doParallel@1.0.17', + 'foreach@1.5.2', + 'ranger@0.15.1', + 'vcfR@1.14.0', + 'bioc::rtracklayer@1.62.0' +)) diff --git a/docs/pipeline.mmd b/docs/pipeline.mmd new file mode 100644 index 0000000..f0bba92 --- /dev/null +++ b/docs/pipeline.mmd @@ -0,0 +1,79 @@ +%%{init: {"flowchart": {"htmlLabels": false}} }%% + +flowchart TD + + classDef input fill:#ffffb3 + classDef output fill:#b3de69 + classDef gatk fill:#bebada + classDef bcftools fill:#fdb462 + classDef R fill:#8dd3c7 + classDef linux fill:#fb8072 + + subgraph legend ["`**Legend**`"] + direction RL + subgraph nodes ["`**Nodes**`"] + input[["Input File"]]:::input + input_node(["Parameterized Input"]):::input + output[["Output file"]]:::output + end + + subgraph processes ["`**Processes**`"] + gatk_docker[GATK]:::gatk + bcftools_docker[bcftools]:::bcftools + r_docker[Rscript]:::R + linux_docker[Generic Linux]:::linux + end + end + + legend + ~~~ input_vcf[["Input VCF"]]:::input + --> pipeval:::linux + --> sv_vs_snv{{Variant Type?}} + + sv_vs_snv ------> r_liftover + header_contigs .-> r_liftover + chain_file2 ..-> r_liftover + gnomad_rds .-> r_extract_sv + + subgraph SV ["`**SV**`"] + %% Other input files + header_contigs([header_contigs]):::input + chain_file2([chain_file]):::input + gnomad_rds([gnomad_rds]):::input + + r_liftover[liftover-Delly2-vcf.R]:::R + ---> r_extract_sv[extract-VCF-features-SV.R]:::R + + end + + chain_file .-> bcftools_liftover + sv_vs_snv --> bcftools_liftover + + subgraph SNV ["`**SNV**`"] + funcotator_sources([funcotator_sources]):::input + chain_file([chain_file]):::input + repeat_bed([repeat_bed]):::input + + bcftools_liftover[bcftools +liftover]:::bcftools + ---> gatk_func[gatk Funcotator]:::gatk + --> bcftools_annotate["`bcftools annotate*RepeatMasker*`"]:::bcftools + --> bcftools_annotate2["`bcftools annotate*Trinucleotide*`"]:::bcftools + --> r_extract_snv[extract-VCF-features.R]:::R + end + + funcotator_sources .-> gatk_func + repeat_bed .-> bcftools_annotate + + joinpaths{ } + r_extract_snv --> joinpaths + r_extract_sv --> joinpaths + joinpaths ---> r_predict_stability + + subgraph Predict Stability ["`        **Predict Stability**`"] + r_predict_stability[predict-liftover-stability.R]:::R + --> bcftools_annotate3["`bcftools annotate*Stability*`"]:::bcftools + + rf_model([rf_model]):::input .-> r_predict_stability + end + + bcftools_annotate3 --> output_vcfs[["Output VCFs"]]:::output diff --git a/docs/pipeline.mmd.svg b/docs/pipeline.mmd.svg new file mode 100644 index 0000000..714aef7 --- /dev/null +++ b/docs/pipeline.mmd.svg @@ -0,0 +1 @@ +         Predict StabilitySNVSVbcftools annotateStabilitypredict-liftover-stability.Rrf_modelfuncotator_sourceschain_filerepeat_bedextract-VCF-features.Rbcftools annotateTrinucleotidebcftools annotateRepeatMaskergatk Funcotatorbcftools +liftoverheader_contigschain_filegnomad_rdsextract-VCF-features-SV.Rliftover-Delly2-vcf.RLegendNodesInput FileParameterized InputOutput fileProcessesGATKbcftoolsRscriptGeneric LinuxInput VCFpipevalVariant Type?Output VCFs \ No newline at end of file diff --git a/main.nf b/main.nf index 1e45a85..abac9ca 100644 --- a/main.nf +++ b/main.nf @@ -10,10 +10,9 @@ include { run_validate_PipeVal_with_metadata } from './external/pipeline-Nextflo ] ) -include { run_liftover_BCFtools } from './module/liftover.nf' -include { run_Funcotator_GATK } from './module/funcotator.nf' -include { workflow_apply_annotations } from './module/annotations.nf' -include { workflow_extract_features} from './module/extract_features.nf' +include { workflow_extract_sv_annotations } from './module/sv_workflow.nf' +include { workflow_extract_snv_annotations } from './module/snv_workflow.nf' +include { workflow_predict_stability } from './module/predict_stability.nf' // Log info here log.info """\ @@ -139,28 +138,45 @@ workflow { .map { filename, metadata -> [metadata[0].sample_id, metadata[0] + [(metadata[1]): filename]] } .groupTuple() .map { it[1].inject([:]) { result, i -> result + i } } - .set { validated_vcf_with_index } + .tap { validated_vcf_with_index } + .map { [it.sample_id, it.vcf, it.index] } + .set { validated_vcf_tuple } // The values of validated_vcf_with_index are maps with keys vcf, index, and sample_id. - run_liftover_BCFtools( - validated_vcf_with_index.map { [it.sample_id, it.vcf, it.index] }, - input_ch_src_sequence, - input_ch_dest_sequence, - Channel.value(params.chain_file) - ) + // The values of validated_vcf_tuple are tuples of (sample_id, vcf, index). + + if (params.variant_caller == "Delly2") { + // Take the SV branch + workflow_extract_sv_annotations( + validated_vcf_tuple, + Channel.value(params.header_contigs), + Channel.value(params.gnomad_rds), + Channel.value(params.chain_file), + Channel.value(params.variant_caller) + ) - run_Funcotator_GATK( - run_liftover_BCFtools.out.liftover_vcf_with_index, - input_ch_dest_sequence, - Channel.value(params.funcotator_data.data_source) - ) + workflow_extract_sv_annotations.out.liftover_vcf.set { liftover_vcf } + workflow_extract_sv_annotations.out.r_annotations.set { r_annotations } + + } else { + // Take the SNV branch + workflow_extract_snv_annotations( + validated_vcf_tuple, + input_ch_src_sequence, + input_ch_dest_sequence, + Channel.value(params.chain_file), + Channel.value(params.variant_caller) + ) - workflow_apply_annotations( - run_Funcotator_GATK.out.funcotator_vcf, - input_ch_dest_sequence - ) + workflow_extract_snv_annotations.out.liftover_vcf.set { liftover_vcf } + workflow_extract_snv_annotations.out.r_annotations.set { r_annotations } + } - workflow_extract_features( - workflow_apply_annotations.out.annotated_vcf + // Predict stability and apply annotate lifted VCF + workflow_predict_stability( + liftover_vcf, + r_annotations, + Channel.value(params.rf_model), + Channel.value(params.variant_caller) ) } diff --git a/module/funcotator.nf b/module/funcotator.nf deleted file mode 100644 index 1917ae0..0000000 --- a/module/funcotator.nf +++ /dev/null @@ -1,44 +0,0 @@ -/* -* Module/process description here -* -* @input -* @params -* @output -*/ - -include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' - -process run_Funcotator_GATK { - container params.docker_image_gatk - - publishDir path: "${intermediate_filepath}", - pattern: "output.vcf.gz", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}.vcf.gz" } - - input: - tuple val(sample_id), - path(vcf, stageAs: 'inputs/*'), - path(index, stageAs: 'inputs/*') - tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) - path (funcotator_sources) - - output: - tuple val(sample_id), path('output.vcf.gz'), emit: funcotator_vcf - - script: - intermediate_filepath = "${params.output_dir_base}/GATK-${params.gatk_version}/intermediate/${task.process}" - - slug = "Funcotator-${sample_id}-${dest_fasta_id}" - - """ - gatk Funcotator \ - --variant "${vcf}" \ - --reference "${dest_fasta_ref}" \ - --ref-version "${dest_fasta_id}" \ - --data-sources-path "${funcotator_sources}" \ - --output-file-format VCF \ - --output "output.vcf.gz" - """ -} diff --git a/module/liftover.nf b/module/liftover.nf deleted file mode 100644 index 5e8f09c..0000000 --- a/module/liftover.nf +++ /dev/null @@ -1,65 +0,0 @@ -/* -* Module/process description here -* -* @input -* @params -* @output -*/ - -// include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' - -process run_liftover_BCFtools { - container params.docker_image_bcftools - - publishDir path: "${intermediate_path}", - pattern: "reject.vcf.gz", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}-reject.vcf.gz" } - - publishDir path: "${intermediate_path}", - pattern: "liftover.vcf.gz", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}.vcf.gz" } - - publishDir path: "${intermediate_path}", - pattern: "liftover.vcf.gz.tbi", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}.vcf.gz.tbi" } - - input: - tuple val(sample_id), path(vcf), path(index) - tuple val(src_fasta_id), path(src_fasta_ref), path(src_fasta_fai), path(src_fasta_dict) - tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) - path (chain_file) - - output: - tuple val(sample_id), path('liftover.vcf.gz'), path('liftover.vcf.gz.tbi'), emit: liftover_vcf_with_index - - script: - // FIXME Use a more standard path - intermediate_path = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" - - slug = "LiftOver-${sample_id}-${src_fasta_id}-to-${dest_fasta_id}" - - """ - bcftools +liftover \ - --output-type u \ - "${vcf}" \ - -- \ - --src-fasta-ref "${src_fasta_ref}" \ - --fasta-ref "${dest_fasta_ref}" \ - --chain "${chain_file}" \ - --reject-type z \ - --reject "reject.vcf.gz" | \ - bcftools view \ - --output-type u \ - -e 'REF="." | ALT="."' | \ - bcftools sort \ - --output-type z \ - --write-index=tbi \ - --output "liftover.vcf.gz" - """ -} diff --git a/module/extract_features.nf b/module/predict_stability.nf similarity index 56% rename from module/extract_features.nf rename to module/predict_stability.nf index d5b385f..1ae6987 100644 --- a/module/extract_features.nf +++ b/module/predict_stability.nf @@ -1,58 +1,34 @@ -include { compress_and_index_HTSlib } from './annotations.nf' - -process extract_VCF_features_StableLift { - container params.docker_image_stablelift - containerOptions "-v ${moduleDir}:${moduleDir}" - - publishDir path: "${intermediate_filepath}", - pattern: "features.Rds", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}.${file(it).getExtension()}" } - - input: - tuple val(sample_id), path(vcf) - - output: - tuple val(sample_id), path('features.Rds'), emit: r_annotations - - script: - intermediate_filepath = "${params.output_dir_base}/stablelift-${params.stablelift_version}/intermediate/${task.process}" - - slug = "stablelift-${sample_id}" - - """ - Rscript "${moduleDir}/scripts/extract-vcf-features.R" \ - --input-vcf "${vcf}" \ - --variant-caller ${params.variant_caller} \ - --output-rds "features.Rds" - """ -} +include { compress_and_index_HTSlib } from './utils.nf' process predict_stability_StableLift { container params.docker_image_stablelift containerOptions "-v ${moduleDir}:${moduleDir}" publishDir path: "${params.output_dir_base}/output", - pattern: "${output_file_name}", - mode: "copy" + pattern: "stability.tsv", + mode: "copy", + saveAs: { "StableLift-${sample_id}-${variant_caller}.tsv" } input: tuple val(sample_id), path(features_rds) path(rf_model) + val(variant_caller) output: - tuple val(sample_id), path(output_file_name), emit: stability_tsv + tuple val(sample_id), path("stability.tsv"), emit: stability_tsv script: - output_file_name = "stablelift-${sample_id}.tsv" - """ Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \ --features-dt "${features_rds}" \ --rf-model "${rf_model}" \ - --variant-caller "${params.variant_caller}" \ - --output-tsv "${output_file_name}" + --variant-caller "${variant_caller}" \ + --output-tsv "stability.tsv" + """ + + stub: + """ + touch "stability.tsv" """ } @@ -60,8 +36,9 @@ process run_apply_stability_annotations { container params.docker_image_bcftools publishDir path: "${params.output_dir_base}/output", - pattern: "*.vcf.gz{,.tbi}", - mode: "copy" + pattern: "{stability,filtered}.vcf.gz{,.tbi}", + mode: "copy", + saveAs: { "${sample_id}-${it}" } input: tuple val(sample_id), @@ -72,21 +49,19 @@ process run_apply_stability_annotations { output: tuple val(sample_id), - path(stability_vcf), - path(stability_vcf_tbi), + path("stability.vcf.gz"), + path("stability.vcf.gz.tbi"), emit: stability_vcf_with_index tuple val(sample_id), - path(filtered_vcf), - path(filtered_vcf_tbi), + path("filtered.vcf.gz"), + path("filtered.vcf.gz.tbi"), emit: filtered_vcf_with_index script: - slug = "${sample_id}_LiftOver" - - stability_vcf = "${slug}_stability.vcf.gz" + stability_vcf = "stability.vcf.gz" stability_vcf_tbi = "${stability_vcf}.tbi" - filtered_vcf = "${slug}_filtered.vcf.gz" + filtered_vcf = "filtered.vcf.gz" filtered_vcf_tbi = "${filtered_vcf}.tbi" """ @@ -107,23 +82,29 @@ process run_apply_stability_annotations { bcftools index -t "${filtered_vcf}" """ + + stub: + """ + touch "stability.vcf.gz" + touch "stability.vcf.gz.tbi" + touch "filtered.vcf.gz" + touch "filtered.vcf.gz.tbi" + """ } -workflow workflow_extract_features { +workflow workflow_predict_stability { take: vcf_with_sample_id + r_annotations + rf_model + variant_caller main: - if (params.variant_caller == "HaplotypeCaller") { - error "HaplotypeCaller is not supported yet" - } else { - extract_VCF_features_StableLift(vcf_with_sample_id) - extract_VCF_features_StableLift.out.r_annotations.set { ch_annotations } - } predict_stability_StableLift( - ch_annotations, - Channel.value(params.rf_model) + r_annotations, + rf_model, + variant_caller ) compress_and_index_HTSlib( diff --git a/module/scripts/extract-vcf-features-SV.R b/module/scripts/extract-vcf-features-SV.R new file mode 100644 index 0000000..9927a7e --- /dev/null +++ b/module/scripts/extract-vcf-features-SV.R @@ -0,0 +1,181 @@ +#!/usr/bin/env Rscript +# extract-vcf-features-SV.R +#################################################################################################### +# +# Extract features from vcf +# Intersect and annotate with gnomAD-SV vcf +# +#################################################################################################### + +suppressPackageStartupMessages({ + library(vcfR); + library(data.table); + library(argparse); + library(GenomicRanges); + }); + +################################################################################################### +# Input +################################################################################################### +# Define command line arguments +parser <- ArgumentParser(); +parser$add_argument('--variant-caller', type = 'character', help = ''); +parser$add_argument('--input-vcf', type = 'character', help = 'Delly2 vcf'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); +parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD Rds file'); +args <- parser$parse_args(); + +# Save command line arguments +for (arg in names(args)) { + assign(gsub('_', '.', arg), args[[arg]]); + } + +# Set parameters for interactive runs +if (interactive()) { + variant.caller <- 'Delly2'; + input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; + gnomad.rds <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds'; + } + +################################################################################################### +# Functions +################################################################################################### +vcf.info.to.dt <- function(vcf.info) { + # Split each string by semicolon and convert to a list of key-value pairs + vcf.info <- strsplit(vcf.info, ';'); + vcf.info <- lapply(vcf.info, function(x) { + x <- strsplit(x, '='); + as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1))); + }) + + # Combine the list of key-value pairs into a data table + rbindlist(vcf.info, fill = TRUE); + } + +calculate.VAF <- function(GT.row) { + total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2; + alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2; + return(alt / total); + } + +get.overlap <- function(start1, end1, start2, end2) { + max.length <- pmax((end1 - start1), (end2 - start2)); + overlap.length <- pmin(end1, end2) - pmax(start1, start2); + return(overlap.length / max.length); + } + +find.SV.match <- function(this.ID, input, reference, overlap, offset) { + # Match SV type and CHR + this.variant <- input[ID == this.ID]; + reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM]; + + if (this.variant$SVTYPE == 'BND') { + # reference[, OFFSET := abs(POS - this.variant$POS) + abs(POS2 - this.variant$POS2)]; + reference[, OFFSET := abs(POS - this.variant$POS)]; + reference <- reference[OFFSET < offset & CHR2 == this.variant$CHR2][order(OFFSET)]; + } else { + reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)]; + reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; + } + + return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); + } + +################################################################################################### +# Load files +################################################################################################### +input.vcf <- read.vcfR(input.vcf); +features.dt.gnomad <- readRDS(gnomad.rds); + +################################################################################################### +# Data preprocessing +################################################################################################### +# Convert variant information into dt +input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); +input.fix <- as.data.table(input.vcf@fix); +features.dt <- cbind(input.fix[, -c('INFO')], input.info); + +# Format columns +features.dt[, CONSENSUS := NULL]; +numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2'); +features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; + +# Extract and aggregate per sample GT fields +gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV'); +for (field in gt.fields) { + features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)]; + } +features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)]; +features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1]; +features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))]; + +################################################################################################### +# Intersect variants with gnomAD SVs +################################################################################################### +start.time <- Sys.time(); + +# features.dt <- features.dt[1:100]; +features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; + +gnomad.features <- c('ID', 'AF', 'POPMAX_AF', 'NCR'); +features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID'); + +cat(format(Sys.time() - start.time, nsmall = 2), '\n'); + +################################################################################################### +# Format features for RF +################################################################################################### +continuous.features <- c( + 'POS', + 'QUAL', + 'END', + 'PE', + 'MAPQ', + 'CIPOS', + 'SRMAPQ', + 'HOMLEN', + 'SR', + 'SRQ', + 'CE', + 'RDRATIO', + 'SVLEN', + 'GQ', + 'RC', + 'RDCN', + 'DR', + 'DV', + 'RR', + 'RV', + 'gnomad.matches', + 'AF', + 'POPMAX_AF', + 'NCR' + ); + +categorical.features <- c( + 'CHROM', + 'SVTYPE', + 'CT' + ); + +# Extract features and format +continuous.features <- continuous.features[continuous.features %in% names(features.dt)]; +categorical.features <- categorical.features[categorical.features %in% names(features.dt)]; +all.features <- c(continuous.features, categorical.features, 'ID'); + +features.dt <- features.dt[, ..all.features]; +features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features]; +features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features]; +features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features]; +features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features]; +names(features.dt) <- make.names(names(features.dt)); + +# Remove rows with NA +features.dt.rows <- nrow(features.dt); +features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ]; +cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n'); + +################################################################################################### +# Save features.dt for input to RF +################################################################################################### +saveRDS(features.dt, output.rds); diff --git a/module/scripts/liftover-Delly2-vcf.R b/module/scripts/liftover-Delly2-vcf.R new file mode 100644 index 0000000..d473091 --- /dev/null +++ b/module/scripts/liftover-Delly2-vcf.R @@ -0,0 +1,157 @@ +#!/usr/bin/env Rscript +# liftover-Delly2-vcf.R +################################################################################################### +# +# +# +################################################################################################### + +suppressPackageStartupMessages({ + library(vcfR); + library(data.table); + library(argparse); + library(rtracklayer); + }); + +################################################################################################### +# Input +################################################################################################### +# Define command line arguments +parser <- ArgumentParser(); +parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 Delly2 vcf'); +parser$add_argument('--header-contigs', type = 'character', help = 'Directory with vcf subsets'); +parser$add_argument('--chain-file', type = 'character', help = 'hg19ToHg38.over.chain file'); +parser$add_argument('--output', type = 'character', help = 'Where to write lifted vcf'); +args <- parser$parse_args(); + +# Save command line arguments +for (arg in names(args)) { + assign(gsub('_', '.', arg), args[[arg]]); + } + +# Set parameters for interactive runs +if (interactive()) { + # input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz'; + input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_call-sSV_delly_bcftools-merge_somatic-only.vcf.gz'; + header.contigs <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt'; + chain.file <- '/hot/resource/genomics/liftover_chain_files/hg19ToHg38.over.chain'; + output <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; + } + +################################################################################################### +# Functions +################################################################################################### +vcf.fix.to.dt <- function(vcf.fix) { + vcf.fix <- as.data.table(vcf.fix); + vcf.info <- vcf.info.to.dt(vcf.fix$INFO); + cbind(vcf.fix[, -'INFO'], vcf.info); + } + +# vcfR::getINFO() to data.table +vcf.info.to.dt <- function(vcf.info) { + vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x)); + feature.names <- unique(unlist(lapply(vcf.info, names))); + vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); + setNames(as.data.table(vcf.info), feature.names); + } + +# Split vcf info field to list +vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) { + list.out <- strsplit(vcf.info, split = ';'); + list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); + labels <- sapply(list.out[[1]], function(x) x[[1]]); + values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); + names(values) <- labels; + if (is.null(keep.columns)) return(values); + values <- values[labels %in% keep.columns]; + return(values); + } + +################################################################################################### +# Load files +################################################################################################### +input.vcf.path <- input.vcf; +input.vcf <- read.vcfR(input.vcf); +header.contigs <- scan(header.contigs, character()); +liftover.chain <- import.chain(chain.file); + +################################################################################################### +# Data preprocessing +################################################################################################### +if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix)))); +# if (any(duplicated(input.vcf@fix[, 'ID']))) fix.dt[, ID := paste0(substr(ID, 1, 3), sprintf('%08d', seq_len(nrow(fix.dt))))]; +fix.dt <- as.data.table(input.vcf@fix); +gt.dt <- as.data.table(input.vcf@gt); + +fix.dt <- vcf.fix.to.dt(fix.dt); +fix.dt[, CHROM := paste0('chr', CHROM)]; +fix.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))]; + +fix.dt <- fix.dt[, -c('CONSENSUS')]; +numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'SVLEN', 'POS2'); +character.columns <- names(fix.dt)[!names(fix.dt) %in% numeric.columns]; +fix.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; +fix.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns]; + +################################################################################################### +# Liftover +################################################################################################### +# Create GRanges object +granges.37 <- makeGRangesFromDataFrame( + df = fix.dt, + seqnames.field = 'CHROM', + start.field = 'POS', + end.field = 'END', + keep.extra.columns = TRUE + ); + +# Liftover using chain file +granges.38 <- unlist(liftOver(granges.37, liftover.chain)); +granges.38.dt <- as.data.table(granges.38); + +# Create GRanges object using CHROM, CHR2, and POS2 from fix.dt +granges.37.BND <- makeGRangesFromDataFrame( + df = fix.dt[SVTYPE == 'BND', ], + seqnames.field = 'CHR2', + start.field = 'POS2', + end.field = 'POS2', + keep.extra.columns = TRUE + ); +granges.38.BND <- as.data.table(unlist(liftOver(granges.37.BND, liftover.chain))); + +# Remove multiple mappings +granges.38.dt <- granges.38.dt[!duplicated(ID)]; +granges.38.BND <- granges.38.BND[!duplicated(ID)]; +common <- intersect(granges.38.dt$ID, granges.38.BND$ID); + +granges.38.dt[ID %in% common, c('CHR2', 'POS2') := granges.38.BND[ID %in% common, .(seqnames, start)]]; + +pass.liftover <- as.data.table(input.vcf@fix)$ID %in% granges.38.dt$ID; +fix.lifted <- as.data.table(input.vcf@fix)[pass.liftover]; +gt.dt <- gt.dt[pass.liftover]; +for (i in seq_len(nrow(fix.lifted))) { + this.ID <- fix.lifted[i, ID]; + this.INFO <- vcf.info.string.to.list(fix.lifted[i, INFO]); + this.INFO[['END']] <- granges.38.dt[i, end]; + if (this.INFO[['SVTYPE']] == 'BND') { + this.INFO[['CHR2']] <- granges.38.dt[i, CHR2]; + this.INFO[['POS2']] <- granges.38.dt[i, POS2]; + } + this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); + this.INFO <- paste(this.INFO, collapse = ';'); + this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); + this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO); + this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO); + fix.lifted[i, c('CHROM', 'POS', 'INFO') := granges.38.dt[ID == ..this.ID, .(seqnames, start, ..this.INFO)]]; + } + +################################################################################################### +# Write output vcf +################################################################################################### +output.vcf <- input.vcf; +output.vcf@fix <- as.matrix(fix.lifted); +output.vcf@gt <- as.matrix(gt.dt); +output.vcf@meta <- output.vcf@meta[!grepl('^##(contig|reference)', output.vcf@meta)]; +output.vcf@meta <- c(output.vcf@meta, header.contigs); + +write.vcf(output.vcf, output); diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-liftover-stability.R index 12892b9..d8f5dd2 100644 --- a/module/scripts/predict-liftover-stability.R +++ b/module/scripts/predict-liftover-stability.R @@ -33,6 +33,27 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } +#################################################################################################### +# Functions +#################################################################################################### + +# Sort datatable by chr then position +sort.genomic.dt <- function(x, chr = 'CHROM', pos = 'POS') { + setDT(x); + x[, eval(chr) := gsub('chr', '', get(chr))]; + x[, eval(chr) := gsub('X', '23', get(chr))]; + x[, eval(chr) := gsub('Y', '24', get(chr))]; + x[, eval(chr) := as.numeric(get(chr))]; + + setorderv(x, c(chr, pos), c(1, 1)); + + x[, eval(chr) := gsub('23', 'X', get(chr))]; + x[, eval(chr) := gsub('24', 'Y', get(chr))]; + x[, eval(chr) := paste0('chr', get(chr))]; + + return(x); + } + ################################################################################################### # Load data ################################################################################################### @@ -99,14 +120,12 @@ stability <- predict(rf.model, data = features.dt); performance.f <- performance(rf.model$prediction, measure = 'f'); index <- which.max(unlist(performance.f@y.values)); threshold <- unlist(performance.f@x.values)[index]; -# f.score <- unlist(performance.f@y.values)[index]; performance <- performance(rf.model$prediction, 'sens', 'spec'); sensitivity <- unlist(performance@y.values)[index]; specificity <- unlist(performance@x.values)[index]; -# cat(sprintf('Max F1-score = %.3f\n', f.score)); # Convert to stability units threshold.stability <- 1 - threshold; cat(sprintf('Threshold = %.3f\n', threshold.stability)); @@ -126,4 +145,5 @@ annotation.dt <- data.table( STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); -fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = TRUE); +sort.genomic.dt(annotation.dt); +fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); diff --git a/module/annotations.nf b/module/snv_annotations.nf similarity index 62% rename from module/annotations.nf rename to module/snv_annotations.nf index 2295c79..78ea248 100644 --- a/module/annotations.nf +++ b/module/snv_annotations.nf @@ -1,11 +1,49 @@ +include { compress_and_index_HTSlib } from './utils.nf' + +process run_Funcotator_GATK { + container params.docker_image_gatk + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "output.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "Funcotator-${sample_id}-${dest_fasta_id}.vcf.gz" } + + input: + tuple val(sample_id), + path(vcf, stageAs: 'inputs/*'), + path(index, stageAs: 'inputs/*') + tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) + path (funcotator_sources) + + output: + tuple val(sample_id), path('output.vcf.gz'), emit: funcotator_vcf + + script: + """ + gatk Funcotator \ + --variant "${vcf}" \ + --reference "${dest_fasta_ref}" \ + --ref-version "${dest_fasta_id}" \ + --data-sources-path "${funcotator_sources}" \ + --output-file-format VCF \ + --output "output.vcf.gz" + """ + + stub: + """ + touch "output.vcf.gz" + """ +} + process annotate_RepeatMasker_BCFtools { container params.docker_image_bcftools - publishDir path: "${intermediate_filepath}", + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", pattern: "output.vcf.gz", mode: "copy", enabled: params.save_intermediate_files, - saveAs: { "${slug}.vcf.gz" } + saveAs: { "RepeatMasker-${sample_id}.vcf.gz" } input: tuple val(sample_id), path(vcf, stageAs: 'inputs/*') @@ -15,9 +53,6 @@ process annotate_RepeatMasker_BCFtools { tuple val(sample_id), path('output.vcf.gz'), emit: repeatmasker_vcf script: - intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" - - slug = "RepeatMasker-${sample_id}" """ bcftools annotate \ @@ -27,22 +62,21 @@ process annotate_RepeatMasker_BCFtools { -o "output.vcf.gz" \ "${vcf}" """ + + stub: + """ + touch "output.vcf.gz" + """ } process extract_TrinucleotideContext_BEDTools { container params.docker_image_bedtools - publishDir path: "${intermediate_filepath}", - pattern: "output.bed", + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "output.{bed,tsv}", mode: "copy", enabled: params.save_intermediate_files, - saveAs: { "${slug}.bed" } - - publishDir path: "${intermediate_filepath}", - pattern: "output.tsv", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "${slug}-full.tsv" } + saveAs: { "Trinucleotide-${sample_id}-${dest_fasta_id}.${file(it).getExtension()}" } input: tuple val(sample_id), path(vcf) @@ -53,10 +87,6 @@ process extract_TrinucleotideContext_BEDTools { tuple val(sample_id), path('output.bed'), emit: trinucleotide_bed script: - intermediate_filepath = "${params.output_dir_base}/BEDtools-${params.bedtools_version}/intermediate/${task.process}" - - slug = "Trinucleotide-${sample_id}" - """ zcat ${vcf} \ | grep -v "^#" \ @@ -75,47 +105,22 @@ process extract_TrinucleotideContext_BEDTools { <(cut -f 2 "partial.tsv") \ > "output.tsv" """ -} - -process compress_and_index_HTSlib { - container params.docker_image_samtools - - publishDir path: "${intermediate_filepath}", - pattern: "output.tsv.gz{,.tbi}", - mode: "copy", - enabled: params.save_intermediate_files - - input: - tuple val(sample_id), path(tsv) - - output: - tuple val(sample_id), path('output.tsv.gz'), path('output.tsv.gz.tbi'), emit: compressed_tsv_with_index - - script: - intermediate_filepath = "${params.output_dir_base}/SAMtools-${params.samtools_version}/intermediate/${task.process}" - - slug = "Trinucleotide-${sample_id}" + stub: """ - bgzip ${tsv} --output output.tsv.gz - - tabix \ - --sequence 1 \ - --begin 2 \ - --end 2 \ - output.tsv.gz + touch "output.tsv" + touch "output.bed" """ } - process annotate_trinucleotide_BCFtools { container params.docker_image_bcftools - publishDir path: "${intermediate_filepath}", + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", pattern: "output.vcf.gz", mode: "copy", enabled: params.save_intermediate_files, - saveAs: { "${slug}.vcf.gz" } + saveAs: { "Trinucleotide-annotated-${sample_id}.vcf.gz" } input: tuple val(sample_id), @@ -127,10 +132,6 @@ process annotate_trinucleotide_BCFtools { tuple val(sample_id), path('output.vcf.gz'), emit: trinucleotide_vcf script: - intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" - - slug = "Trinucleotide-${sample_id}" - """ bcftools annotate \ --annotations ${tsv} \ @@ -139,18 +140,28 @@ process annotate_trinucleotide_BCFtools { --output output.vcf.gz \ ${vcf} """ -} - + stub: + """ + touch "output.vcf.gz" + """ +} -workflow workflow_apply_annotations { +workflow workflow_apply_snv_annotations { take: vcf_with_sample_id dest_fasta_data main: - annotate_RepeatMasker_BCFtools( + + run_Funcotator_GATK( vcf_with_sample_id, + dest_fasta_data, + Channel.value(params.funcotator_data.data_source) + ) + + annotate_RepeatMasker_BCFtools( + run_Funcotator_GATK.out.funcotator_vcf, Channel.value(params.repeat_bed) ) diff --git a/module/snv_workflow.nf b/module/snv_workflow.nf new file mode 100644 index 0000000..f04de2d --- /dev/null +++ b/module/snv_workflow.nf @@ -0,0 +1,113 @@ +include { workflow_apply_snv_annotations } from './snv_annotations.nf' + +process run_liftover_BCFtools { + container params.docker_image_bcftools + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "{reject,liftover}.vcf.gz{,.tbi}", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { filename -> "LiftOver-${sample_id}-${src_fasta_id}-to-${dest_fasta_id}-${filename}" } + + input: + tuple val(sample_id), path(vcf), path(index) + tuple val(src_fasta_id), path(src_fasta_ref), path(src_fasta_fai), path(src_fasta_dict) + tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) + path (chain_file) + + output: + tuple val(sample_id), path('liftover.vcf.gz'), path('liftover.vcf.gz.tbi'), emit: liftover_vcf_with_index + + script: + """ + bcftools +liftover \ + --output-type u \ + "${vcf}" \ + -- \ + --src-fasta-ref "${src_fasta_ref}" \ + --fasta-ref "${dest_fasta_ref}" \ + --chain "${chain_file}" \ + --reject-type z \ + --reject "reject.vcf.gz" | \ + bcftools view \ + --output-type u \ + -e 'REF="." | ALT="."' | \ + bcftools sort \ + --output-type z \ + --write-index=tbi \ + --output "liftover.vcf.gz" + """ + + stub: + """ + touch "liftover.vcf.gz" + touch "liftover.vcf.gz.tbi" + touch "reject.vcf.gz" + """ +} + +process extract_VCF_features_StableLift { + container params.docker_image_stablelift + containerOptions "-v ${moduleDir}:${moduleDir}" + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "features.Rds", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "StableLift-${sample_id}.Rds" } + + input: + tuple val(sample_id), path(vcf) + + output: + tuple val(sample_id), path('features.Rds'), emit: r_annotations + + script: + """ + Rscript "${moduleDir}/scripts/extract-vcf-features.R" \ + --input-vcf "${vcf}" \ + --variant-caller ${params.variant_caller} \ + --output-rds "features.Rds" + """ + + stub: + """ + touch "features.Rds" + """ +} + +workflow workflow_extract_snv_annotations { + take: + vcf_with_sample_id + src_sequence + dest_sequence + chain_file + variant_caller + + main: + + // Step 1: Liftover + run_liftover_BCFtools( + vcf_with_sample_id, + src_sequence, + dest_sequence, + chain_file + ) + + // Step 2: Annotate + workflow_apply_snv_annotations( + run_liftover_BCFtools.out.liftover_vcf_with_index, + dest_sequence + ) + + // Step 3: Extract features + // FIXME Parallelize HaplotypeCaller + extract_VCF_features_StableLift( + workflow_apply_snv_annotations.out.annotated_vcf + ) + + emit: + liftover_vcf = workflow_apply_snv_annotations.out.annotated_vcf + r_annotations = extract_VCF_features_StableLift.out.r_annotations +} + diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf new file mode 100644 index 0000000..bdf2621 --- /dev/null +++ b/module/sv_workflow.nf @@ -0,0 +1,127 @@ + +process liftover_SV_StableLift{ + container params.docker_image_stablelift + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "liftover.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "LiftOver-${sample_id}.vcf.gz" } + + input: + tuple val(sample_id), + path(vcf, stageAs: 'inputs/*'), + path(index, stageAs: 'inputs/*') + path (header_contigs) + path (chain_file) + + output: + tuple val(sample_id), path('liftover.vcf.gz'), emit: liftover_vcf + + script: + """ + Rscript "${moduleDir}/scripts/liftover-Delly2-vcf.R" \ + --input-vcf "${vcf}" \ + --header-contigs "${header_contigs}" \ + --chain-file "${chain_file}" \ + --output "liftover.vcf.gz" + """ + + stub: + """ + touch "liftover.vcf.gz" + """ +} + +process run_sort_BCFtools { + container params.docker_image_bcftools + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "sorted.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "BCFtools-sorted-${sample_id}.vcf.gz" } + + input: + tuple val(sample_id), path(vcf, stageAs: 'inputs/*') + + output: + tuple val(sample_id), path('sorted.vcf.gz'), emit: sorted_vcf + + script: + """ + bcftools sort \ + --output-type z \ + --output sorted.vcf.gz \ + ${vcf} + """ + + stub: + """ + touch sorted.vcf.gz + """ +} + +process annotate_gnomAD_StableLift { + container params.docker_image_stablelift + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "annotations.Rds", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "LiftOver-${sample_id}-${variant_caller}.Rds" } + + input: + tuple val(sample_id), path(vcf, stageAs: 'inputs/*') + path (gnomad_rds) + val (variant_caller) + + output: + tuple val(sample_id), path('annotations.Rds'), emit: r_annotations + + script: + """ + Rscript "${moduleDir}/scripts/extract-vcf-features-SV.R" \ + --variant-caller "${variant_caller}" \ + --input-vcf "${vcf}" \ + --output-rds "annotations.Rds" \ + --gnomad-rds ${gnomad_rds} + """ + + stub: + """ + touch "annotations.Rds" + """ +} + +workflow workflow_extract_sv_annotations { + take: + vcf_with_sample_id + header_contigs + gnomad_rds + chain_file + variant_caller + + main: + + // Step 1: Liftover + liftover_SV_StableLift( + vcf_with_sample_id, + header_contigs, + chain_file + ) + run_sort_BCFtools( + liftover_SV_StableLift.out.liftover_vcf + ) + + // Step 2: Extract features + annotate_gnomAD_StableLift( + run_sort_BCFtools.out.sorted_vcf, + gnomad_rds, + variant_caller + ) + + emit: + liftover_vcf = run_sort_BCFtools.out.sorted_vcf + r_annotations = annotate_gnomAD_StableLift.out.r_annotations +} diff --git a/module/utils.nf b/module/utils.nf new file mode 100644 index 0000000..1c8efdf --- /dev/null +++ b/module/utils.nf @@ -0,0 +1,36 @@ + +process compress_and_index_HTSlib { + container params.docker_image_samtools + + publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", + pattern: "output.tsv.gz{,.tbi}", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${sample_id}${file(it).getName().replace(file(it).getSimpleName(), "")}" } + + input: + tuple val(sample_id), path(tsv) + + output: + tuple val(sample_id), + path('output.tsv.gz'), + path('output.tsv.gz.tbi'), + emit: compressed_tsv_with_index + + script: + """ + bgzip ${tsv} --output output.tsv.gz + + tabix \ + --sequence 1 \ + --begin 2 \ + --end 2 \ + output.tsv.gz + """ + + stub: + """ + touch "output.tsv.gz" + touch "output.tsv.gz.tbi" + """ +} diff --git a/nftest.yml b/nftest.yml index 32644f5..6cbca4f 100644 --- a/nftest.yml +++ b/nftest.yml @@ -3,11 +3,19 @@ global: temp_dir: test/work remove_temp: true clean_logs: true - nf_config: test/nftest.config + nf_config: test/common.config cases: - - name: Example test + - name: SNV nf_script: ./main.nf - params_file: test/nftest.yaml + nf_config: test/snv.config + params_file: test/snv.yaml + skip: false + verbose: true + + - name: SV + nf_script: ./main.nf + nf_config: test/sv.config + params_file: test/sv.yaml skip: false verbose: true diff --git a/test/nftest.config b/test/common.config similarity index 59% rename from test/nftest.config rename to test/common.config index 2c9cbff..e3d037e 100644 --- a/test/nftest.config +++ b/test/common.config @@ -1,16 +1,8 @@ -includeConfig "${projectDir}/config/default.config" -includeConfig "${projectDir}/config/methods.config" -includeConfig "${projectDir}/nextflow.config" - params { - // Choices: ["Mutect2", "HaplotypeCaller"] - variant_caller = "Mutect2" save_intermediate_files = true ucla_cds = false - rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSNV/stableLift/train_CPCG-40QC_Mutect2/RF-train_Mutect2_ntree2000_nodesize5_classratio0.Rds" - // Reference files funcotator_data { data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s" @@ -30,8 +22,5 @@ params { // references chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain" - repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed" + repeat_bed = "/hot/ref/database/RepeatMasker-3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed" } - -// Setup the pipeline config. DO NOT REMOVE THIS LINE! -methods.setup() diff --git a/test/snv.config b/test/snv.config new file mode 100644 index 0000000..a87fc52 --- /dev/null +++ b/test/snv.config @@ -0,0 +1,15 @@ +includeConfig "${projectDir}/config/default.config" +includeConfig "${projectDir}/config/methods.config" +includeConfig "${projectDir}/nextflow.config" + +includeConfig "${projectDir}/test/common.config" + +params { + // Choices: ["Mutect2", "HaplotypeCaller"] + variant_caller = "Mutect2" + + rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSNV/stableLift/train_CPCG-40QC_Mutect2/RF-train_Mutect2_ntree2000_nodesize5_classratio0.Rds" +} + +// Setup the pipeline config. DO NOT REMOVE THIS LINE! +methods.setup() diff --git a/test/nftest.yaml b/test/snv.yaml similarity index 100% rename from test/nftest.yaml rename to test/snv.yaml diff --git a/test/sv.config b/test/sv.config new file mode 100644 index 0000000..ded1405 --- /dev/null +++ b/test/sv.config @@ -0,0 +1,21 @@ +includeConfig "${projectDir}/config/default.config" +includeConfig "${projectDir}/config/methods.config" +includeConfig "${projectDir}/nextflow.config" + +includeConfig "${projectDir}/test/common.config" + +params { + // Choices: ["Mutect2", "HaplotypeCaller"] + variant_caller = "Delly2" + + rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2/RF-train_Delly2_ntree2000_nodesize20_classratio0.Rds" + + // FIXME Should this be bundled? + header_contigs = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt" + + // FIXME Should this be bundled? + gnomad_rds = "/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds" +} + +// Setup the pipeline config. DO NOT REMOVE THIS LINE! +methods.setup() diff --git a/test/sv.yaml b/test/sv.yaml new file mode 100644 index 0000000..75dedd9 --- /dev/null +++ b/test/sv.yaml @@ -0,0 +1,4 @@ +--- +sample_id: ExampleID +input: + vcf: /hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/validation/TCGA-SARC_WGS/GRCh37/Delly2/TCGA-SARC_WGS_Delly2_regenotype-gSV_bcftools-merge_filter-germline.vcf.gz