diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index cdbfd10..520b43b 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,2 +1,2 @@ # Default owner(s) -* @tyamaguchi-ucla @yashpatel6 @zhuchcn @uclahs-cds/software-wg +* @uclahs-cds/nextflow-wg diff --git a/.github/workflows/docker-build-release.yaml b/.github/workflows/docker-build-release.yaml new file mode 100644 index 0000000..486d726 --- /dev/null +++ b/.github/workflows/docker-build-release.yaml @@ -0,0 +1,33 @@ +--- +name: Update image in GHCR + +run-name: > + ${{ + github.event_name == 'delete' && format( + 'Delete `{0}{1}`', + github.event.ref_type == 'branch' && 'branch-' || '', + github.event.ref + ) + || github.ref == 'refs/heads/main' && 'Update `dev`' + || format( + 'Update `{0}{1}`', + !startsWith(github.ref, 'refs/tags') && 'branch-' || '', + github.ref_name + ) + }} docker tag + +on: + push: + branches-ignore: ['gh-pages'] + tags: ['v*'] + delete: + +jobs: + push-or-delete-image: + runs-on: ubuntu-latest + name: Update GitHub Container Registry + permissions: + contents: read + packages: write + steps: + - uses: uclahs-cds/tool-Docker-action@v2.0.0 diff --git a/.github/workflows/static-analysis.yml b/.github/workflows/static-analysis.yml index 608753d..ccfdabb 100644 --- a/.github/workflows/static-analysis.yml +++ b/.github/workflows/static-analysis.yml @@ -10,7 +10,7 @@ on: - main jobs: - CICD-base: + static-analysis: runs-on: ubuntu-latest steps: diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..3b95730 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +ARG R_VERSION=4.3.1 + +FROM rocker/r-ver:${R_VERSION} AS build + +COPY docker/install-stablelift.R /tmp +RUN Rscript /tmp/install-stablelift.R + +FROM rocker/r-ver:${R_VERSION} + +# Overwrite the site library with just the desired packages. By default rocker +# only bundles docopt and littler in that directory. +COPY --from=build /tmp/userlib /usr/local/lib/R/site-library + +# Install python (required for argparse). The version is not important, but +# let's pin it for stability. +ARG PYTHON_VERSION=3.10.6-1~22.04 + +RUN apt-get update \ && apt-get install -y --no-install-recommends \ python3=${PYTHON_VERSION} \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* + +# Add a new user/group called bldocker +RUN groupadd -g 500001 bldocker && \ useradd -l -r -u 500001 -g bldocker bldocker + +# Change the default user to bldocker from root +USER bldocker + +LABEL maintainer="Nicholas Wiltsie" diff --git a/README.md b/README.md ### 1. Step/Process 1 -> A 2-3 sentence description of each step/proccess in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. +> A 2-3 sentence description of each step/process in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. ### 2. Step/Process 2 -> A 2-3 sentence description of each step/proccess in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. +> A 2-3 sentence description of each step/process in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. ### 3.
Step/Process n -> A 2-3 sentence description of each step/proccess in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. +> A 2-3 sentence description of each step/process in your pipeline that includes the purpose of the step/process, the tool(s) being used and their version, and the expected scientific inputs/outputs (e.g: FASTQs and BAMs) of the pipeline. --- ## Inputs +UCLA pipelines have a hierarchical configuration structure to reduce code repetition: + +* `config/default.config`: Parameters with sensible defaults that may be overridden in `myconfig.config`. +* `config/template.config -> myconfig.config`: Required sample-agnostic parameters. Often shared for many samples. +* `input/template.yaml -> mysample.yaml`: Required sample-specific parameters. + ### Input YAML -> include an example of the organization structure within the YAML. Example: ```yaml -input 1: 'patient_id' +--- +sample_id: "" # Identifying string for the input sample input: - normal: - - id: - BAM: - tumor: - - id: - BAM: + vcf: "" # Path to the sample's VCF file ``` -### Config - -| Field | Type | Required | Description | -| ----- | ---- | ------------ | ------------------------ | -| param 1 | _type_ | yes/no | 1-2 sentence description of the parameter, including any defaults if any. | -| param 2 | _type_ | yes/no | 1-2 sentence description of the parameter, including any defaults if any. | -| param n | _type_ | yes/no | 1-2 sentence description of the parameter, including any defaults if any. | -| `work_dir` | path | no | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With ucla_cds, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | - -> Include the optional param `work_dir` in the inputs accompanied by a warning of the potentials dangers of using the param. Update the warning if necessary. +### Input Configuration + +| Required Parameter | Type | Description | +| ----------------------------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------- | +| `output_dir` | path | Absolute path to the directory where the output files are to be saved. | +| `variant_caller` | string | ??? | +| `rf_model` | path | ??? | +| `funcotator_data.data_source` | path | ??? | +| `funcotator_data.src_reference_id` | string | ??? | +| `funcotator_data.dest_reference_id` | string | ??? | +| `src_fasta_ref` | path | Absolute path to the source reference sequence in FASTA format. Must correspond with `funcotator_data.src_reference_id`. | +| `dest_fasta_ref` | path | Absolute path to the destination reference sequence in FASTA format. Must correspond with `funcotator_data.dest_reference_id`. | +| `chain_file` | path | LiftOver chain file between the source and destination sequences. | +| `repeat_bed` | path | ???
| + + +| Optional Parameter | Type | Default | Description | +| --------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `work_dir` | path | `/scratch/$SLURM_JOB_ID` | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With `ucla_cds`, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | +| `save_intermediate_files` | boolean | false | If set, save output files from intermediate pipeline processes. | +| `min_cpus` | int | 1 | Minimum number of CPUs that can be assigned to each process. | +| `max_cpus` | int | `SysHelper.getAvailCpus()` | Maximum number of CPUs that can be assigned to each process. | +| `min_memory` | [MemoryUnit](https://www.nextflow.io/docs/latest/script.html#implicit-classes-memoryunit) | `1.MB` | Minimum amount of memory that can be assigned to each process. | +| `max_memory` | [MemoryUnit](https://www.nextflow.io/docs/latest/script.html#implicit-classes-memoryunit) | `SysHelper.getAvailMemory()` | Maximum amount of memory that can be assigned to each process. | +| `dataset_id` | string | `""` | ??? | +| `blcds_registered_dataset` | boolean | false | Set to true when using BLCDS folder structure; use false for now. | +| `ucla_cds` | boolean | true | If set, overwrite default memory and CPU values by UCLA cluster-specific configs. | +| `src_fasta_fai` | path | Relative to `src_fasta_ref` | Index for source reference sequence. | +| `src_fasta_dict` | path | Relative to `src_fasta_ref` | Dictionary for source reference sequence. | +| `dest_fasta_fai` | path | Relative to `dest_fasta_ref` | Index for destination reference sequence. | +| `dest_fasta_dict` | path | Relative to `dest_fasta_ref` | Dictionary for destination reference sequence. | +| `docker_container_registry` | string | `ghcr.io/uclahs-cds` | Container registry for the docker images in the following table. | + +The docker images in the following table are generally defined like `docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}"`. As such, there are three ways to modify each image: + +* Change `params.docker_container_registry`. This will affect all of the images (except for GATK). +* Change `params.<tool>_version`. This will pull a different version of the same image from the registry. +* Change `params.docker_image_<tool>`. This will explicitly set the image to use, ignoring `docker_container_registry` and `<tool>_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.2.4.1`).
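As a minimal sketch of those three override points, a user config might contain something like the following (the mirror registry URL and both version tags are illustrative placeholders, not recommendations):

```groovy
// Hypothetical myconfig.config overrides -- all values illustrative only
params {
    // 1. Redirect every ghcr.io/uclahs-cds image to a mirror registry
    docker_container_registry = 'docker.example.com/uclahs-cds'

    // 2. Pull a different version of a single image from that registry
    bedtools_version = '2.31.1'

    // 3. Pin one image outright, bypassing both parameters above
    docker_image_gatk = 'broadinstitute/gatk:4.5.0.0'
}
```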
+ +| Tool Parameter | Version Parameter | Default | Notes | +| ------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------- | +| `docker_image_bcftools` | `bcftools_version` | `ghcr.io/uclahs-cds/bcftools-score:1.20_score-1.20-20240505` | This image must have both BCFtools and the score plugins available. | +| `docker_image_bedtools` | `bedtools_version` | `ghcr.io/uclahs-cds/bedtools:2.31.0` | | +| `docker_image_gatk` | `gatk_version` | `broadinstitute/gatk:4.2.4.1` | | +| `docker_image_pipeval` | `pipeval_version` | `ghcr.io/uclahs-cds/pipeval:5.0.0-rc.3` | | +| `docker_image_samtools` | `samtools_version` | `ghcr.io/uclahs-cds/samtools:1.20` | | +| `docker_image_stablelift` | `stablelift_version` | `ghcr.io/uclahs-cds/stablelift:FIXME` | This image is built and maintained via this repository. | --- ## Outputs - - | Output | Description | | ------------ | ------------------------ | -| ouput 1 | 1 - 2 sentence description of the output. | -| ouput 2 | 1 - 2 sentence description of the output. | -| ouput n | 1 - 2 sentence description of the output. | +| `*_stability.vcf.gz` | ??? | +| `*_stability.vcf.gz.tbi` | ??? | +| `*_filtered.vcf.gz` | ??? | +| `*_filtered.vcf.gz.tbi` | ??? | --- @@ -107,8 +147,8 @@ A 2-3 sentence description of the test data set(s) used to validate and test thi ### Validation - Input/Output | Description | Result - | ------------ | ------------------------ | ------------------------ | +| Input/Output | Description | Result | +| ------------ | ------------------------ | ------------------------ | | metric 1 | 1 - 2 sentence description of the metric | quantifiable result | | metric 2 | 1 - 2 sentence description of the metric | quantifiable result | | metric n | 1 - 2 sentence description of the metric | quantifiable result | @@ -133,23 +173,21 @@ Included is a template for validating your input files. For more information on ## Discussions -- [Issue tracker]() to report errors and enhancement ideas. -- Discussions can take place in [ Discussions]() -- [ pull requests]() are also open for discussion +- [Issue tracker](https://github.com/uclahs-cds/pipeline-StableLift/issues) to report errors and enhancement ideas. +- Discussions can take place in [pipeline-StableLift Discussions](https://github.com/uclahs-cds/pipeline-StableLift/discussions) +- [pipeline-StableLift pull requests](https://github.com/uclahs-cds/pipeline-StableLift/pulls) are also open for discussion --- ## Contributors -> Update link to repo-specific URL for GitHub Insights Contributors page. - -Please see list of [Contributors](https://github.com/uclahs-cds/template-NextflowPipeline/graphs/contributors) at GitHub. +Please see list of [Contributors](https://github.com/uclahs-cds/pipeline-StableLift/graphs/contributors) at GitHub. --- ## License -[pipeline name] is licensed under the GNU General Public License version 2. See the file LICENSE for the terms of the GNU GPL license. +pipeline-StableLift is licensed under the GNU General Public License version 2. See the file LICENSE for the terms of the GNU GPL license.
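The `src_fasta_fai` and `src_fasta_dict` defaults in the optional-parameter table above are derived from `src_fasta_ref` at runtime (and likewise for the `dest_*` equivalents). A rough sketch of that resolution in plain Groovy, standing in for the `Nextflow.file` helpers used in `config/default.config` (the `ref_file` variable is purely illustrative):

```groovy
// Derivation sketch using the source reference from config/template.config
def src_fasta_ref = '/hot/ref/reference/GRCh37-EBI-hs37d5/hs37d5.fa'

// src_fasta_fai: '.fai' is appended to the complete FASTA path
def src_fasta_fai = "${src_fasta_ref}.fai"
assert src_fasta_fai == '/hot/ref/reference/GRCh37-EBI-hs37d5/hs37d5.fa.fai'

// src_fasta_dict: the final extension is swapped for '.dict'
def ref_file = new File(src_fasta_ref)
def src_fasta_dict = new File(ref_file.parentFile, ref_file.name.replaceAll(/\.[^.]+$/, '') + '.dict').path
assert src_fasta_dict == '/hot/ref/reference/GRCh37-EBI-hs37d5/hs37d5.dict'
```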
diff --git a/config/F16.config b/config/F16.config new file mode 100644 index 0000000..e69de29 diff --git a/config/F2.config b/config/F2.config new file mode 100644 index 0000000..e69de29 diff --git a/config/F32.config b/config/F32.config new file mode 100644 index 0000000..e69de29 diff --git a/config/F72.config b/config/F72.config index 21ae4b0..e69de29 100644 --- a/config/F72.config +++ b/config/F72.config @@ -1,23 +0,0 @@ -// Static process resource allocation here -// Specific for each node type - F72 here -process { - withName: process_name { - cpus = - memory = - // Other process-specific allocations here - } - withName: process_name_2 { - cpus = - memory = - // Other process-specific allocations here - } - withName: process_name_3 { - cpus = - memory = - // Other process-specific allocations here - } - withName: example_process { - cpus = 2 - memory = 5.GB - } -} diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config new file mode 100644 index 0000000..bcf24fc --- /dev/null +++ b/config/custom_schema_types.config @@ -0,0 +1,49 @@ +import nextflow.Nextflow + +/** +* This custom schema namespace implements a custom type for Funcotator data sources. +*/ +custom_schema_types { + + /** + * Check that input refers to a properly configured Funcotator data source + * directory + */ + check_funcotator_data_source = { Map options, String name, Map properties -> + if (!(options[name] in Map)) { + throw new Exception("${name} should be a Map, not ${options[name].getClass()}.") + } + + options[name].each { entry -> + def entry_as_map = [:] + entry_as_map[entry.key] = entry.value + schema.validate_parameter(entry_as_map, entry.key, properties.elements[entry.key]) + } + + /* + Confirm that the destination reference sequence ID is a valid subfolder + in at least _one_ of the data sources. A reference-specific data source + directory requires a .config file at a path like: + + dataSourcesFolder/<data-source>/hg19/<data-source>.config + dataSourcesFolder/<data-source>/hg38/<data-source>.config + + There can be multiple <data-source> folders, but there should be only + one config per reference-specific subfolder.
+ */ + config_glob = [ + options[name].data_source, + "*", + options[name].dest_reference_id, + "*.config" + ].join("/") + + if (!Nextflow.file(config_glob)) { + throw new Exception("${name} is improperly configured - no files found matching '${config_glob}'") + } + } + + types = [ + 'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source + ] +} diff --git a/config/default.config b/config/default.config index 2e67a46..0ee67bd 100644 --- a/config/default.config +++ b/config/default.config @@ -1,4 +1,5 @@ import nextflow.util.SysHelper +import nextflow.Nextflow // Default inputs/parameters of the pipeline params { @@ -7,29 +8,50 @@ params { min_cpus = 1 min_memory = 1.MB + save_intermediate_files = false + + dataset_id = '' + blcds_registered_dataset = false + ucla_cds = true docker_container_registry = "ghcr.io/uclahs-cds" // Docker images - // REPLACE WITH TOOLS USED - tool_a_version = 'x.x.x' // Docker version for tool a - docker_image_tool_a = "${-> params.docker_container_registry}/tool_a:${params.tool_a_version}" + bcftools_version = '1.20_score-1.20-20240505' + bedtools_version = '2.31.0' + gatk_version = '4.2.4.1' + pipeval_version = '5.0.0-rc.3' + samtools_version = '1.20' + stablelift_version = 'branch-nwiltsie-bootstrap' // FIXME + + docker_image_bcftools = "${-> params.docker_container_registry}/bcftools-score:${params.bcftools_version}" + docker_image_bedtools = "${-> params.docker_container_registry}/bedtools:${params.bedtools_version}" + docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}" + docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" + docker_image_samtools = "${-> params.docker_container_registry}/samtools:${params.samtools_version}" + docker_image_stablelift = "${-> params.docker_container_registry}/stablelift:${params.stablelift_version}" + + // These are the index files associated with the source and destination + // reference sequences + src_fasta_fai = "${ -> params.src_fasta_ref}.fai" + dest_fasta_fai = "${ -> params.dest_fasta_ref}.fai" + + src_fasta_dict = "${ -> Nextflow.file(params.src_fasta_ref).resolveSibling(Nextflow.file(params.src_fasta_ref).getBaseName() + '.dict') }" + dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }" } // Process specific scope process { // Process results are stored to local cache. 
- // If pipeline is launched with the 'resume' option, existing cache results will be used when available - // rather than re-executing processes + // If pipeline is launched with the 'resume' option, existing cache results + // will be used when available rather than re-executing processes cache = true // Forward process 'stdout' to shell terminal and, consequently, the log file echo = true executor = 'local' - // Other directives or options that should apply for every process - - // total amount of resources avaible to the pipeline + // Total amount of resources available to the pipeline cpus = params.max_cpus memory = params.max_memory } diff --git a/config/methods.config b/config/methods.config index 761fd3b..7cbf874 100644 --- a/config/methods.config +++ b/config/methods.config @@ -2,45 +2,15 @@ includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/methods/common_methods.config" includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/schema/schema.config" includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/bam/bam_parser.config" +includeConfig "${projectDir}/external/pipeline-Nextflow-config/config/store_object_as_json/store_object_as_json.config" methods { - get_ids_from_bams = { - params.samples_to_process = [] as Set - params.input.BAM.each { k, v -> - v.each { bam_path -> - def bam_header = bam_parser.parse_bam_header(bam_path) - def sm_tags = bam_header['read_group'].collect{ it['SM'] }.unique() - if (sm_tags.size() != 1) { - throw new Exception("${bam_path} contains multiple samples! Please run pipeline with single sample BAMs.") - } - if (alreadyExists == params.samples_to_process.any { it.orig_id == sm_tags[0] }) { - throw new Exception("Sample ${sm_tags[0]} was found in multiple BAMs. Please provide only one BAM per sample") - } - new_sm_tag = methods.sanitize_string(sm_tags[0]) - params.samples_to_process.add(['orig_id': sm_tags[0], 'id': new_sm_tag, 'path': bam_path, 'sample_type': k]) - } - } - } - // Set the output and log output dirs here. + // Set the output and log output dirs here. 
set_output_dir = { - - tz = TimeZone.getTimeZone('UTC') - def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", tz) - - params.dataset_registry_prefix = '/hot/data' + def date = new Date().format("yyyyMMdd'T'HHmmss'Z'", TimeZone.getTimeZone('UTC')) - if (params.blcds_registered_dataset == true) { - if ("${params.dataset_id.length()}" != 11) { - throw new Exception("Dataset id must be eleven characters long") - } - def disease = "${params.dataset_id.substring(0,4)}" - // Need to fill in analyte, technology, raw_od_aligned, genome, pipeline-name - params.output_log_directory = "${params.dataset_registry_prefix}/$disease/${params.dataset_id}/${patient}/${sample}/analyte/technology,raw_or_aligned/genome/log/pipeline-name/$date" - params.disease = "${disease}" - } else { - params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_name.replace(' ', '_')}" - params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}" - } + params.output_dir_base = "${params.output_dir}/${manifest.name}-${manifest.version}/${params.sample_id.replace(' ', '_')}" + params.log_output_dir = "${params.output_dir_base}/log-${manifest.name}-${manifest.version}-${date}" } set_pipeline_logs = { @@ -68,15 +38,20 @@ methods { } setup = { -// add this file and uncomment if needed -// schema.load_custom_types("${projectDir}/config/custom_schema_types.config") + schema.load_custom_types("${projectDir}/config/custom_schema_types.config") + schema.validate() + methods.set_output_dir() methods.set_resources_allocation() methods.modify_base_allocations() methods.set_pipeline_logs() methods.set_env() - methods.get_ids_from_bams() methods.setup_docker_cpus() methods.setup_process_afterscript() + + json_extractor.store_object_as_json( + params, + new File("${params.log_output_dir}/nextflow-log/params.json") + ) } } diff --git a/config/schema.yaml b/config/schema.yaml new file mode 100644 index 0000000..7d2645b --- /dev/null +++ b/config/schema.yaml @@ -0,0 +1,90 @@ +--- +sample_id: + type: 'String' + required: true + help: 'sample id supplied from input yaml' +save_intermediate_files: + type: 'Bool' + required: true + default: false + help: 'Enable to store intermediate files' +output_dir: + type: 'Path' + mode: 'w' + required: true + help: 'absolute path to directory to store output' +src_fasta_ref: + type: 'Path' + mode: 'r' + required: true + help: 'Source reference sequence (FASTA)' +src_fasta_fai: + type: 'Path' + mode: 'r' + required: true + help: 'Source reference sequence index file' +src_fasta_dict: + type: 'Path' + mode: 'r' + required: true + help: 'Source reference sequence dictionary' +dest_fasta_ref: + type: 'Path' + mode: 'r' + required: true + help: 'Destination reference sequence (FASTA)' +dest_fasta_fai: + type: 'Path' + mode: 'r' + required: true + help: 'Destination reference sequence index file' +dest_fasta_dict: + type: 'Path' + mode: 'r' + required: true + help: 'Destination reference sequence dictionary' +chain_file: + type: 'Path' + mode: 'r' + required: true + help: 'FIXME ???' +repeat_bed: + type: 'Path' + mode: 'r' + required: true + help: 'FIXME ???' 
+funcotator_data: + type: 'FuncotatorDataSource' + required: true + help: 'Funcotator data source and reference sample IDs' + elements: + data_source: + type: 'Path' + mode: 'r' + required: true + help: 'Root data source folder for Funcotator' + src_reference_id: + type: 'String' + mode: 'r' + required: true + help: 'Reference build ID for the source sequence' + dest_reference_id: + type: 'String' + mode: 'r' + required: true + help: 'Reference build ID for the destination sequence' +rf_model: + type: 'Path' + mode: 'r' + required: true + help: 'FIXME ???' +input: + type: 'Namespace' + required: true + help: 'Input sample' + elements: + vcf: + type: 'Path' + mode: 'r' + required: true + help: 'Input dataset supplied by input yaml' diff --git a/config/template.config b/config/template.config index 1002616..06c8741 100644 --- a/config/template.config +++ b/config/template.config @@ -8,18 +8,35 @@ includeConfig "${projectDir}/nextflow.config" // Inputs/parameters of the pipeline params { - dataset_id = '' - blcds_registered_dataset = false // if you want the output to be registered - - variable_name = 'foo-bar' - // input/output locations output_dir = 'where/to/save/outputs/' - // Tool-specific temp dirs here - // Using other directories, like /hot or /tmp, can cause latency and disk space issues - // tool_temp_dir = '/scratch' // Default is scratch - // Add other inputs/parameters here + // Choices: ["Mutect2", "HaplotypeCaller"] + variant_caller = "Mutect2" + + rf_model = "" + + // Reference files + funcotator_data { + data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s" + src_reference_id = "hg19" + dest_reference_id = "hg38" + } + + // The source reference sequence (FASTA). Must correspond with + // params.funcotator_data.src_reference_id + src_fasta_ref = "/hot/ref/reference/GRCh37-EBI-hs37d5/hs37d5.fa" + + // The destination reference sequence (FASTA). Must correspond with + // params.funcotator_data.dest_reference_id + dest_fasta_ref = "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta" + + // The liftover chain file between the source and destination FASTA + // references + chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain" + + // FIXME How to describe this file? + repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed" } // Setup the pipeline config. DO NOT REMOVE THIS LINE! 
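Given the `funcotator_data` values in this template, the data-source check in `config/custom_schema_types.config` resolves to a single glob; a sketch of that construction in plain Groovy, with the values copied from the template above:

```groovy
// The validator joins data_source, a wildcard data-source folder, the
// destination reference ID, and a wildcard config name into one glob
def config_glob = [
    '/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s',
    '*',
    'hg38',
    '*.config'
].join('/')

// Validation fails unless at least one file matches this pattern
assert config_glob == '/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s/*/hg38/*.config'
```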
diff --git a/docker/install-stablelift.R b/docker/install-stablelift.R new file mode 100644 index 0000000..b20a08d --- /dev/null +++ b/docker/install-stablelift.R @@ -0,0 +1,26 @@ +# Install the remotes package to the library +install.packages('remotes', lib = .Library) + +# Make a temporary directory to hold all of the installed packages +localdir <- '/tmp/userlib' +dir.create(localdir) + +dependencies <- c( + 'ROCR' = '1.0-11', + 'argparse' = '2.2.2', + 'caret' = '6.0-94', + 'data.table' = '1.14.8', + 'doParallel' = '1.0.17', + 'foreach' = '1.5.2', + 'ranger' = '0.15.1', + 'vcfR' = '1.14.0' +) + +# Unfortunately, this will install the dependencies multiple times +for (name in names(dependencies)) { + remotes::install_version( + name, + unname(dependencies[name]), + lib = localdir + ) +} diff --git a/input/template-input-BAM.yaml b/input/template-input-BAM.yaml deleted file mode 100644 index 624f9c1..0000000 --- a/input/template-input-BAM.yaml +++ /dev/null @@ -1,9 +0,0 @@ ---- -patient_id: 'patient_id' -input: - normal: - - id: normal_sample_id - path: /path/to/normal.bam - tumor: - - id: tumor_sample_id - path: /path/to/tumor.bam diff --git a/input/template.yaml b/input/template.yaml new file mode 100644 index 0000000..c968d97 --- /dev/null +++ b/input/template.yaml @@ -0,0 +1,4 @@ +--- +sample_id: "" +input: + vcf: /path/to/file.vcf diff --git a/main.nf b/main.nf index bae2734..1e45a85 100644 --- a/main.nf +++ b/main.nf @@ -3,50 +3,77 @@ nextflow.enable.dsl=2 // Include processes and workflows here -include { run_validate_PipeVal } from './external/pipeline-Nextflow-module/modules/PipeVal/validate/main.nf' addParams( +include { run_validate_PipeVal_with_metadata } from './external/pipeline-Nextflow-module/modules/PipeVal/validate/main.nf' addParams( options: [ docker_image_version: params.pipeval_version, main_process: "./" //Save logs in /process-log/run_validate_PipeVal ] ) -include { generate_standard_filename } from './external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' -include { tool_name_command_name } from './module/module-name' addParams( - workflow_output_dir: "${params.output_dir_base}/ToolName-${params.toolname_version}", - workflow_log_output_dir: "${params.log_output_dir}/process-log/ToolName-${params.toolname_version}" - ) + +include { run_liftover_BCFtools } from './module/liftover.nf' +include { run_Funcotator_GATK } from './module/funcotator.nf' +include { workflow_apply_annotations } from './module/annotations.nf' +include { workflow_extract_features} from './module/extract_features.nf' // Log info here log.info """\ - ====================================== - T E M P L A T E - N F P I P E L I N E - ====================================== - Boutros Lab - - Current Configuration: - - pipeline: - name: ${workflow.manifest.name} - version: ${workflow.manifest.version} - - - input: - input a: ${params.variable_name} - ... - - - output: - output a: ${params.output_path} - ... - - - options: - option a: ${params.option_name} - ... - - Tools Used: - tool a: ${params.docker_image_name} - - ------------------------------------ - Starting workflow... 
- ------------------------------------ - """ - .stripIndent() + ===================================== + S T A B L E L I F T P I P E L I N E + ===================================== + Boutros Lab + + Current Configuration: + - pipeline: + name: ${workflow.manifest.name} + version: ${workflow.manifest.version} + + - input: + dataset_id: ${params.dataset_id} + variant_caller: ${params.variant_caller} + rf_model: ${params.rf_model} + + src_fasta_ref: ${params.src_fasta_ref} + src_fasta_fai: ${params.src_fasta_fai} + src_fasta_dict: ${params.src_fasta_dict} + + dest_fasta_ref: ${params.dest_fasta_ref} + dest_fasta_fai: ${params.dest_fasta_fai} + dest_fasta_dict: ${params.dest_fasta_dict} + + chain_file: ${params.chain_file} + repeat_bed: ${params.repeat_bed} + + funcotator_data: + data_source: ${params.funcotator_data.data_source} + src_reference_id: ${params.funcotator_data.src_reference_id} + dest_reference_id: ${params.funcotator_data.dest_reference_id} + + - output: + output_dir_base: ${params.output_dir_base} + + - options: + blcds_registered_dataset: ${params.blcds_registered_dataset} + ucla_cds: ${params.ucla_cds} + + min_cpus: ${params.min_cpus} + max_cpus: ${params.max_cpus} + + min_memory: ${params.min_memory} + max_memory: ${params.max_memory} + + Tools Used: + BCFtools: ${params.docker_image_bcftools} + BEDtools: ${params.docker_image_bedtools} + PipeVal: ${params.docker_image_pipeval} + SAMTools: ${params.docker_image_samtools} + StableLift: ${params.docker_image_stablelift} + GATK: ${params.docker_image_gatk} + + ------------------------------------ + Starting workflow... + ------------------------------------ + """ + .stripIndent() def indexFile(bam_or_vcf) { if(bam_or_vcf.endsWith('.bam')) { @@ -60,54 +87,80 @@ def indexFile(bam_or_vcf) { } } -// Channels here -Channel - .fromList(params.samples_to_process) - .map { sample -> - return tuple(sample.orig_id, sample.id, sample.path, sample.sample_type) - } - .set { samplesToProcessChannel } - -Channel - .fromList(params.samples_to_process) - .map{ it -> [it['path'], indexFile(it['path'])] } - .flatten() - .set { files_to_validate_ch } - -// Possible reference channel Channel - .from( - params.reference, - params.reference_index, - params.reference_dict - ) - .set { reference_ch } + .value( [ + params.funcotator_data.src_reference_id, + params.src_fasta_ref, + params.src_fasta_fai, + params.src_fasta_dict, + ] ) + .set { input_ch_src_sequence } -// Decription of input channel Channel - .fromPath(params.variable_name) - .ifEmpty { error "Cannot find: ${params.variable_name}" } - .set { input_ch_variable_name } - -files_to_validate_ch = files_to_validate_ch - .mix(reference_ch) - .mix(input_ch_variable_name) + .value( [ + params.funcotator_data.dest_reference_id, + params.dest_fasta_ref, + params.dest_fasta_fai, + params.dest_fasta_dict + ] ) + .set { input_ch_dest_sequence } // Main workflow here workflow { - // Validation process - run_validate_PipeVal( - files_to_validate_ch - ) - run_validate_PipeVal.out.val_file.collectFile( - name: 'input_validation.txt', newLine: true, - storeDir: "${params.output_dir_base}/validation" + // Currently this is written for a single sample_id and VCF file, but + // abstract that away + Channel.of ([ + vcf: params.input.vcf, + index: indexFile(params.input.vcf), + sample_id: params.sample_id + ]).set { vcf_with_index } + + // The values of vcf_with_index are maps with keys vcf, index, and sample_id. 
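+ // Editorial sketch: with a hypothetical sample_id of "S1", each value is a map like
+ //   [vcf: '/path/S1.vcf.gz', index: '/path/S1.vcf.gz.tbi', sample_id: 'S1']
+ // The flatMap below fans that map out into per-file (path, metadata) tuples for
+ // PipeVal, and the later groupTuple/inject step folds the validated files back
+ // into a map of the same shape.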
+ + // Run the input VCF and TBI files through PipeVal + vcf_with_index + .flatMap { sample -> + [ + [sample.vcf, [[sample_id: sample.sample_id], "vcf"]], + [sample.index, [[sample_id: sample.sample_id], "index"]] + ] + } | run_validate_PipeVal_with_metadata + + // Save the validation result + run_validate_PipeVal_with_metadata.out.validation_result + .collectFile( + name: 'input_validation.txt', + newLine: true, + storeDir: "${params.output_dir_base}/validation" ) - // Workflow or process - tool_name_command_name( - samplesToProcessChannel, - input_ch_variable_name - ) + run_validate_PipeVal_with_metadata.out.validated_file + .map { filename, metadata -> [metadata[0].sample_id, metadata[0] + [(metadata[1]): filename]] } + .groupTuple() + .map { it[1].inject([:]) { result, i -> result + i } } + .set { validated_vcf_with_index } + + // The values of validated_vcf_with_index are maps with keys vcf, index, and sample_id. + run_liftover_BCFtools( + validated_vcf_with_index.map { [it.sample_id, it.vcf, it.index] }, + input_ch_src_sequence, + input_ch_dest_sequence, + Channel.value(params.chain_file) + ) + + run_Funcotator_GATK( + run_liftover_BCFtools.out.liftover_vcf_with_index, + input_ch_dest_sequence, + Channel.value(params.funcotator_data.data_source) + ) + + workflow_apply_annotations( + run_Funcotator_GATK.out.funcotator_vcf, + input_ch_dest_sequence + ) + + workflow_extract_features( + workflow_apply_annotations.out.annotated_vcf + ) } diff --git a/metadata.yaml b/metadata.yaml index 8b58496..5150903 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -6,3 +6,4 @@ languages: ["Nextflow", "Docker"] dependencies: ["Java", "Nextflow", "Docker"] references: "https://uclahs-cds.atlassian.net/wiki/spaces/BOUTROSLAB/pages/3189620/Guide+to+Nextflow" tools: ["tool_a:v1.1.0", "tool_b:v1.1.0"] +image_name: "stablelift" diff --git a/module/annotations.nf b/module/annotations.nf new file mode 100644 index 0000000..2295c79 --- /dev/null +++ b/module/annotations.nf @@ -0,0 +1,176 @@ +process annotate_RepeatMasker_BCFtools { + container params.docker_image_bcftools + + publishDir path: "${intermediate_filepath}", + pattern: "output.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.vcf.gz" } + + input: + tuple val(sample_id), path(vcf, stageAs: 'inputs/*') + path(repeat_bed) + + output: + tuple val(sample_id), path('output.vcf.gz'), emit: repeatmasker_vcf + + script: + intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" + + slug = "RepeatMasker-${sample_id}" + + """ + bcftools annotate \ + -a "${repeat_bed}" \ + -c CHROM,FROM,TO,REPEAT \ + -h <(echo '##INFO=') \ + -o "output.vcf.gz" \ + "${vcf}" + """ +} + +process extract_TrinucleotideContext_BEDTools { + container params.docker_image_bedtools + + publishDir path: "${intermediate_filepath}", + pattern: "output.bed", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.bed" } + + publishDir path: "${intermediate_filepath}", + pattern: "output.tsv", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}-full.tsv" } + + input: + tuple val(sample_id), path(vcf) + tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) + + output: + tuple val(sample_id), path('output.tsv'), emit: trinucleotide_tsv + tuple val(sample_id), path('output.bed'), emit: trinucleotide_bed + + script: + intermediate_filepath = 
"${params.output_dir_base}/BEDtools-${params.bedtools_version}/intermediate/${task.process}" + + slug = "Trinucleotide-${sample_id}" + + """ + zcat ${vcf} \ + | grep -v "^#" \ + | awk '{if (length(\$4) == 1 && length(\$5) == 1) print \$1"\t"(\$2-2)"\t"(\$2+1)"\t"\$2}' \ + > "output.bed" + + bedtools getfasta \ + -fi ${dest_fasta_ref} \ + -bed "output.bed" \ + -fo "partial.tsv" \ + -name \ + -tab + + paste -d '\t' \ + <(cut -f 1,4 "output.bed") \ + <(cut -f 2 "partial.tsv") \ + > "output.tsv" + """ +} + +process compress_and_index_HTSlib { + container params.docker_image_samtools + + publishDir path: "${intermediate_filepath}", + pattern: "output.tsv.gz{,.tbi}", + mode: "copy", + enabled: params.save_intermediate_files + + input: + tuple val(sample_id), path(tsv) + + output: + tuple val(sample_id), path('output.tsv.gz'), path('output.tsv.gz.tbi'), emit: compressed_tsv_with_index + + script: + intermediate_filepath = "${params.output_dir_base}/SAMtools-${params.samtools_version}/intermediate/${task.process}" + + slug = "Trinucleotide-${sample_id}" + + """ + bgzip ${tsv} --output output.tsv.gz + + tabix \ + --sequence 1 \ + --begin 2 \ + --end 2 \ + output.tsv.gz + """ +} + + +process annotate_trinucleotide_BCFtools { + container params.docker_image_bcftools + + publishDir path: "${intermediate_filepath}", + pattern: "output.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.vcf.gz" } + + input: + tuple val(sample_id), + path(vcf, stageAs: 'inputs/*'), + path(tsv, stageAs: 'inputs/*'), + path(tsv_tbi, stageAs: 'inputs/*') + + output: + tuple val(sample_id), path('output.vcf.gz'), emit: trinucleotide_vcf + + script: + intermediate_filepath = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" + + slug = "Trinucleotide-${sample_id}" + + """ + bcftools annotate \ + --annotations ${tsv} \ + --columns CHROM,POS,TRINUCLEOTIDE \ + --header-lines <(echo '##INFO=') \ + --output output.vcf.gz \ + ${vcf} + """ +} + + + +workflow workflow_apply_annotations { + take: + vcf_with_sample_id + dest_fasta_data + + main: + annotate_RepeatMasker_BCFtools( + vcf_with_sample_id, + Channel.value(params.repeat_bed) + ) + + extract_TrinucleotideContext_BEDTools( + annotate_RepeatMasker_BCFtools.out.repeatmasker_vcf, + dest_fasta_data + ) + + compress_and_index_HTSlib( + extract_TrinucleotideContext_BEDTools.out.trinucleotide_tsv + ) + + annotate_trinucleotide_BCFtools( + annotate_RepeatMasker_BCFtools.out.repeatmasker_vcf.join( + compress_and_index_HTSlib.out.compressed_tsv_with_index, + failOnDuplicate: true, + failOnMismatch: true + ) + ) + + emit: + annotated_vcf = annotate_trinucleotide_BCFtools.out.trinucleotide_vcf +} diff --git a/module/extract_features.nf b/module/extract_features.nf new file mode 100644 index 0000000..d5b385f --- /dev/null +++ b/module/extract_features.nf @@ -0,0 +1,144 @@ +include { compress_and_index_HTSlib } from './annotations.nf' + +process extract_VCF_features_StableLift { + container params.docker_image_stablelift + containerOptions "-v ${moduleDir}:${moduleDir}" + + publishDir path: "${intermediate_filepath}", + pattern: "features.Rds", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.${file(it).getExtension()}" } + + input: + tuple val(sample_id), path(vcf) + + output: + tuple val(sample_id), path('features.Rds'), emit: r_annotations + + script: + intermediate_filepath = "${params.output_dir_base}/stablelift-${params.stablelift_version}/intermediate/${task.process}" + + slug 
= "stablelift-${sample_id}" + + """ + Rscript "${moduleDir}/scripts/extract-vcf-features.R" \ + --input-vcf "${vcf}" \ + --variant-caller ${params.variant_caller} \ + --output-rds "features.Rds" + """ +} + +process predict_stability_StableLift { + container params.docker_image_stablelift + containerOptions "-v ${moduleDir}:${moduleDir}" + + publishDir path: "${params.output_dir_base}/output", + pattern: "${output_file_name}", + mode: "copy" + + input: + tuple val(sample_id), path(features_rds) + path(rf_model) + + output: + tuple val(sample_id), path(output_file_name), emit: stability_tsv + + script: + output_file_name = "stablelift-${sample_id}.tsv" + + """ + Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \ + --features-dt "${features_rds}" \ + --rf-model "${rf_model}" \ + --variant-caller "${params.variant_caller}" \ + --output-tsv "${output_file_name}" + """ +} + +process run_apply_stability_annotations { + container params.docker_image_bcftools + + publishDir path: "${params.output_dir_base}/output", + pattern: "*.vcf.gz{,.tbi}", + mode: "copy" + + input: + tuple val(sample_id), + path(annotated_vcf, stageAs: 'inputs/*'), + // FIXME Should there be an annotated_vcf_tbi? + path(stability_tsv, stageAs: 'inputs/*'), + path(stability_tsv_tbi, stageAs: 'inputs/*') + + output: + tuple val(sample_id), + path(stability_vcf), + path(stability_vcf_tbi), + emit: stability_vcf_with_index + tuple val(sample_id), + path(filtered_vcf), + path(filtered_vcf_tbi), + emit: filtered_vcf_with_index + + script: + slug = "${sample_id}_LiftOver" + + stability_vcf = "${slug}_stability.vcf.gz" + stability_vcf_tbi = "${stability_vcf}.tbi" + + filtered_vcf = "${slug}_filtered.vcf.gz" + filtered_vcf_tbi = "${filtered_vcf}.tbi" + + """ + bcftools annotate \ + -a "${stability_tsv}" \ + -c CHROM,POS,STABILITY_SCORE,STABILITY \ + -h <(echo '##INFO= +##INFO=') \ + -o "${stability_vcf}" \ + "${annotated_vcf}" + + bcftools index -t "${stability_vcf}" + + bcftools filter \ + -i 'INFO/STABILITY="STABLE"' \ + -o "${filtered_vcf}" \ + "${stability_vcf}" + + bcftools index -t "${filtered_vcf}" + """ +} + +workflow workflow_extract_features { + take: + vcf_with_sample_id + + main: + if (params.variant_caller == "HaplotypeCaller") { + error "HaplotypeCaller is not supported yet" + } else { + extract_VCF_features_StableLift(vcf_with_sample_id) + extract_VCF_features_StableLift.out.r_annotations.set { ch_annotations } + } + + predict_stability_StableLift( + ch_annotations, + Channel.value(params.rf_model) + ) + + compress_and_index_HTSlib( + predict_stability_StableLift.out.stability_tsv + ) + + run_apply_stability_annotations( + vcf_with_sample_id.join( + compress_and_index_HTSlib.out.compressed_tsv_with_index, + failOnDuplicate: true, + failOnMismatch: true + ) + ) + + emit: + stability_vcf = run_apply_stability_annotations.out.stability_vcf_with_index + filtered_vcf = run_apply_stability_annotations.out.filtered_vcf_with_index +} diff --git a/module/funcotator.nf b/module/funcotator.nf new file mode 100644 index 0000000..1917ae0 --- /dev/null +++ b/module/funcotator.nf @@ -0,0 +1,44 @@ +/* +* Module/process description here +* +* @input +* @params +* @output +*/ + +include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +process run_Funcotator_GATK { + container params.docker_image_gatk + + publishDir path: "${intermediate_filepath}", + pattern: "output.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { 
"${slug}.vcf.gz" } + + input: + tuple val(sample_id), + path(vcf, stageAs: 'inputs/*'), + path(index, stageAs: 'inputs/*') + tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) + path (funcotator_sources) + + output: + tuple val(sample_id), path('output.vcf.gz'), emit: funcotator_vcf + + script: + intermediate_filepath = "${params.output_dir_base}/GATK-${params.gatk_version}/intermediate/${task.process}" + + slug = "Funcotator-${sample_id}-${dest_fasta_id}" + + """ + gatk Funcotator \ + --variant "${vcf}" \ + --reference "${dest_fasta_ref}" \ + --ref-version "${dest_fasta_id}" \ + --data-sources-path "${funcotator_sources}" \ + --output-file-format VCF \ + --output "output.vcf.gz" + """ +} diff --git a/module/liftover.nf b/module/liftover.nf new file mode 100644 index 0000000..5e8f09c --- /dev/null +++ b/module/liftover.nf @@ -0,0 +1,65 @@ +/* +* Module/process description here +* +* @input +* @params +* @output +*/ + +// include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +process run_liftover_BCFtools { + container params.docker_image_bcftools + + publishDir path: "${intermediate_path}", + pattern: "reject.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}-reject.vcf.gz" } + + publishDir path: "${intermediate_path}", + pattern: "liftover.vcf.gz", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.vcf.gz" } + + publishDir path: "${intermediate_path}", + pattern: "liftover.vcf.gz.tbi", + mode: "copy", + enabled: params.save_intermediate_files, + saveAs: { "${slug}.vcf.gz.tbi" } + + input: + tuple val(sample_id), path(vcf), path(index) + tuple val(src_fasta_id), path(src_fasta_ref), path(src_fasta_fai), path(src_fasta_dict) + tuple val(dest_fasta_id), path(dest_fasta_ref), path(dest_fasta_fai), path(dest_fasta_dict) + path (chain_file) + + output: + tuple val(sample_id), path('liftover.vcf.gz'), path('liftover.vcf.gz.tbi'), emit: liftover_vcf_with_index + + script: + // FIXME Use a more standard path + intermediate_path = "${params.output_dir_base}/BCFtools-${params.bcftools_version}/intermediate/${task.process}" + + slug = "LiftOver-${sample_id}-${src_fasta_id}-to-${dest_fasta_id}" + + """ + bcftools +liftover \ + --output-type u \ + "${vcf}" \ + -- \ + --src-fasta-ref "${src_fasta_ref}" \ + --fasta-ref "${dest_fasta_ref}" \ + --chain "${chain_file}" \ + --reject-type z \ + --reject "reject.vcf.gz" | \ + bcftools view \ + --output-type u \ + -e 'REF="." 
| ALT="."' | \ + bcftools sort \ + --output-type z \ + --write-index=tbi \ + --output "liftover.vcf.gz" + """ +} diff --git a/module/module-name.nf b/module/module-name.nf deleted file mode 100644 index 33d2ea4..0000000 --- a/module/module-name.nf +++ /dev/null @@ -1,60 +0,0 @@ -/* -* Module/process description here -* -* @input -* @params -* @output -*/ - -include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' - -process tool_name_command_name { - container params.docker_image_name - - // if setting resources via label (see base.config), set label here - label "resource_allocation_tool_name_command_name" - - // remove task.index extension if not needed - publishDir path: "${params.workflow_output_dir}/output", - pattern: "", - mode: "copy", - enabled: true - - // Process logs (the `.command.*` files) will be automatically saved as - // `tool_name_command_name/log.command.*` due to - // methods.setup_process_afterscript(). The folder can be customized - // per-process to add a suffix distinguishing multiple runs of the same - // process: - - // ext log_dir_suffix: { "-${variable_name}" } - - // Additional directives here - - input: - tuple val(orig_id), val(id), path(path), val(sample_type) - val(variable_name) - - output: - path("${variable_name}.command_name.file_extension"), emit: output_tag - - script: - output_filename = generate_standard_filename("Samtools-${params.samtools_version}", - params.dataset_id, - id, - [:]) - - """ - # make sure to specify pipefail to make sure process correctly fails on error - set -euo pipefail - - # the script should ideally only have call to a single tool - # to make the command more human readable: - # - seperate components of the call out on different lines - # - when possible be explict with command options, spelling out their long names - tool_name \ - command_name \ - --option_1_long_name ${id} \ - --input ${path} \ - --output ${variable_name}.command_name.file_extension - """ -} diff --git a/module/scripts/extract-vcf-features.R b/module/scripts/extract-vcf-features.R new file mode 100644 index 0000000..5aa3d75 --- /dev/null +++ b/module/scripts/extract-vcf-features.R @@ -0,0 +1,265 @@ +#!/usr/bin/env Rscript +# extract-vcf-features.R +#################################################################################################### +# +# Extract features from vcf +# Extract Funcotator annotations if present +# Annotate with RepeatMasker regions if intersect file is provided +# +#################################################################################################### + +suppressPackageStartupMessages({ + library(vcfR); + library(data.table); + library(argparse); + library(doParallel); + library(foreach); + }); + +################################################################################################### +# Input +################################################################################################### +# Define command line arguments +parser <- ArgumentParser(); +parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 vcf lifted to GRCh38 for feature extraction'); +parser$add_argument('--input-dir', type = 'character', help = 'Directory with vcf subsets'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); +parser$add_argument('--variant-caller', type = 'character', help = ''); +parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for parallelizing 
features extraction', default = 1); +args <- parser$parse_args(); + +# Save command line arguments +for (arg in names(args)) { + assign(gsub('_', '.', arg), args[[arg]]); + } + +# Set parameters for interactive runs +if (interactive()) { + variant.caller <- 'Strelka2'; + input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSNP/stableLift/validate_TCGA-SARC_WXS/TCGA-SARC_WXS_HaplotypeCaller_LiftOver-GRCh38_annotated_exome.vcf.gz'; + vcf.subset <- input.vcf; + } + +if (!is.null(input.dir)) { + vcf.subsets <- list.files(input.dir, full.names = TRUE, pattern = '(\\.vcf.gz|\\.vcf)$'); + output.path <- output.rds; +} else if (!is.null(input.vcf)) { + vcf.subsets <- input.vcf; + output.path <- ifelse(is.null(output.rds), sub('\\..*$', '.Rds', input.vcf), output.rds); +} else { + stop('Either `--input-vcf` or `--input-dir` must be provided'); + } + +################################################################################################### +# Functions +################################################################################################### +vcf.info.to.dt <- function(vcf.info) { + # Split each string by semicolon and convert to a list of key-value pairs + vcf.info <- strsplit(vcf.info, ';'); + vcf.info <- lapply(vcf.info, function(x) { + x <- strsplit(x, '='); + as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1))); + }) + + # Combine the list of key-value pairs into a data table + rbindlist(vcf.info, fill = TRUE); + } + +sum.MAF <- function(x) { + x <- unlist(strsplit(x, ',')); + x <- 1 - as.numeric(x[1]); + if (is.na(x)) 0 else x; + } + +################################################################################################### +# Extract RF features from Funcotator vcf +################################################################################################### +start.extract <- Sys.time(); + +registerDoParallel(cores = ncore); +features.dt.subsets <- foreach(vcf.subset = vcf.subsets) %dopar% { + start.subset <- Sys.time(); + cat('Processing:', basename(vcf.subset), '\n'); + + input.vcf <- read.vcfR( + file = vcf.subset + ); + + # Parse info column + info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); + + # Aggregate per sample metrics, average depth per GT called + info$DP <- apply(extract.gt(input.vcf, element = 'DP', as.numeric = TRUE), 1, mean, na.rm = TRUE); + if (variant.caller == 'HaplotypeCaller') { + # Take mean of genotype quality and remove cohort allele frequency + info$GQ <- apply(extract.gt(input.vcf, element = 'GQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info[, AF := NULL]; + } else if (variant.caller == 'Mutect2') { + # Get metric for ALT allele + info$MBQ <- sapply(extract.info(input.vcf, element = 'MBQ'), function(x) as.numeric(unlist(strsplit(x, ','))[2])); + info$MFRL <- sapply(extract.info(input.vcf, element = 'MFRL'), function(x) as.numeric(unlist(strsplit(x, ','))[2])); + info$MMQ <- sapply(extract.info(input.vcf, element = 'MMQ'), function(x) as.numeric(unlist(strsplit(x, ','))[2])); + # Take max of somatic variant allele frequencies + info$AF <- apply(extract.gt(input.vcf, element = 'AF', as.numeric = TRUE), 1, max, na.rm = TRUE); + } else if (variant.caller == 'Muse2') { + # Calculate VAF from allelic depths + info$AF <- apply(extract.gt(input.vcf, element = 'AD'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2]) / (as.numeric(y[1]) + as.numeric(y[2]))), na.rm = TRUE)); + info$BQ <- apply(extract.gt(input.vcf, element = 'BQ'), 1, function(x) mean(sapply(strsplit(x, ','), 
function(y) as.numeric(y[2])), na.rm = TRUE)); + } else if (variant.caller == 'Strelka2') { + # Calculate VAF from allelic depths + info[input.vcf@fix[, 'REF'] == 'A', REFCOUNTS := apply(extract.gt(input.vcf, element = 'AU')[input.vcf@fix[, 'REF'] == 'A', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'ALT'] == 'A', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'AU')[input.vcf@fix[, 'ALT'] == 'A', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'REF'] == 'T', REFCOUNTS := apply(extract.gt(input.vcf, element = 'TU')[input.vcf@fix[, 'REF'] == 'T', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'ALT'] == 'T', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'TU')[input.vcf@fix[, 'ALT'] == 'T', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'REF'] == 'C', REFCOUNTS := apply(extract.gt(input.vcf, element = 'CU')[input.vcf@fix[, 'REF'] == 'C', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'ALT'] == 'C', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'CU')[input.vcf@fix[, 'ALT'] == 'C', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'REF'] == 'G', REFCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'REF'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[input.vcf@fix[, 'ALT'] == 'G', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'ALT'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; + info[, AF := ALTCOUNTS / (REFCOUNTS + ALTCOUNTS)]; + } + + # Get funcotation fields + funcotator.fields <- grep('ID=FUNCOTATION', input.vcf@meta, value = TRUE); + funcotator.fields <- sub('.*Funcotation fields are: (.*)\\".*', '\\1', funcotator.fields); + funcotator.fields <- strsplit(funcotator.fields, split = '\\|')[[1]]; + + # Parse funcotation field from info column + funcotation <- sapply(info$FUNCOTATION, function(x) unlist(strsplit(x, split = ','))[1]); + names(funcotation) <- NULL; + funcotation <- gsub('\\[|\\]', '', funcotation); + funcotation <- lapply(funcotation, function(x) { + x <- unlist(strsplit(x, split = '\\|')); + x <- as.list(x[seq_along(funcotator.fields)]); + names(x) <- funcotator.fields; + return(x); + }); + features.dt <- rbindlist(funcotation, use.names = TRUE, fill = TRUE); + rm(funcotation); + + # names(features.dt) <- funcotator.fields; + funcotator.fields.keep <- c( + 'GC Content' = 'Gencode_34_gcContent', + 'GENCODE Variant Classification' = 'Gencode_34_variantClassification', + 'GENCODE Variant Type' = 'Gencode_34_variantType', + 'HGNC Locus Group' = 'HGNC_Locus_Group', + '1KG AF' = 'dbSNP_CAF', + 'TOPMED AF' = 'dbSNP_TOPMED' + ); + features.dt <- features.dt[, ..funcotator.fields.keep]; + + # Replace ascii character codes + features.dt <- as.data.table(lapply(features.dt, function(x) gsub('_%7C_', ';', x, fixed = TRUE))); + features.dt <- as.data.table(lapply(features.dt, function(x) gsub('_%2C_', ',', x, fixed = TRUE))); + features.dt <- as.data.table(lapply(features.dt, function(x) gsub('_%20_', ' ', x, fixed = TRUE))); + features.dt <- as.data.table(lapply(features.dt, function(x) 
gsub('_%3D_', '=', x, fixed = TRUE))); + + # Aggregate minor allele frequencies + features.dt[, dbSNP_CAF := sapply(dbSNP_CAF, sum.MAF)]; + features.dt[, dbSNP_TOPMED := sapply(dbSNP_TOPMED, sum.MAF)]; + + # Aggregate flattened fields + features.dt <- cbind(input.vcf@fix[, 1:7], info[, -c('FUNCOTATION')], features.dt); + + # Collapse trinucleotide reverse complements + features.dt[Gencode_34_variantType != 'SNP', TRINUCLEOTIDE := '']; + features.dt[Gencode_34_variantType == 'SNP', TRINUCLEOTIDE := sapply(TRINUCLEOTIDE, function(x) { + x <- as.character(x); + if (substr(x, 2, 2) %in% c('A', 'G')) { + x <- chartr('ATGC', 'TACG', x); + paste(rev(unlist(strsplit(x, ''))), collapse = ''); + } else { + x; + } + })]; + features.dt[Gencode_34_variantType == 'SNP', TRINUCLEOTIDE_SEQ := TRINUCLEOTIDE]; + # Add substitution base + features.dt[Gencode_34_variantType == 'SNP', ALT := sapply(ALT, function(x) unlist(strsplit(x, ','))[1])]; + features.dt[Gencode_34_variantType == 'SNP' & REF %in% c('A', 'G'), ALT := chartr('ATGC', 'TACG', ALT)]; + features.dt[Gencode_34_variantType == 'SNP' & TRINUCLEOTIDE != '', TRINUCLEOTIDE := paste0(TRINUCLEOTIDE, '->', substr(TRINUCLEOTIDE, 1, 1), ALT, substr(TRINUCLEOTIDE, 3, 3))]; + + cat(format(Sys.time() - start.subset, nsmall = 2), '\n'); + return(features.dt); + } + +features.dt <- rbindlist(features.dt.subsets, use.names = TRUE, fill = TRUE); +rm(features.dt.subsets); +gc(); +cat(format(Sys.time() - start.extract, nsmall = 2), '\n'); + +################################################################################################### +# Format features for RF +################################################################################################### +continuous.features <- c( + 'Chromosome Position (POS)' = 'POS', + 'Local GC Content' = 'Gencode_34_gcContent', # + '1000 Genomes Allele Frequency' = 'dbSNP_CAF', # + 'TOPMED Allele Frequency' = 'dbSNP_TOPMED', # + 'Sequencing Depth (DP)' = 'DP', + 'Variant Frequency (AF)' = 'AF' + ); +if (variant.caller == 'HaplotypeCaller') continuous.features <- c(continuous.features, + 'Variant Quality (QUAL)' = 'QUAL', + 'ExcessHet' = 'ExcessHet', + "Fisher's Strand Bias (FS)" = 'FS', + 'Mapping Quality (MQ)' = 'MQ', + 'Quality by Depth (QD)' = 'QD', + 'Strand Bias Odds Ratio (SOR)' = 'SOR', + 'VQSR Log Odds (VQSLOD)' = 'VQSLOD', + 'Genotype Quality (GQ)' = 'GQ' + ); +if (variant.caller == 'Mutect2') continuous.features <- c(continuous.features, + 'Tumor Variant Log Odds (TLOD)' = 'TLOD', + 'Events in Haplotype (ECNT)' = 'ECNT', + 'Germline Quality (GERMQ)' = 'GERMQ', + 'Median Distance to Read End (MPOS)' = 'MPOS', + 'Normal Artifact Log Odds (NALOD)' = 'NALOD', + 'Normal Variant Log Odds (NLOD)' = 'NLOD', + 'Read Orientation Artifact (ROQ)' = 'ROQ', + 'Median Base Quality (MBQ)' = 'MBQ', + 'Median Fragment Length (MFRL)' = 'MFRL', + 'Median Mapping Quality (MMQ)' = 'MMQ' + ); +if (variant.caller == 'Strelka2') continuous.features <- c(continuous.features, + 'Mapping Quality (MQ)' = 'MQ', + 'MQ0 Reads (MQ0)' = 'MQ0', + 'Empirical Variant Score (SomaticEVS)' = 'SomaticEVS', + 'Quality Score Somatic (QSS)' = 'QSS', + 'Quality Score Joint (QSS_NT)' = 'QSS_NT', + 'ReadPosRankSum' = 'ReadPosRankSum' + ); +if (variant.caller == 'Muse2') continuous.features <- c(continuous.features, + 'Variant Base Quality (BQ)' = 'BQ' + ); + +categorical.features <- c( + 'Chromosome (CHR)' = 'CHROM', + 'GENCODE Variant Type' = 'Gencode_34_variantType', # + 'GENCODE Variant Classification' = 'Gencode_34_variantClassification', # + 'HGNC Locus Group' =
+
+categorical.features <- c(
+    'Chromosome (CHR)' = 'CHROM',
+    'GENCODE Variant Type' = 'Gencode_34_variantType', #
+    'GENCODE Variant Classification' = 'Gencode_34_variantClassification', #
+    'HGNC Locus Group' = 'HGNC_Locus_Group', #
+    'Repeat Class' = 'REPEAT', ##
+    'Trinucleotide Substitution' = 'TRINUCLEOTIDE', ##
+    'Trinucleotide Context' = 'TRINUCLEOTIDE_SEQ', ##
+    'VQSR Culprit' = 'culprit', # HaplotypeCaller
+    'Somatic Genotype (SGT)' = 'SGT' # Strelka2
+    );
+
+# Extract features and format
+continuous.features <- continuous.features[continuous.features %in% names(features.dt)];
+categorical.features <- categorical.features[categorical.features %in% names(features.dt)];
+all.features <- c(continuous.features, categorical.features);
+
+features.dt <- features.dt[, ..all.features];
+features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features];
+features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features];
+features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features];
+names(features.dt) <- make.names(names(features.dt));
+
+# Remove rows with NA
+features.dt.rows <- nrow(features.dt);
+features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ];
+cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n');
+
+###################################################################################################
+# Save features.dt for input to RF
+###################################################################################################
+saveRDS(features.dt, output.path);
diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-liftover-stability.R
new file mode 100644
index 0000000..12892b9
--- /dev/null
+++ b/module/scripts/predict-liftover-stability.R
@@ -0,0 +1,129 @@
+#!/usr/bin/env Rscript
+# predict-liftover-stability.R
+####################################################################################################
+#
+# Apply random forest model to predict variant LiftOver stability
+# Validate results and plot model performance with discordance file
+#
+####################################################################################################
+
+suppressPackageStartupMessages({
+    library(caret);
+    library(ranger);
+    library(argparse);
+    library(ROCR);
+    library(data.table);
+    });
+
+###################################################################################################
+# Set parameters
+###################################################################################################
+# Define command line arguments
+parser <- ArgumentParser();
+parser$add_argument('--variant-caller', type = 'character');
+parser$add_argument('--features-dt', type = 'character');
+parser$add_argument('--rf-model', type = 'character');
+parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity, overrides `--threshold`');
+parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold', default = 0.5);
+parser$add_argument('--output-tsv', type = 'character', help = 'TSV output file');
+args <- parser$parse_args();
+
+# Save command line arguments
+for (arg in names(args)) {
+    assign(gsub('_', '.', arg), args[[arg]]);
+    }
+
+###################################################################################################
+# Load data
+###################################################################################################
+features.dt.path <- features.dt;
+features.dt <- readRDS(features.dt.path);
+rf.model.path <- rf.model;
+rf.model <- readRDS(rf.model.path);
+
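+# Note: each command line argument arrives as a path string bound to the same
+# name as the object it becomes; the *.path copies above keep the filenames
+# available for logging (e.g. basename(rf.model.path) below) after the
+# variables are rebound to the loaded objects.
+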
+####################################################################################################
+# Feature engineering
+####################################################################################################
+if (variant.caller == 'HaplotypeCaller') {
+    normalize.features <- c('FS', 'VQSLOD', 'QD', 'SOR');
+    features.dt[, (normalize.features) := lapply(.SD, scale), .SDcols = normalize.features];
+    features.dt[, c('QUAL', 'GQ', 'DP') := NULL];
+} else if (variant.caller == 'Mutect2') {
+    features.dt[, c('TRINUCLEOTIDE') := NULL];
+} else if (variant.caller == 'Muse2') {
+    features.dt[, c('TRINUCLEOTIDE', 'Gencode_34_variantType') := NULL];
+} else if (variant.caller == 'Strelka2') {
+    features.dt[, c('TRINUCLEOTIDE_SEQ', 'DP', 'Gencode_34_variantType') := NULL];
+} else if (variant.caller == 'SomaticSniper') {
+    features.dt[, c('DP', 'BQ', 'GQ', 'MQ', 'SSC', 'Gencode_34_variantType', 'TRINUCLEOTIDE', 'TRINUCLEOTIDE_SEQ', 'Gencode_34_variantClassification', 'Gencode_34_gcContent', 'dbSNP_CAF') := NULL];
+} else if (variant.caller == 'Delly2') {
+    normalize.features <- c('SR', 'SRQ', 'DV');
+    features.dt[, (normalize.features) := lapply(.SD, scale), .SDcols = normalize.features];
+    }
+
+cat('Input data dimensions:\n');
+print(dim(features.dt));
+
+###################################################################################################
+# Apply random forest model
+###################################################################################################
+cat('\nPredicting liftover stability with', basename(rf.model.path), '\n');
+stability <- predict(rf.model, data = features.dt);
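+
+# A ranger probability forest returns $predictions as an n-by-class matrix;
+# column 1 is assumed here to hold the stability (concordant-class) score,
+# which is compared against the threshold derived below to flag UNSTABLE
+# variants.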
+
+# if (!is.null(specificity) && is.numeric(specificity)) {
+#     cat('Target specificity =', specificity, '\n');
+#     operating.index <- max(which(unlist(rf.model$performance@x.values) < 1 - specificity));
+#     sensitivity <- unlist(rf.model$performance@y.values)[operating.index];
+#     cat('Projected sensitivity =', round(sensitivity, 3), '\n');
+#     threshold <- 1 - unlist(rf.model$performance@alpha.values)[operating.index];
+#     cat('Stability score threshold =', round(threshold, 3), '\n');
+# } else if (!is.null(threshold) && is.numeric(threshold)) {
+#     cat('Target threshold =', threshold, '\n');
+#     operating.index <- min(which(unlist(rf.model$performance@alpha.values) <= 1 - threshold));
+#     specificity <- 1 - unlist(rf.model$performance@x.values)[operating.index];
+#     sensitivity <- unlist(rf.model$performance@y.values)[operating.index];
+#     cat('Projected specificity =', round(specificity, 3), '\n');
+#     cat('Projected sensitivity =', round(sensitivity, 3), '\n');
+# } else {
+#     performance.acc <- performance(prediction$train, measure = 'f'); # F1-score
+#     index <- which.max(unlist(performance.acc@y.values));
+#     cutoff <- unlist(performance.acc@x.values)[index];
+#     metric <- unlist(performance.acc@y.values)[index];
+#     specificity <- 1 - unlist(performance$train@x.values)[index];
+#     sensitivity <- unlist(performance$train@y.values)[index];
+#     cat(sprintf('Projected F[0.5]-score = %.3f\n', metric));
+#     cat(sprintf('Projected sensitivity = %.3f\n', sensitivity));
+#     cat(sprintf('Projected specificity = %.3f\n', specificity));
+# }
+
+performance.f <- performance(rf.model$prediction, measure = 'f');
+index <- which.max(unlist(performance.f@y.values));
+threshold <- unlist(performance.f@x.values)[index];
+# f.score <- unlist(performance.f@y.values)[index];
+
+performance <- performance(rf.model$prediction, 'sens', 'spec');
+
+sensitivity <- unlist(performance@y.values)[index];
+specificity <- unlist(performance@x.values)[index];
+
+# cat(sprintf('Max F1-score = %.3f\n', f.score));
+# Convert to stability units
+threshold.stability <- 1 - threshold;
+cat(sprintf('Threshold = %.3f\n', threshold.stability));
+cat(sprintf('Training sensitivity = %.3f\n', sensitivity));
+cat(sprintf('Training specificity = %.3f\n', specificity));
+
+stability.classification <- ifelse(stability$predictions[, 1] < threshold.stability, 1, 0);
+cat(sprintf('Proportion predicted unstable = %.3f\n\n', mean(stability.classification)));
+stability.classification <- as.factor(stability.classification);
+
+###################################################################################################
+# Output stability scores
+###################################################################################################
+annotation.dt <- data.table(
+    CHROM = features.dt$CHROM,
+    POS = features.dt$POS,
+    STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4),
+    STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE')
+    );
+fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = TRUE);
diff --git a/nextflow.config b/nextflow.config
index bfd2628..82d031d 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -1,7 +1,7 @@
 // Metadata
 manifest {
-    name = 'pipeline-name-nf'
-    author = 'Authors Name'
+    name = 'pipeline-StableLift'
+    author = 'Nicholas Wiltsie'
     description = 'A template for Nextflow pipelines'
-    version = 'Version Number'
+    version = '0.1.0'
 }
diff --git a/nftest.yml b/nftest.yml
new file mode 100644
index 0000000..32644f5
--- /dev/null
+++ b/nftest.yml
@@ -0,0 +1,13 @@
+---
+global:
+  temp_dir: test/work
+  remove_temp: true
+  clean_logs: true
+  nf_config: test/nftest.config
+
+cases:
+  - name: Example test
+    nf_script: ./main.nf
+    params_file: test/nftest.yaml
+    skip: false
+    verbose: true
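+
+  # A second case could exercise the HaplotypeCaller branch; illustrative
+  # sketch only, using a hypothetical params file not included here:
+  # - name: HaplotypeCaller test
+  #   nf_script: ./main.nf
+  #   params_file: test/nftest-haplotypecaller.yaml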
diff --git a/test/nftest.config b/test/nftest.config
new file mode 100644
index 0000000..2c9cbff
--- /dev/null
+++ b/test/nftest.config
@@ -0,0 +1,37 @@
+includeConfig "${projectDir}/config/default.config"
+includeConfig "${projectDir}/config/methods.config"
+includeConfig "${projectDir}/nextflow.config"
+
+params {
+    // Choices: ["Mutect2", "HaplotypeCaller"]
+    variant_caller = "Mutect2"
+    save_intermediate_files = true
+
+    ucla_cds = false
+
+    rf_model = "/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSNV/stableLift/train_CPCG-40QC_Mutect2/RF-train_Mutect2_ntree2000_nodesize5_classratio0.Rds"
+
+    // Reference files
+    funcotator_data {
+        data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s"
+        src_reference_id = "hg19"
+        dest_reference_id = "hg38"
+    }
+
+    // The source reference sequence (FASTA). Must correspond with
+    // params.funcotator_data.src_reference_id
+    src_fasta_ref = "/hot/ref/reference/GRCh37-EBI-hs37d5/hs37d5.fa"
+
+    // The destination reference sequence (FASTA). Must correspond with
+    // params.funcotator_data.dest_reference_id
+    dest_fasta_ref = "/hot/ref/reference/GRCh38-BI-20160721/Homo_sapiens_assembly38.fasta"
+
+    // The liftover chain file between the source and destination FASTA
+    // references
+    chain_file = "/hot/ref/tool-specific-input/liftOver/hg19ToHg38.over.chain"
+
+    repeat_bed = "/hot/ref/database/RepeatMasker-v3.0.1/processed/GRCh38/GRCh38_RepeatMasker_intervals.bed"
+}
+
+// Setup the pipeline config. DO NOT REMOVE THIS LINE!
+methods.setup()
diff --git a/test/nftest.yaml b/test/nftest.yaml
new file mode 100644
index 0000000..dd3d6f7
--- /dev/null
+++ b/test/nftest.yaml
@@ -0,0 +1,4 @@
+---
+sample_id: ExampleID
+input:
+  vcf: /hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/validation/TCGA-SARC_WGS/GRCh37/Mutect2/TCGA-SARC_WGS_Mutect2_merge.vcf.gz
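+# NB: this VCF lives on internal cluster storage (/hot); point input.vcf at a
+# locally accessible Mutect2 VCF to run the test case outside that environment.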