diff --git a/.editorconfig b/.editorconfig index dd9ffa5..0fbfe58 100644 --- a/.editorconfig +++ b/.editorconfig @@ -24,6 +24,19 @@ end_of_line = unset insert_final_newline = unset trim_trailing_whitespace = unset indent_style = unset +[/modules/msk/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset +[/subworkflows/msk/**] +charset = unset +end_of_line = unset +insert_final_newline = unset +trim_trailing_whitespace = unset +indent_style = unset + [/assets/email*] indent_size = unset @@ -31,6 +44,11 @@ indent_size = unset # ignore Readme [README.md] indent_style = unset +trim_trailing_whitespace = unset + +[/docs/usage.md] +indent_style = unset +trim_trailing_whitespace = unset # ignore python [*.{py,md}] diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 715f9bc..9dbf64e 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -2,53 +2,53 @@ name: Bug report description: Report something that is broken or incorrect labels: bug body: -- type: textarea - id: description - attributes: - label: Description of the bug - description: A clear and concise description of what the bug is. - validations: - required: true -- type: textarea - id: command_used - attributes: - label: Command used and terminal output - description: Steps to reproduce the behaviour. Please paste the command you used - to launch the pipeline and the output from your terminal. - render: console - placeholder: '$ nextflow run ... - - - Some output where something broke - - ' -- type: textarea - id: files - attributes: - label: Relevant files - description: 'Please drag and drop the relevant files here. Create a `.zip` archive - if the extension is not allowed. - - Your verbose log file `.nextflow.log` is often useful _(this is a hidden file - in the directory where you launched the pipeline)_ as well as custom Nextflow - configuration files. - - ' -- type: textarea - id: system - attributes: - label: System information - description: '* Nextflow version _(eg. 23.04.0)_ - - * Hardware _(eg. HPC, Desktop, Cloud)_ - - * Executor _(eg. slurm, local, awsbatch)_ - - * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, - or Apptainer)_ - - * OS _(eg. CentOS Linux, macOS, Linux Mint)_ - - * Version of mskcc/neoantigenpipeline _(eg. 1.1, 1.5, 1.8.2)_ - - ' + - type: textarea + id: description + attributes: + label: Description of the bug + description: A clear and concise description of what the bug is. + validations: + required: true + - type: textarea + id: command_used + attributes: + label: Command used and terminal output + description: Steps to reproduce the behaviour. Please paste the command you used + to launch the pipeline and the output from your terminal. + render: console + placeholder: "$ nextflow run ... + + + Some output where something broke + + " + - type: textarea + id: files + attributes: + label: Relevant files + description: "Please drag and drop the relevant files here. Create a `.zip` archive + if the extension is not allowed. + + Your verbose log file `.nextflow.log` is often useful _(this is a hidden file + in the directory where you launched the pipeline)_ as well as custom Nextflow + configuration files. + + " + - type: textarea + id: system + attributes: + label: System information + description: "* Nextflow version _(eg. 23.04.0)_ + + * Hardware _(eg. HPC, Desktop, Cloud)_ + + * Executor _(eg. 
slurm, local, awsbatch)_ + + * Container engine: _(e.g. Docker, Singularity, Conda, Podman, Shifter, Charliecloud, + or Apptainer)_ + + * OS _(eg. CentOS Linux, macOS, Linux Mint)_ + + * Version of mskcc/neoantigenpipeline _(eg. 1.1, 1.5, 1.8.2)_ + + " diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index d4182fa..24a404c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -17,7 +17,7 @@ Learn more about contributing: [CONTRIBUTING.md](https://github.com/mskcc/neoant - [ ] If you've fixed a bug or added code that should be tested, add tests! - [ ] If you've added a new tool - have you followed the pipeline conventions in the [contribution docs](https://github.com/mskcc/neoantigenpipeline/tree/master/.github/CONTRIBUTING.md) - [ ] Make sure your code lints (`nf-core lint`). -- [ ] Ensure the test suite passes (`nf-test test main.nf.test -profile test,docker`). +- [ ] Ensure the test suite passes (`nextflow run . -profile test,docker --outdir <OUTDIR>`). - [ ] Check for unexpected warnings in debug mode (`nextflow run . -profile debug,test,docker --outdir <OUTDIR>`). - [ ] Usage Documentation in `docs/usage.md` is updated. - [ ] Output Documentation in `docs/output.md` is updated. diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a74d119..37175cf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ on: branches: - dev pull_request: + branches: ["dev", "main", "master"] release: types: [published] diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 073e187..1fcafe8 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -14,13 +14,12 @@ jobs: pre-commit: runs-on: ubuntu-latest steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + - uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - - name: Set up Python 3.11 - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - name: Set up Python 3.12 + uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: 3.11 - cache: "pip" + python-version: "3.12" - name: Install pre-commit run: pip install pre-commit @@ -32,14 +31,14 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out pipeline code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@0ad4b8fadaa221de15dcec353f45205ec38ea70b # v4 - name: Install Nextflow - uses: nf-core/setup-nextflow@v1 + uses: nf-core/setup-nextflow@v2 - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5 + - uses: actions/setup-python@82c7e631bb3cdc910f68e0081d67478d79c6982d # v5 with: - python-version: "3.11" + python-version: "3.12" architecture: "x64" - name: Install dependencies @@ -60,7 +59,7 @@ jobs: - name: Upload linting log file artifact if: ${{ always() }} - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4 + uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4 with: name: linting-logs path: | diff --git a/.github/workflows/linting_comment.yml b/.github/workflows/linting_comment.yml index b706875..40acc23 100644 --- a/.github/workflows/linting_comment.yml +++ b/.github/workflows/linting_comment.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Download lint results - uses: dawidd6/action-download-artifact@f6b0bace624032e30a85a8fd9c1a7f8f611f5737 # v3 + uses:
dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe # v3 with: workflow: linting.yml workflow_conclusion: completed diff --git a/.nf-core.yml b/.nf-core.yml index 5b0c769..0374a1a 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1,23 +1,24 @@ +nf_core_version: 2.14.1 lint: files_exist: - - CODE_OF_CONDUCT.md - - assets/nf-core-neoantigenpipeline_logo_light.png - - docs/images/nf-core-neoantigenpipeline_logo_light.png - - docs/images/nf-core-neoantigenpipeline_logo_dark.png - - .github/ISSUE_TEMPLATE/config.yml - - .github/workflows/awstest.yml - - .github/workflows/awsfulltest.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-neoantigenpipeline_logo_light.png + - docs/images/nf-core-neoantigenpipeline_logo_light.png + - docs/images/nf-core-neoantigenpipeline_logo_dark.png + - .github/ISSUE_TEMPLATE/config.yml + - .github/workflows/awstest.yml + - .github/workflows/awsfulltest.yml files_unchanged: - - CODE_OF_CONDUCT.md - - assets/nf-core-neoantigenpipeline_logo_light.png - - docs/images/nf-core-neoantigenpipeline_logo_light.png - - docs/images/nf-core-neoantigenpipeline_logo_dark.png - - .github/ISSUE_TEMPLATE/bug_report.yml + - CODE_OF_CONDUCT.md + - assets/nf-core-neoantigenpipeline_logo_light.png + - docs/images/nf-core-neoantigenpipeline_logo_light.png + - docs/images/nf-core-neoantigenpipeline_logo_dark.png + - .github/ISSUE_TEMPLATE/bug_report.yml multiqc_config: - - report_comment + - report_comment nextflow_config: - - manifest.name - - manifest.homePage + - manifest.name + - manifest.homePage repository_type: pipeline template: prefix: mskcc diff --git a/.prettierignore b/.prettierignore index 437d763..191016c 100644 --- a/.prettierignore +++ b/.prettierignore @@ -10,3 +10,4 @@ testing/ testing* *.pyc bin/ +README.md diff --git a/CITATIONS.md b/CITATIONS.md index d283cd7..b3aa453 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,13 +10,20 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [PhyloWGS](https://github.com/morrislab/phylowgs) - > Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. + > Deshwar, A. G., Vembu, S., Yung, C. K., Jang, G. H., Stein, L., & Morris, Q. (2015). PhyloWGS: reconstructing subclonal composition and evolution from whole-genome sequencing of tumors. Genome biology, 16(1), 35. https://doi.org/10.1186/s13059-015-0602-8 -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +- [NetMHCpan-4](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) - > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. + > Jurtz, V., Paul, S., Andreatta, M., Marcatili, P., Peters, B., & Nielsen, M. (2017). NetMHCpan-4.0: Improved Peptide-MHC Class I Interaction Predictions Integrating Eluted Ligand and Peptide Binding Affinity Data. Journal of immunology (Baltimore, Md. : 1950), 199(9), 3360–3368. https://doi.org/10.4049/jimmunol.1700893 + +- [NeoantigenEditing](https://github.com/LukszaLab/NeoantigenEditing) + + > Łuksza, M., Sethna, Z.M., Rojas, L.A. et al. Neoantigen quality predicts immunoediting in survivors of pancreatic cancer. Nature 606, 389–395 (2022).
https://doi.org/10.1038/s41586-022-04735-9 + +- [NetMHCstabpan](https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/) + > Rasmussen, M., Fenoy, E., Harndahl, M., Kristensen, A. B., Nielsen, I. K., Nielsen, M., & Buus, S. (2016). Pan-Specific Prediction of Peptide-MHC Class I Complex Stability, a Correlate of T Cell Immunogenicity. Journal of immunology (Baltimore, Md. : 1950), 197(4), 1517–1524. https://doi.org/10.4049/jimmunol.1600582 ## Software packaging/containerisation tools diff --git a/README.md b/README.md index 099f904..2a6977b 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,22 @@ -[![GitHub Actions CI Status](https://github.com/mskcc/neoantigenpipeline/actions/workflows/ci.yml/badge.svg)](https://github.com/mskcc/neoantigenpipeline/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/mskcc/neoantigenpipeline/actions/workflows/linting.yml/badge.svg)](https://github.com/mskcc/neoantigenpipeline/actions/workflows/linting.yml)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) +> [![GitHub Actions CI Status](https://github.com/mskcc/neoantigen-pipeline/actions/workflows/ci.yml/badge.svg)](https://github.com/mskcc/neoantigen-pipeline/actions/workflows/ci.yml) > [![GitHub Actions Linting Status](https://github.com/mskcc/neoantigen-pipeline/actions/workflows/linting.yml/badge.svg)](https://github.com/mskcc/neoantigen-pipeline/actions/workflows/linting.yml) > [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) [![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/) [![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/) -[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/mskcc/neoantigenpipeline) +[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://tower.nf/launch?pipeline=https://github.com/mskcc/neoantigen-pipeline) ## Introduction -**mskcc/neoantigenpipeline** is a bioinformatics pipeline that ... +**mskcc/neoantigenpipeline** is a bioinformatics pipeline that adapts Luksza et al.'s NeoantigenEditing and fitness pipeline for use by investigators at MSK. The pipeline currently supports working with TEMPO output mafs, Facets gene-level copy number calls, and Polysolver outputs. It outputs a JSON representation of the clonal structure of the tumor, annotated with the neoantigen burden, driver burden, and fitness of each clone. Individual neoantigens are also labeled with their quality, as described by Luksza et al. - +![Workflow Diagram](assets/workflow_diagram.png) - - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +1.
Create phylogenetic trees using [PhyloWGS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8) +2. Use [netMHCpan-4](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) to calculate binding affinities +3. Use [netMHCstabpan](https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/) to calculate stability scores +4. Use Luksza et al.'s neoantigen quality and fitness computation tool ([NeoantigenEditing](https://github.com/LukszaLab/NeoantigenEditing)) to evaluate peptides ## Usage @@ -38,12 +31,10 @@ First, prepare a samplesheet with your input data that looks as follows: `samplesheet.csv`: ```csv -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz +sample,maf,facets_hisens_cncf,hla_file +tumor_normal,temp_test_somatic_unfiltered.maf,facets_hisens.cncf.txt,winners.hla.txt +tumor_normal2,temp_test_somatic_unfiltered.maf,facets_hisens.cncf.txt,winners.hla.txt ``` - -Each row represents a fastq file (single-end) or a pair of fastq files (paired end). - --> Now, you can run the pipeline using: ```bash nextflow run mskcc/neoantigenpipeline \ - -profile <docker/singularity/.../institute> \ + -profile prod,<docker/singularity> \ --input samplesheet.csv \ --outdir <OUTDIR> ``` @@ -63,11 +54,14 @@ nextflow run mskcc/neoantigenpipeline \ ## Credits -mskcc/neoantigenpipeline was originally written by Nikhil Kumar. - We thank the following people for their extensive assistance in the development of this pipeline: - + - Nikhil ([@nikhil](https://github.com/nikhil)) + - John ([@johnoooh](https://github.com/johnoooh)) + - Alex ([@pintoa1-mskcc](https://github.com/pintoa1-mskcc)) + - Martina ([@BradicM](https://github.com/BradicM)) + - Allison ([@arichards2564](https://github.com/arichards2564)) + ## Contributions and Support @@ -75,10 +69,12 @@ If you would like to contribute to this pipeline, please see the [contributing g ## Citations - - - +- Deshwar, A. G., Vembu, S., Yung, C. K., Jang, G. H., Stein, L., & Morris, Q. (2015). PhyloWGS: reconstructing subclonal composition and evolution from whole-genome sequencing of tumors. Genome biology, 16(1), 35. https://doi.org/10.1186/s13059-015-0602-8 +- Jurtz, V., Paul, S., Andreatta, M., Marcatili, P., Peters, B., & Nielsen, M. (2017). NetMHCpan-4.0: Improved Peptide-MHC Class I Interaction Predictions Integrating Eluted Ligand and Peptide Binding Affinity Data. Journal of immunology (Baltimore, Md. : 1950), 199(9), 3360–3368. https://doi.org/10.4049/jimmunol.1700893 +- Łuksza, M., Sethna, Z.M., Rojas, L.A. et al. Neoantigen quality predicts immunoediting in survivors of pancreatic cancer. Nature 606, 389–395 (2022). https://doi.org/10.1038/s41586-022-04735-9 +- Rasmussen, M., Fenoy, E., Harndahl, M., Kristensen, A. B., Nielsen, I. K., Nielsen, M., & Buus, S. (2016). Pan-Specific Prediction of Peptide-MHC Class I Complex Stability, a Correlate of T Cell Immunogenicity. Journal of immunology (Baltimore, Md. : 1950), 197(4), 1517–1524. https://doi.org/10.4049/jimmunol.1600582 + An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index eb3e635..07855f8 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -1,8 +1,8 @@ report_comment: > - + This report has been generated by the mskcc/neoantigenpipeline analysis pipeline.
- + report_section_order: "mskcc-neoantigenpipeline-methods-description": order: -1000 diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab..ee41a2b 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,3 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,maf,facets_hisens_cncf,hla_file +tumor_normal,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/temp_test_somatic_unfiltered.maf,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/facets_hisens.cncf.txt,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/winners.hla.txt +tumor_normal2,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/temp_test_somatic_unfiltered.maf,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/facets_hisens.cncf.txt,https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/winners.hla.txt diff --git a/assets/schema_input.json b/assets/schema_input.json index a22f875..27bd597 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -13,21 +13,28 @@ "errorMessage": "Sample name must be provided and cannot contain spaces", "meta": ["id"] }, - "fastq_1": { + "maf": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.maf$", + "errorMessage": "A MAF file output by the TEMPO pipeline (other MAFs may also work), cannot contain spaces and must have extension '.maf'" }, - "fastq_2": { + "facets_hisens_cncf": { "type": "string", "format": "file-path", "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^\\S+\\.txt$", + "errorMessage": "The Facets hisens cncf file output for the sample pair, must have extension '.txt'" + }, + "hla_file": { + "type": "string", + "format": "file-path", + "exists": true, + "pattern": "winners\\.hla\\.txt$", + "errorMessage": "The HLA file output by Polysolver, must be named 'winners.hla.txt'"
} }, - "required": ["sample", "fastq_1"] + "required": ["sample", "maf", "facets_hisens_cncf", "hla_file"] } } diff --git a/assets/workflow_diagram.png b/assets/workflow_diagram.png new file mode 100644 index 0000000..29dfc30 Binary files /dev/null and b/assets/workflow_diagram.png differ diff --git a/conf/base.config b/conf/base.config index ab45fb2..96257e7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -44,7 +44,7 @@ process { withLabel:process_high { cpus = { check_max( 12 * task.attempt, 'cpus' ) } memory = { check_max( 72.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + time = { check_max( 72.h * task.attempt, 'time' ) } } withLabel:process_long { time = { check_max( 20.h * task.attempt, 'time' ) } @@ -59,7 +59,4 @@ process { errorStrategy = 'retry' maxRetries = 2 } - withName:CUSTOM_DUMPSOFTWAREVERSIONS { - cache = false - } } diff --git a/conf/modules.config b/conf/modules.config index e3ea8fa..ca2d9da 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -13,30 +13,21 @@ process { publishDir = [ - path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + path: { "${params.outdir}/${task.tag}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: FASTQC { - ext.args = '--quiet' + withName: 'PHYLOWGS_PARSECNVS' { + ext.args = '-f facets' } - withName: CUSTOM_DUMPSOFTWAREVERSIONS { - publishDir = [ - path: { "${params.outdir}/pipeline_info" }, - mode: params.publish_dir_mode, - pattern: '*_versions.yml' - ] - } - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? null : filename } - ] + withName: 'PHYLOWGS_MULTIEVOLVE' { + ext.args = "--burnin-samples ${params.phylo_burnin_samples} --mcmc-samples ${params.phylo_mcmc_samples}" } + withName: 'PHYLOWGS_WRITERESULTS' { + ext.args = "--max-multiprimary 1.0" + } } diff --git a/conf/prod.config b/conf/prod.config new file mode 100644 index 0000000..57ca622 --- /dev/null +++ b/conf/prod.config @@ -0,0 +1,39 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running the pipeline in production +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines the process settings and reference files required to run the pipeline in production.
+ + Use as follows: + nextflow run mskcc/neoantigenpipeline -profile prod,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +nextflow.enable.moduleBinaries = true + +process { + beforeScript = "module load singularity/3.7.1; unset R_LIBS; catch_term () { echo 'caught USR2/TERM signal'; set +e; false; on_exit ; } ; trap catch_term USR2 TERM" + maxRetries = 3 + executor = 'lsf' + queueSize = 500 + perJobMemLimit = true +} + +params { + config_profile_name = 'Prod profile' + config_profile_description = 'Production configuration to run the pipeline' + + // Resource limits + max_cpus = 5 + max_memory = '10.GB' + max_time = '100.h' + + // Genome references + genome = 'GRCh37' + phylo_burnin_samples = 1000 + phylo_mcmc_samples = 2500 + iedbfasta = 'https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/neoantigenEditing/data/iedb.fasta' + cds = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz' + cdna = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz' +} diff --git a/conf/test.config b/conf/test.config index a5bd5fb..548f4ae 100644 --- a/conf/test.config +++ b/conf/test.config @@ -10,6 +10,8 @@ ---------------------------------------------------------------------------------------- */ +nextflow.enable.moduleBinaries = true + params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' @@ -20,10 +22,14 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = "${projectDir}/assets/samplesheet.csv" // Genome references - genome = 'R64-1-1' + genome = 'GRCh37' + phylo_burnin_samples = 2 + phylo_mcmc_samples = 2 + + iedbfasta = 'https://raw.githubusercontent.com/mskcc-omics-workflows/test-datasets/neoantigen/neoantigen/neoantigenEditing/data/iedb.fasta' + cds = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cds.all.fa.gz' + cdna = 'https://github.com/mskcc-omics-workflows/test-datasets/raw/neoantigen/neoantigen/Homo_sapiens.GRCh37.75.cdna.all.fa.gz' } diff --git a/docs/images/luksza2021_fig3.png b/docs/images/luksza2021_fig3.png new file mode 100644 index 0000000..0c87548 Binary files /dev/null and b/docs/images/luksza2021_fig3.png differ diff --git a/docs/images/mqc_fastqc_adapter.png b/docs/images/mqc_fastqc_adapter.png deleted file mode 100755 index 361d0e4..0000000 Binary files a/docs/images/mqc_fastqc_adapter.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_counts.png b/docs/images/mqc_fastqc_counts.png deleted file mode 100755 index cb39ebb..0000000 Binary files a/docs/images/mqc_fastqc_counts.png and /dev/null differ diff --git a/docs/images/mqc_fastqc_quality.png b/docs/images/mqc_fastqc_quality.png deleted file mode 100755 index a4b89bf..0000000 Binary files a/docs/images/mqc_fastqc_quality.png and /dev/null differ diff --git a/docs/output.md b/docs/output.md index 7580a0e..012467e 100644 --- a/docs/output.md +++ b/docs/output.md @@ -12,48 +12,95 @@ The directories listed below will be created in the
results directory after the The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +1. Create phylogenetic trees using [PhyloWGS](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8) +2. Use [netMHCpan-4](https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/) to calculate binding affinities +3. Use [netMHCstabpan](https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/) to calculate stability scores +4. Use Luksza et al.'s neoantigen quality and fitness computation tool ([NeoantigenEditing](https://github.com/LukszaLab/NeoantigenEditing)) to evaluate peptides -### FastQC +### PhyloWGS <details markdown="1">
<summary>Output files</summary> -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +- `phylowgs/` + - `*.summ.json.gz`: Output file for JSON-formatted tree summaries + - `*.muts.json.gz`: Output file for JSON-formatted list of mutations + - `*.mutass.zip`: Output zipped folder of JSON-formatted assignments of SSMs and CNVs to clones </details>
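As a quick illustration (not part of this changeset), the PhyloWGS summaries listed above can be inspected directly in Python; this sketch assumes the standard PhyloWGS `summ.json.gz` layout (a top-level `trees` map whose entries carry `llh` and `populations`) and an illustrative file name:

```python
import gzip
import json

# Load the PhyloWGS tree summaries (file name is illustrative).
with gzip.open("tumor_normal.summ.json.gz", "rt") as fh:
    summ = json.load(fh)

# Each entry in "trees" is one sampled tree; pick the highest-likelihood one.
best_id, best_tree = max(summ["trees"].items(), key=lambda kv: kv[1]["llh"])
print(f"best tree {best_id}: log-likelihood {best_tree['llh']:.2f}")

# "populations" maps clone IDs to per-sample cellular prevalence estimates.
for clone_id, clone in best_tree["populations"].items():
    print(clone_id, clone["cellular_prevalence"])
```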
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +### netMHCpan -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) - -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +<details markdown="1">
+<summary>Output files</summary> -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +- `netmhcpan/` + - `*.xls`: TSV/XLS file output by netMHCpan. This contains the MUT or WT antigens. + - `*.WT.netmhcpan.output,*.MUT.netmhcpan.output`: STDOUT file of netMHCpan. A uniquely formatted file of neoantigens. This contains either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +</details>
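As an aside (not part of this changeset), the `*.xls` file is tab-separated text despite its extension, so it can be loaded directly; the exact column layout varies between netMHCpan versions, so treat the header handling here as an assumption:

```python
import pandas as pd

# The first row of the netMHCpan "xls" output typically names the HLA
# alleles; the real column header sits on the second row, so skip row one.
df = pd.read_csv("tumor_normal.MUT.xls", sep="\t", skiprows=1)

print(df.columns.tolist())  # inspect the version-specific column names
print(df.head())
```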
-### MultiQC +### netMHCstabpan +<details markdown="1">
<summary>Output files</summary> -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `netmhcstabpan/` + - `*.xls`: TSV/XLS file output by netMHCstabpan. This contains the MUT or WT antigens. + - `*.WT.netmhcpan.output,*.MUT.netmhcpan.output`: STDOUT file of netMHCstabpan. A uniquely formatted file of neoantigens. This contains either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file. </details>
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. +### Neoantigen Editing Final Output + +<details markdown="1">
+<summary>Output files</summary>
+
+- `neoantigenediting/`
+
+  - `*_annotated.json`: The final output of the pipeline. This file is an annotated version of the tree output from PhyloWGS, with an extra property titled 'neoantigens'. Each entry in 'neoantigens' is an object whose properties describe the neoantigen. An example entry from a run:
+
+    ```json
+    {
+      "id": "XSYI_MG_M_9_C1203_11",
+      "mutation_id": "X_72667534_C_G",
+      "HLA_gene_id": "HLA-C*12:03",
+      "sequence": "ASRSRHSPY",
+      "WT_sequence": "PSRSRHSPY",
+      "mutated_position": 1,
+      "Kd": 192.03,
+      "KdWT": 4582.17,
+      "R": 0.8911371281207195,
+      "logC": 2.263955023939215,
+      "logA": 3.1722763542054815,
+      "quality": 2.645601185190205
+    }
+    ```
-Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
+
+    Each neoantigenic mutation will have an output like this.
+
+    - id: A unique id that combines an id created from the mutation, the HLA allele, and the window.
+    - mutation_id: ID containing the chromosome, position, ref and alt allele. I and D denote insertions and deletions respectively.
+    - HLA_gene_id: The HLA gene this neoantigen binds to.
+    - sequence: The mutated sequence.
+    - WT_sequence: The wild type sequence.
+    - mutated_position: The position of the first difference.
+    - Kd: Binding affinity in nM from netMHCpan for the mutated peptide.
+    - KdWT: Binding affinity in nM from netMHCpan for the wild type peptide.
+    - R: Similarity of the mutated peptide to IEDB peptides.
+    - logC: The log of the cross-reactivity.
+    - logA: The log of the amplitude. This is a function of Kd/KdWT and a constant.
+    - quality: The final output of the pipeline and neoantigen editing. A higher quality is a better neoantigen. This is described in the Luksza et al. paper and is visualized below.
+
+</details>
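For downstream analysis (an illustrative sketch, not part of this changeset; the neoantigen fields follow the example above, while the top-level tree key is an assumption), the annotated output can be walked clone by clone:

```python
import json

# Load the final annotated output (file name is illustrative).
with open("tumor_normal_annotated.json") as fh:
    patient = json.load(fh)

# Index neoantigen qualities by ID using the fields documented above.
quality = {neo["id"]: neo["quality"] for neo in patient["neoantigens"]}

# Walk a clone tree depth-first, reporting each clone's best neoantigen.
def walk(node, depth=0):
    best = max((quality[n] for n in node.get("neoantigens", [])), default=None)
    print("  " * depth + f"{node.get('neoantigen_load', 0)} neoantigens, best quality: {best}")
    for child in node.get("children", []):
        walk(child, depth + 1)

tree = patient["trees"][0]  # top-level key for the tree list is an assumption
walk(tree["topology"])      # "topology" matches the field used by compute_fitness.py
```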
### Pipeline information diff --git a/docs/usage.md b/docs/usage.md index a2b2182..4444c81 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -2,51 +2,25 @@ > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Introduction - - - ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' ``` -### Multiple runs of the same sample - -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: - -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz -``` - -### Full samplesheet - -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. - -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. - ```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +sample,maf,facets_hisens_cncf,hla_file +tumor_normal,temp_test_somatic_unfiltered.maf,facets_hisens.cncf.txt,winners.hla.txt ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| Column | Description | +| -------------------- | --------------------------------------------------------- | +| `sample` | Custom sample name.
| +| `maf` | The path to a maf output by the TEMPO pipeline. | +| `facets_hisens_cncf` | The path to the hisens cncf file output by facets. | +| `hla_file` | The HLA file output by Polysolver (`winners.hla.txt`). | An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. @@ -55,10 +29,13 @@ An [example samplesheet](../assets/samplesheet.csv) has been provided with the p The typical command for running the pipeline is as follows: ```bash -nextflow run mskcc/neoantigenpipeline --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run mskcc/neoantigenpipeline \ + -profile prod,<docker/singularity> \ + --input samplesheet.csv \ + --outdir <OUTDIR> ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `prod` profile combined with either the `docker` or `singularity` profile. See below for more information about profiles. Note that the pipeline will create the following files in your working directory: @@ -88,7 +65,6 @@ with `params.yaml` containing: ```yaml input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` @@ -146,16 +122,7 @@ If `-profile` is not specified, the pipeline will run locally and expect all sof - A generic configuration profile to be used with [Docker](https://docker.com/) - `singularity` - A generic configuration profile to be used with [Singularity](https://sylabs.io/docs/) -- `podman` - - A generic configuration profile to be used with [Podman](https://podman.io/) -- `shifter` - - A generic configuration profile to be used with [Shifter](https://nersc.gitlab.io/development/shifter/how-to-use/) -- `charliecloud` - - A generic configuration profile to be used with [Charliecloud](https://hpc.github.io/charliecloud/) -- `apptainer` - - A generic configuration profile to be used with [Apptainer](https://apptainer.org/) -- `conda` - - A generic configuration profile to be used with [Conda](https://conda.io/docs/). Please only use Conda as a last resort i.e. when it's not possible to run the pipeline with Docker, Singularity, Podman, Shifter, Charliecloud, or Apptainer.
### `-resume` diff --git a/main.nf b/main.nf index 515f562..c9d5b36 100644 --- a/main.nf +++ b/main.nf @@ -56,7 +56,7 @@ workflow MSKCC_NEOANTIGENPIPELINE { ) emit: - multiqc_report = NEOANTIGENPIPELINE.out.multiqc_report // channel: /path/to/multiqc_report.html + out = NEOANTIGENPIPELINE.out.neo_out // channel: /path/to/*_annotated.json } /* */ @@ -99,7 +99,7 @@ workflow { params.outdir, params.monochrome_logs, params.hook_url, - MSKCC_NEOANTIGENPIPELINE.out.multiqc_report + MSKCC_NEOANTIGENPIPELINE.out.out ) } diff --git a/modules.json b/modules.json index 856a489..bf1a62f 100644 --- a/modules.json +++ b/modules.json @@ -2,17 +2,97 @@ "name": "mskcc/neoantigenpipeline", "homePage": "https://github.com/mskcc/neoantigenpipeline", "repos": { - "https://github.com/nf-core/modules.git": { + "https://github.com/mskcc-omics-workflows/modules.git": { "modules": { - "nf-core": { - "fastqc": { - "branch": "master", - "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", + "msk": { + "neoantigenediting/aligntoiedb": { + "branch": "develop", + "git_sha": "cac9c047e374ee259fb612ba5816e7e6aae6b86f", + "installed_by": ["neoantigen_editing"] + }, + "neoantigenediting/computefitness": { + "branch": "develop", + "git_sha": "1f65c2ecdc5010549055ff7f4e6b8bccee48d4ae", + "installed_by": ["neoantigen_editing"] + }, + "neoantigenutils/formatnetmhcpan": { + "branch": "develop", + "git_sha": "c5d1252252e15555abcc82ea537cebeb281a1856", + "installed_by": ["netmhcstabandpan"] + }, + "neoantigenutils/generatehlastring": { + "branch": "develop", + "git_sha": "33f0bd33095fa15016ee24f4fb4d61e896dbb970", + "installed_by": ["netmhcstabandpan"] + }, + "neoantigenutils/generatemutfasta": { + "branch": "develop", + "git_sha": "bb7975c796ab9a2d7a45ef733a6a226a0f5ad74a", + "installed_by": ["netmhcstabandpan"] + }, + "neoantigenutils/neoantigeninput": { + "branch": "neoantigen", + "git_sha": "d66d3e2c7d132efe8bbde0c7e8a072b0f974b085", "installed_by": ["modules"] }, + "netmhcpan": { + "branch": "develop", + "git_sha": "503abeb67260f060d8228221b07d743aa4180345", + "installed_by": ["modules", "netmhcstabandpan"] + }, + "netmhcstabpan": { + "branch": "develop", + "git_sha": "c1a473f8bc08f778269a36ab62d5adf24357225f", + "installed_by": ["modules", "netmhcstabandpan"] + }, + "phylowgs/createinput": { + "branch": "develop", + "git_sha": "b031249dcf4279606c25e626da2a628756e75e8a", + "installed_by": ["phylowgs"] + }, + "phylowgs/multievolve": { + "branch": "develop", + "git_sha": "535662d391a3533dea3b11c462c14799227e08b2", + "installed_by": ["phylowgs"] + }, + "phylowgs/parsecnvs": { + "branch": "develop", + "git_sha": "8471691d7c29bc2f5f4fb92279c94fb2640b6c38", + "installed_by": ["phylowgs"] + }, + "phylowgs/writeresults": { + "branch": "develop", + "git_sha": "6d27f08bf649e8680ace321d3127dcdf0e210973", + "installed_by": ["phylowgs"] + } + } + }, + "subworkflows": { + "msk": { + "neoantigen_editing": { + "branch": "develop", + "git_sha": "56a628201401866096d6307b9e8c690c5eb46ac2", + "installed_by": ["subworkflows"] + }, + "netmhcstabandpan": { + "branch": "develop", + "git_sha": "d60211568e3709e9284bc06eef938e361d474d08", + "installed_by": ["subworkflows"] + }, + "phylowgs": { + "branch": "develop", + "git_sha": "a5d61394af346f21ee2eb7ecfd97ab25bdbd1d0e", + "installed_by": ["subworkflows"] + } + } + } + }, + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { "multiqc": { "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", + "git_sha": "b80f5fd12ff7c43938f424dd76392a2704fa2396",
"installed_by": ["modules"] } } @@ -26,7 +106,7 @@ }, "utils_nfcore_pipeline": { "branch": "master", - "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "git_sha": "92de218a329bfc9a9033116eb5f65fd270e72ba3", "installed_by": ["subworkflows"] }, "utils_nfvalidation_plugin": { diff --git a/modules/msk/neoantigenediting/aligntoiedb/environment.yml b/modules/msk/neoantigenediting/aligntoiedb/environment.yml new file mode 100644 index 0000000..fb0fef8 --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenediting_aligntoiedb" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENEDITING" diff --git a/modules/msk/neoantigenediting/aligntoiedb/main.nf b/modules/msk/neoantigenediting/aligntoiedb/main.nf new file mode 100644 index 0000000..aa70524 --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/main.nf @@ -0,0 +1,49 @@ +process NEOANTIGENEDITING_ALIGNTOIEDB { + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/neoantigenediting:1.3': + 'docker.io/mskcc/neoantigenediting:1.3' }" + + input: + tuple val(meta), path(patient_data) + path(iedb_fasta) + + output: + tuple val(meta), path("iedb_alignments_*.txt") , emit: iedb_alignment + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + align_neoantigens_to_IEDB.py \\ + --fasta ${iedb_fasta} \\ + --input ${patient_data} + + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigenEditing: \$NEOANTIGEN_EDITING_TAG + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + touch iedb_alignments_example.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigenEditing: \$NEOANTIGEN_EDITING_TAG + END_VERSIONS + """ +} diff --git a/modules/msk/neoantigenediting/aligntoiedb/meta.yml b/modules/msk/neoantigenediting/aligntoiedb/meta.yml new file mode 100644 index 0000000..77d6121 --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "neoantigenediting_aligntoiedb" +description: Align neoantigens to the IEDB file +keywords: + - neoantigenediting + - neoantigens + - IEDB +tools: + - "neoantigenediting": + description: "Code for computing neoantigen qualities and for performing clone composition predictions." + homepage: "https://www.nature.com/articles/s41586-022-04735-9" + tool_dev_url: "https://github.com/LukszaLab/NeoantigenEditing" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - patient_data: + type: file + description: Patient data consisting of mutation, neoantigen, and tree information + pattern: "*.json" + - iedb_fasta: + type: file + description: IEDB epitopes used for analysis + pattern: "*.fasta" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - iedb_alignment: + type: file + description: IEDB alignment file + pattern: "iedb_alignments_*.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@nikhil" +maintainers: + - "@nikhil" diff --git a/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py new file mode 100755 index 0000000..9ff645a --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/resources/usr/bin/align_neoantigens_to_IEDB.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +"""script for computing alignments of neoantigens to IEDB peptides + Copyright (C) 2022 Marta Luksza +""" + + +import json +import os +import subprocess +import tempfile +from collections import defaultdict +import numpy as np +import argparse + +import pandas as pd +from Bio import SeqIO +from Bio.pairwise2 import align + + +def load_blosum62_mat(): + raw_blosum62_mat_str = """ + A R N D C Q E G H I L K M F P S T W Y V B Z X * +A 4 -1 -2 -2 0 -1 -1 0 -2 -1 -1 -1 -1 -2 -1 1 0 -3 -2 0 -2 -1 0 -4 +R -1 5 0 -2 -3 1 0 -2 0 -3 -2 2 -1 -3 -2 -1 -1 -3 -2 -3 -1 0 -1 -4 +N -2 0 6 1 -3 0 0 0 1 -3 -3 0 -2 -3 -2 1 0 -4 -2 -3 3 0 -1 -4 +D -2 -2 1 6 -3 0 2 -1 -1 -3 -4 -1 -3 -3 -1 0 -1 -4 -3 -3 4 1 -1 -4 +C 0 -3 -3 -3 9 -3 -4 -3 -3 -1 -1 -3 -1 -2 -3 -1 -1 -2 -2 -1 -3 -3 -2 -4 +Q -1 1 0 0 -3 5 2 -2 0 -3 -2 1 0 -3 -1 0 -1 -2 -1 -2 0 3 -1 -4 +E -1 0 0 2 -4 2 5 -2 0 -3 -3 1 -2 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +G 0 -2 0 -1 -3 -2 -2 6 -2 -4 -4 -2 -3 -3 -2 0 -2 -2 -3 -3 -1 -2 -1 -4 +H -2 0 1 -1 -3 0 0 -2 8 -3 -3 -1 -2 -1 -2 -1 -2 -2 2 -3 0 0 -1 -4 +I -1 -3 -3 -3 -1 -3 -3 -4 -3 4 2 -3 1 0 -3 -2 -1 -3 -1 3 -3 -3 -1 -4 +L -1 -2 -3 -4 -1 -2 -3 -4 -3 2 4 -2 2 0 -3 -2 -1 -2 -1 1 -4 -3 -1 -4 +K -1 2 0 -1 -3 1 1 -2 -1 -3 -2 5 -1 -3 -1 0 -1 -3 -2 -2 0 1 -1 -4 +M -1 -1 -2 -3 -1 0 -2 -3 -2 1 2 -1 5 0 -2 -1 -1 -1 -1 1 -3 -1 -1 -4 +F -2 -3 -3 -3 -2 -3 -3 -3 -1 0 0 -3 0 6 -4 -2 -2 1 3 -1 -3 -3 -1 -4 +P -1 -2 -2 -1 -3 -1 -1 -2 -2 -3 -3 -1 -2 -4 7 -1 -1 -4 -3 -2 -2 -1 -2 -4 +S 1 -1 1 0 -1 0 0 0 -1 -2 -2 0 -1 -2 -1 4 1 -3 -2 -2 0 0 0 -4 +T 0 -1 0 -1 -1 -1 -1 -2 -2 -1 -1 -1 -1 -2 -1 1 5 -2 -2 0 -1 -1 0 -4 +W -3 -3 -4 -4 -2 -2 -3 -2 -2 -3 -2 -3 -1 1 -4 -3 -2 11 2 -3 -4 -3 -2 -4 +Y -2 -2 -2 -3 -2 -1 -2 -3 2 -1 -1 -2 -1 3 -3 -2 -2 2 7 -1 -3 -2 -1 -4 +V 0 -3 -3 -3 -1 -2 -2 -3 -3 3 1 -2 1 -1 -2 -2 0 -3 -1 4 -3 -2 -1 -4 +B -2 -1 3 4 -3 0 1 -1 0 -3 -4 0 -3 -3 -2 0 -1 -4 -3 -3 4 1 -1 -4 +Z -1 0 0 1 -3 3 4 -2 0 -3 -3 1 -1 -3 -1 0 -1 -3 -2 -2 1 4 -1 -4 +X 0 -1 -1 -1 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 0 0 -2 -1 -1 -1 -1 -1 -4 +* -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 -4 1 +""" + amino_acids = "ACDEFGHIKLMNPQRSTVWY" + blosum62_mat_str_list = [ + l.split() for l in raw_blosum62_mat_str.strip().split("\n") + ] + blosum_aa_order = [blosum62_mat_str_list[0].index(aa) for aa in amino_acids] + + blosum62_mat = np.zeros((len(amino_acids), len(amino_acids))) + for i, bl_ind in enumerate(blosum_aa_order): + blosum62_mat[i] = np.array( + [int(x) for x in blosum62_mat_str_list[bl_ind + 1][1:]] + )[blosum_aa_order] + blosum62 = { + (aaA, aaB): blosum62_mat[i, j] + for i, aaA in enumerate(amino_acids) + for j, aaB in enumerate(amino_acids) + } + return blosum62 + + +def align_peptides(seq1, seq2, matrix): + gap_open = -11 + gap_extend = -1 + aln = align.localds(seq1.upper(), seq2.upper(), matrix, gap_open, gap_extend) + return aln[0] + 
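# Illustrative usage (editor's sketch, not part of this changeset): score a
# mutant 9-mer against its wild-type counterpart with the BLOSUM62 matrix
# loaded above; align_peptides returns a Bio.pairwise2 Alignment, which
# exposes a .score attribute.
#
#     blosum62 = load_blosum62_mat()
#     aln = align_peptides("ASRSRHSPY", "PSRSRHSPY", blosum62)
#     print(aln.score)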
+ +def run_blastp_n(pep_list, blastdb): + """ + Run BLASTP on the given n neoantigens + + :param pep_list: list + list of peptides (neoantigen sequnces) + + :param blastdb: str + fasta file with IEDB peptides + + :return: dict + str (peptide) -> list of IEDB identifiers + """ + + if blastdb is None: + raise ValueError("No BLAST database specified") + os_fid, fa_file = tempfile.mkstemp(suffix=".fa", dir=os.getcwd()) + os.close(os_fid) + os_fid, txt_file = tempfile.mkstemp(suffix=".txt", dir=os.getcwd()) + os.close(os_fid) + id2seq = {} + seq2id = {} + with open(fa_file, "w") as fh: + for seqid, neoseq in enumerate(pep_list): + seqid2 = "seq_" + str(seqid) + # seqid2id[seqid2] = seqid + id2seq[seqid2] = neoseq + seq2id[neoseq] = seqid2 + fh.write(">seq_{}\n{}\n".format(seqid, neoseq)) + # run BLASTP + blastpexe = "blastp" + blast_args = [ + blastpexe, + "-db", + blastdb, + "-query", + fa_file, + "-outfmt", + "6 qseqid sacc score", + "-gapopen", + "32767", + "-gapextend", + "32767", + "-evalue", + "1e6", + "-max_hsps", + "1", + "-matrix", + "BLOSUM62", + "-max_target_seqs", + "10000000", + "-out", + txt_file, + ] + + subprocess.check_call(blast_args) + os.unlink(fa_file) + alignments = defaultdict(list) + with open(txt_file) as fh: + for line in fh: + S = line.split() + epi_id = int(S[1].split("|")[0]) + seq_id = id2seq[S[0]] + alignments[seq_id].append(epi_id) + os.unlink(txt_file) + return alignments + + +def run_blastp(peplist, blastdb, n=1000): + """ + Blast peptides in neolist against peptides in blastdb. + + :param peplist: list + list of peptides (neoantigens) + + :param blastdb: + iedb fasta file + + :param n: int + run blastp in batches of size n + + :return: dict + dictionary mapping neoantigen peptide sequences to alignment candidates + """ + + alignments = defaultdict(set) + for i in range(0, len(peplist) + n, n): # run blastp in batches of size n + peplist0 = peplist[i : (i + n)] + if len(peplist0) == 0: + continue + alignments0 = run_blastp_n(peplist0, blastdb) + for pepseq in alignments0: + for epi in alignments0[pepseq]: + alignments[pepseq].add(epi) + return alignments + + +def prepare_blastdb(peptidesfasta): + """ + Builds BLAST database + + :param peptidesfasta: str + path to the IEDB.fasta file + """ + instr = ["makeblastdb", "-in", peptidesfasta, "-dbtype", "prot", ">", "/dev/null"] + instr = "\t".join(instr) + os.system(instr) + + +def load_epitopes(iedbfasta): + """ + Load IEDB epitopes from fasta file + + :param iedbfasta: str + + :return: dict + IEDB epitope identifiers mapped to epitope sequence + """ + epitopes = {} + with open(iedbfasta) as f: + seqs = SeqIO.parse(f, "fasta") + for seq in seqs: + seqid = int((seq.id).split("|")[0]) + epitopes[seqid] = str(seq.seq) + return epitopes + + +if __name__ == "__main__": + + """ + + Aligns neoantigens peptides of all patients to IEDB + Requirement: blastp installed and in the PATH + + run as: + python align_neoantigens_to_IEDB.py + + """ + + parser = argparse.ArgumentParser(prog="align_neoantigens_to_IEDB") + parser.add_argument("--fasta", help="IEDB fasta file", required=True) + parser.add_argument("--input", help="patient_data file", required=True) + + args = parser.parse_args() + + iedb_file = args.fasta + patient_file = args.input + + # blosum62 + + blosum62 = load_blosum62_mat() + + # prepare blast database + prepare_blastdb(iedb_file) + epitopes = load_epitopes(iedb_file) + + with open(patient_file) as f: + pjson = json.load(f) + patient = pjson["patient"] + neoantigens = pjson["neoantigens"] + peptides = set( + 
[("_".join(neo["id"].split("_")[:-1]), neo["sequence"]) for neo in neoantigens] + ) + pepseq2pepid = defaultdict(set) + for pep_id, pep_seq in peptides: + pepseq2pepid[pep_seq].add(pep_id) + + seqlist = list(set([pep_seq for pep_id, pep_seq in peptides])) + alignments = run_blastp(seqlist, iedb_file, n=100) + scores = [] + aln_data = [] + for pep_seq in alignments: + for epitope_id in alignments[pep_seq]: + episeq = epitopes[epitope_id] + score = align_peptides(pep_seq, episeq, blosum62).score + pep_ids = pepseq2pepid[pep_seq] + for pep_id in pep_ids: + aln_data.append([pep_id, pep_seq, epitope_id, score]) + if len(aln_data): + aln_data = pd.DataFrame(aln_data) + aln_data.columns = [ + "Peptide_ID", + "Peptide", + "Epitope_ID", + "Alignment_score", + ] + else: + aln_data = pd.DataFrame( + columns=["Peptide_ID", "Peptide", "Epitope_ID", "Alignment_score"] + ) + aln_data.to_csv("iedb_alignments_" + patient + ".txt", sep="\t", index=False) diff --git a/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test b/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test new file mode 100644 index 0000000..85018c5 --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test @@ -0,0 +1,64 @@ +nextflow_process { + + name "Test Process NEOANTIGENEDITING_ALIGNTOIEDB" + script "../main.nf" + process "NEOANTIGENEDITING_ALIGNTOIEDB" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "neoantigenediting" + tag "neoantigenediting/aligntoiedb" + + test("neoantigenediting_aligntoiedb - json") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file(params.test_data_mskcc['neoantigen']['patient_data'], checkIfExists: true) + ] + input[1] = file(params.test_data_mskcc['neoantigen']['iedb']['iedb_fasta'], checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions,file(process.out.iedb_alignment[0][1]).name).match() } + + ) + } + + } + + test("neoantigenediting_aligntoiedb - json - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file('patient_data') + ] + input[1] = file('iedb_fasta') + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions,process.out).match() } + + ) + } + + } + + +} diff --git a/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test.snap b/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test.snap new file mode 100644 index 0000000..f128b43 --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "neoantigenediting_aligntoiedb - json - stub": { + "content": [ + [ + "versions.yml:md5,ee206e836fcb1b31809f63f2ffa49519" + ], + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "iedb_alignments_example.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,ee206e836fcb1b31809f63f2ffa49519" + ], + "iedb_alignment": [ + [ + { + "id": "test", + "single_end": false + }, + "iedb_alignments_example.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,ee206e836fcb1b31809f63f2ffa49519" + ] + } + ], + "timestamp": "2024-06-19T14:54:25.768151" + }, + "neoantigenediting_aligntoiedb - json": { + "content": [ + [ + "versions.yml:md5,ee206e836fcb1b31809f63f2ffa49519" + ], + "iedb_alignments_3-OLTS.txt" + ], + "timestamp": "2024-06-19T14:54:19.606216" + } +} \ No newline at end of file diff --git 
a/modules/msk/neoantigenediting/aligntoiedb/tests/tags.yml b/modules/msk/neoantigenediting/aligntoiedb/tests/tags.yml new file mode 100644 index 0000000..61570fd --- /dev/null +++ b/modules/msk/neoantigenediting/aligntoiedb/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenediting/aligntoiedb: + - "modules/msk/neoantigenediting/aligntoiedb/**" diff --git a/modules/msk/neoantigenediting/computefitness/environment.yml b/modules/msk/neoantigenediting/computefitness/environment.yml new file mode 100644 index 0000000..d737bc6 --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenediting_computefitness" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENEDITING" diff --git a/modules/msk/neoantigenediting/computefitness/main.nf b/modules/msk/neoantigenediting/computefitness/main.nf new file mode 100644 index 0000000..3e15400 --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/main.nf @@ -0,0 +1,47 @@ +process NEOANTIGENEDITING_COMPUTEFITNESS { + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/neoantigenediting:1.3': + 'docker.io/mskcc/neoantigenediting:1.3' }" + + input: + tuple val(meta), path(patient_data), path(alignment_file) + + output: + tuple val(meta), path("*_annotated.json") , emit: annotated_output + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + compute_fitness.py \\ + --alignment ${alignment_file} \\ + --input ${patient_data} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigenEditing: \$NEOANTIGEN_EDITING_TAG + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + + touch patient_data_annotated.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigenEditing: \$NEOANTIGEN_EDITING_TAG + END_VERSIONS + """ +} diff --git a/modules/msk/neoantigenediting/computefitness/meta.yml b/modules/msk/neoantigenediting/computefitness/meta.yml new file mode 100644 index 0000000..0a7febb --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "neoantigenediting_computefitness" +description: Compute fitness of the neoantigens +keywords: + - neoantigenediting + - neoantigens + - fitness +tools: + - "neoantigenediting": + description: "Code for computing neoantigen qualities and for performing clone composition predictions." + homepage: "https://www.nature.com/articles/s41586-022-04735-9" + tool_dev_url: "https://github.com/LukszaLab/NeoantigenEditing" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - patient_data: + type: file + description: Patient data consisting of mutation, neoantigen, and tree information + pattern: "*.json" + - alignment: + type: file + description: IEDB alignment file + pattern: "iedb_alignments_*.txt" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]`
+  - annotated_output:
+      type: file
+      description: Output containing neoantigen quality scores
+      pattern: "*_annotated.json"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+authors:
+  - "@nikhil"
+maintainers:
+  - "@nikhil"
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
new file mode 100755
index 0000000..f278817
--- /dev/null
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/EpitopeDistance.py
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Class for computing the crossreactivity distance between two epitopes
+    Copyright (C) 2022 Zachary Sethna
+
+    Use is subject to the included terms of use found at
+    https://github.com/LukszaLab/NeoantigenEditing
+"""
+import numpy as np
+import json
+import os
+
+
+class EpitopeDistance(object):
+    """Base class for epitope crossreactivity.
+
+    Model:
+        dist({a_i}, {b_i}) = \sum_i d_i M_ab(a_i, b_i)
+
+    Attributes
+    ----------
+    amino_acids : str
+        Allowed amino acids in specified order.
+
+    amino_acid_dict : dict
+        Dictionary of amino acids and corresponding indices
+
+    d_i : ndarray
+        Position scaling array d_i.
+        d_i.shape == (9,)
+
+    M_ab : ndarray
+        Amino acid substitution matrix. Indexed by the order of amino_acids.
+        M_ab.shape == (20, 20)
+
+
+    """
+
+    def __init__(
+        self,
+        model_file=os.path.join(
+            os.path.dirname(__file__), "distance_data", "epitope_distance_model_parameters.json"
+        ),
+        amino_acids="ACDEFGHIKLMNPQRSTVWY",
+    ):
+        """Initialize class and compute M_ab."""
+
+        self.amino_acids = amino_acids
+        # index both upper- and lower-case residues so lookups ignore case
+        self.amino_acid_dict = {}
+        for i, aa in enumerate(self.amino_acids):
+            self.amino_acid_dict[aa.upper()] = i
+            self.amino_acid_dict[aa.lower()] = i
+
+        self.set_model(model_file)
+
+    def set_model(self, model_file):
+        """Load model and format substitution matrix M_ab."""
+        with open(model_file, "r") as modelf:
+            c_model = json.load(modelf)
+        self.d_i = c_model["d_i"]
+        self.M_ab_dict = c_model["M_ab"]
+        M_ab = np.zeros((len(self.amino_acids), len(self.amino_acids)))
+        for i, aaA in enumerate(self.amino_acids):
+            for j, aaB in enumerate(self.amino_acids):
+                M_ab[i, j] = self.M_ab_dict[aaA + "->" + aaB]
+        self.M_ab = M_ab
+
+    def epitope_dist(self, epiA, epiB):
+        """Compute the model distance between the 9-mers epiA and epiB.
+
+        Ignores capitalization.
+
+        Model:
+            dist({a_i}, {b_i}) = \sum_i d_i M_ab(a_i, b_i)
+        """
+
+        return sum(
+            [
+                self.d_i[i]
+                * self.M_ab[
+                    self.amino_acid_dict[epiA[i]], self.amino_acid_dict[epiB[i]]
+                ]
+                for i in range(9)
+            ]
+        )
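For orientation, a minimal usage sketch of the class above (an illustrative aside, not part of the diff): it assumes the script sits next to its bundled distance_data/ directory, and the two 9-mers are hypothetical.

from EpitopeDistance import EpitopeDistance

epidist = EpitopeDistance()  # loads distance_data/epitope_distance_model_parameters.json
# Identical 9-mers are at distance 0; a single substitution at position i
# contributes d_i[i] * M_ab[a_i, b_i] (here at the last position, i = 8).
print(epidist.epitope_dist("SIINFEKLV", "SIINFEKLV"))  # 0.0
print(epidist.epitope_dist("SIINFEKLV", "SIINFEKLM"))  # d_i[8] * M_ab for V->M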
diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
new file mode 100755
index 0000000..da19f4d
--- /dev/null
+++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/compute_fitness.py
@@ -0,0 +1,345 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Script for computing neoantigen qualities and fitness of tumor clones
+    Copyright (C) 2022 Marta Luksza
+"""
+
+import argparse
+import json
+
+from collections import defaultdict
+
+import numpy as np
+import pandas as pd
+
+from EpitopeDistance import EpitopeDistance
+
+
+def fill_up_clone_mutations(tree, mut2missense):
+    """
+    Fills up the field with all mutations for each clone on the tree
+
+    :param tree: dict
+        imported from json file
+
+    :param mut2missense: dict
+        maps mutation identifiers to 1 if missense else to 0
+
+    :return: None
+        the tree is annotated in place with "all_mutations" and "TMB"
+    """
+
+    nodes = [(tree["topology"], [])]
+    while len(nodes) > 0:
+        (node, anc_mutations) = nodes[0]
+        nodes = nodes[1:]
+        cmutations = node["clone_mutations"]
+        node["all_mutations"] = list(set(cmutations + anc_mutations))
+        node["TMB"] = sum([mut2missense[mid] for mid in node["all_mutations"]])
+        if "children" in node:
+            for child in node["children"]:
+                nodes.append((child, node["all_mutations"]))
+
+
+def fill_up_clone_neoantigens(tree, mut2neo):
+    """
+    Adds neoantigen field for each clone on the tree
+
+    :param tree: dict
+        tree imported from json file
+
+    :param mut2neo: dict
+        str -> list
+        maps mutation identifiers to neoantigen entries
+
+    :return: None
+        the tree is annotated in place with "neoantigens", "neoantigen_load" and "NA_Mut"
+    """
+
+    nodes = [tree["topology"]]
+    while len(nodes) > 0:
+        node = nodes[0]
+        nodes = nodes[1:]
+        node["neoantigens"] = [
+            neo["id"] for mid in node["all_mutations"] for neo in mut2neo[mid]
+        ]
+        node["neoantigen_load"] = len(node["neoantigens"])
+        node["NA_Mut"] = sum([len(mut2neo[mid]) > 0 for mid in node["all_mutations"]])
+        if "children" in node:
+            for child in node["children"]:
+                nodes.append(child)
+
+
+def mark_driver_gene_mutations(pjson):
+    """
+    Create a dictionary mapping mutation identifiers to their driver gene status
+    (1 - in a driver gene, 0 - not in a driver gene)
+
+    :param pjson: dict
+        json representation of a sample
+
+    :return dict
+        (str -> int)
+    """
+
+    dg_genes = set(["TP53", "KRAS", "CDKN2A", "SMAD4"])
+    mutid2dg = {}
+    for mut in pjson["mutations"]:
+        mutid2dg[mut["id"]] = mut["gene"] in dg_genes
+    return mutid2dg
+
+
+def mark_missense_mutations(pjson):
+    """
+    Create a dictionary mapping mutation identifiers to their missense status
+    (1 - is a missense mutation, 0 - not a missense mutation)
+
+    :param pjson: dict
+        json representation of a sample
+
+    :return dict
+        (str -> int)
+    """
+
+    mut2missense = {}
+    for mut in pjson["mutations"]:
+        mut2missense[mut["id"]] = mut["missense"]
+    return mut2missense
+
+
+def map_neoantigen_qualities(pjson):
+    neoid2quality = {}
+    for neo in pjson["neoantigens"]:
+        neoid2quality[neo["id"]] = neo["quality"]
+    return neoid2quality
+
+
+def get_property(tree, property):
+    """
+    Auxiliary function to extract a clone attribute's values into a list
+
+    :param tree: dict
+        json representation of a tree
+
+    :param property: str
+        name of the attribute
+
+    :return list
""" + + nodes = [tree["topology"]] + vals = [] + while nodes: + node = nodes[0] + nodes = nodes[1:] + if "children" in node: + for child in node["children"]: + nodes.append(child) + vals.append((node["clone_id"], node[property])) + vals.sort(key=lambda x: x[0]) + vals = [x for (_, x) in vals] + return vals + + +def compute_effective_sample_size(sample_json): + """ + Computes the effective cancer cell population size for each sample (see Methods, p.10) + + :param sample_json: dict + json representation of a sample + + :return float + """ + + mut_freqs = {} + for mut in sample_json["mutations"]: + mut_freqs[mut["id"]] = [] + for tree in sample_json["sample_trees"]: + clone_muts_list = get_property(tree, "clone_mutations") + freqs = get_property(tree, "X") + for clone_muts, X in zip(clone_muts_list, freqs): + for mid in clone_muts: + mut_freqs[mid].append(X) + avev = np.mean( + [np.var(mut_freqs[mid]) if mut_freqs[mid] else 0 for mid in mut_freqs] + ) + n = 1 / avev + return n + + +INF = float("inf") + + +def log_sum2(v1, v2): + ma = max(v1, v2) + if ma == -INF: + return -INF + return ma + np.log(np.exp(v1 - ma) + np.exp(v2 - ma)) + + +def log_sum(v): + if len(v): + ma = max(v) + if ma == -INF: + return -INF + return np.log(sum(list(map(lambda x: np.exp(x - ma), v)))) + ma + return -INF + + +def compute_R(scores, a, k): + """ + Computes the value of the R component given the alignment scores and parameters a and k + + :param scores: list + list of alignment scores for a given neoantigen and IEDB epitopes + + :param a: float + shift parameter of the sigmoid function + + :param k: float + slope parameter of the sigmoid function + + :return float + + """ + v = [-k * (a - score) for score in scores] + lgb = log_sum(v) + lZ = log_sum2(0, lgb) + bindProb = np.exp(lgb - lZ) + return bindProb + + +def set_immune_fitness(tree, neo2qualities): + """ + Sets the value of the immune fitness component for each clone in the tree + as the negative of the max quality neoantigen. + + :param tree: dict + json representation of a tree + + :param neo2qualities: dict + mapping neoantigen identifiers to their qualities (str->float) + """ + + nodes = [tree["topology"]] + while len(nodes) > 0: + node = nodes[0] + nodes = nodes[1:] + if node["neoantigens"]: + node["F_I"] = -max([neo2qualities[neoid] for neoid in node["neoantigens"]]) + else: + node["F_I"] = 0 + if "children" in node: + for child in node["children"]: + nodes.append(child) + + +def set_driver_gene_fitness(tree, mut2dg): + """ + Sets the driver-gene fitness component for each clone in the tree + as the number of driver gene mutations in that clone. + + :param tree: dict + json representation of a tree + + :param mut2dg: dict + mapping mutation identifiers to 1 if this is a mutation in a driver gene or to 0. + """ + nodes = [tree["topology"]] + while len(nodes) > 0: + node = nodes[0] + nodes = nodes[1:] + if node["all_mutations"]: + node["F_P"] = sum([mut2dg[mut_id] for mut_id in node["all_mutations"]]) + else: + node["F_P"] = 0 + if "children" in node: + for child in node["children"]: + nodes.append(child) + + +def clean_data(tree): + """ + Removes no longer needed clone attributes. 
+ """ + nodes = [tree["topology"]] + while len(nodes) > 0: + node = nodes[0] + nodes = nodes[1:] + del node["all_mutations"] + del node["clone_mutations"] + del node["neoantigens"] + if "children" in node: + for child in node["children"]: + nodes.append(child) + + +if __name__ == "__main__": + + """ + Computes components contributing to neoantigen quality score, fitness of clones and annotates clones with neoantigens in *_annotated.json + files. + + Run as: + + python compute_fitness.py + + """ + + a = 22.897590714815188 + k = 1 + w = 0.22402192838740312 + + parser = argparse.ArgumentParser(prog="align_neoantigens_to_IEDB") + parser.add_argument("--alignment", help="neoantigen alignment file", required=True) + parser.add_argument("--input", help="patient_data file", required=True) + + args = parser.parse_args() + + alignment_file = args.alignment + patient_file = args.input + + epidist = EpitopeDistance() + + sample_file = patient_file + output_file = patient_file.replace(".json", "_annotated.json") + + norm = 1 + + with open(sample_file) as f: + sjson = json.load(f) + patient = sjson["patient"] + neoantigens = sjson["neoantigens"] + nalist = [neo["sequence"] for neo in neoantigens] + + alignments = pd.read_csv(alignment_file, sep="\t") + naseq2scores = defaultdict(list) + for r in alignments.itertuples(): + naseq2scores[r.Peptide].append(r.Alignment_score) + + mut2neo = defaultdict(list) + for neo in neoantigens: + score_list = naseq2scores[neo["sequence"]] + neo["R"] = compute_R(score_list, a, k) + neo["logC"] = epidist.epitope_dist(neo["sequence"], neo["WT_sequence"]) + neo["logA"] = np.log(neo["KdWT"] / neo["Kd"]) + neo["quality"] = (w * neo["logC"] + (1 - w) * neo["logA"]) * neo["R"] + mut2neo[neo["mutation_id"]].append(neo) + + mut2dg = mark_driver_gene_mutations(sjson) + mut2missense = mark_missense_mutations(sjson) + neo2qualities = map_neoantigen_qualities(sjson) + + for tree in sjson["sample_trees"]: + fill_up_clone_mutations(tree, mut2missense) + fill_up_clone_neoantigens(tree, mut2neo) + set_immune_fitness(tree, neo2qualities) + set_driver_gene_fitness(tree, mut2dg) + + neff = compute_effective_sample_size(sjson) + sjson["Effective_N"] = neff / norm + + for tree in sjson["sample_trees"]: + clean_data(tree) + + with open(output_file, "w") as of: + json.dump(sjson, of, indent=True) diff --git a/modules/msk/neoantigenediting/computefitness/resources/usr/bin/distance_data/epitope_distance_model_parameters.json b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/distance_data/epitope_distance_model_parameters.json new file mode 100644 index 0000000..c2c3261 --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/resources/usr/bin/distance_data/epitope_distance_model_parameters.json @@ -0,0 +1,432 @@ +{ + "terms_of_use": "Use of these data is subject to the terms of use found at https://github.com/LukszaLab/NeoantigenEditing", + "d_i": [ + 0.4265316417189761, 0.7769577054740209, 1.1858350996115459, 1.2011211326454208, 1.6962418186727721, + 1.4795549981576157, 1.38899685274111, 0.7822985074612253, 0.9749012286522828 + ], + "euclid_coords": { + "A": [3.5312316688405128, -4.038507673277327], + "C": [7.272490502087666, -3.5962613422282153], + "D": [-3.3267651105489104, -4.134381581418677], + "E": [-2.791220079193741, -1.9929601506592383], + "F": [3.7349203453934803, 4.27588200724904], + "G": [1.0828701007327262, -7.270285095315686], + "H": [-1.6594024299600756, 1.7960492980168123], + "I": [6.94579315511744, -0.0012785445892339589], + "K": [-2.4118355486039613, 
-0.9046522878973625], + "L": [6.454412382783402, 0.772812962193054], + "M": [4.769499796942178, 0.5052384809555668], + "N": [-2.1016216436806427, -3.1474943437753105], + "P": [-0.9992387821861454, -6.803979888540144], + "Q": [-1.5702795695104803, -0.9551312866936055], + "R": [-2.6578083660129788, -0.06342218626699866], + "S": [0.8142701713381114, -3.464839915690124], + "T": [2.606749707538103, -2.6220589959640557], + "V": [6.508726391948886, -0.5670130202032231], + "W": [2.7313743259108967, 5.737800116112936], + "Y": [2.2822689847157145, 4.075146976868077] + }, + "M_ab": { + "A->A": 0.0, + "A->C": 3.7673066605568826, + "A->D": 6.858666899068506, + "A->E": 6.645123081893246, + "A->F": 8.316884322665505, + "A->G": 4.054486363743763, + "A->H": 7.80927252686884, + "A->I": 5.287574981132023, + "A->K": 6.718712490316743, + "A->L": 5.629723949153348, + "A->M": 4.709451886491528, + "A->N": 5.702888846341898, + "A->P": 5.307824326502934, + "A->Q": 5.96092500006195, + "A->R": 7.355645531294955, + "A->S": 2.7768641441391444, + "A->T": 1.6914472360189268, + "A->V": 4.573483349970818, + "A->W": 9.808973736427841, + "A->Y": 8.209220399536266, + "C->A": 3.7673066605568826, + "C->C": 0.0, + "C->D": 10.612906903098432, + "C->E": 10.19062536720303, + "C->F": 8.630471802181635, + "C->G": 7.19790603239027, + "C->H": 10.433394720342184, + "C->I": 3.609796735529243, + "C->K": 10.05141434612365, + "C->L": 4.445004171745916, + "C->M": 4.804920734998789, + "C->N": 9.384847912477928, + "C->P": 8.871919951434778, + "C->Q": 9.22876755094634, + "C->R": 10.540008923700857, + "C->S": 6.459557371202782, + "C->T": 4.766362069061748, + "C->V": 3.124048817228584, + "C->W": 10.380098238149005, + "C->Y": 9.151656483431939, + "D->A": 6.858666899068506, + "D->C": 10.612906903098432, + "D->D": 0.0, + "D->E": 2.2073726972863037, + "D->F": 10.981800176179599, + "D->G": 5.410986374502014, + "D->H": 6.160365957017682, + "D->I": 11.07284940900755, + "D->K": 3.3568210278104593, + "D->L": 10.943125305182395, + "D->M": 9.331429674784955, + "D->N": 1.5731887789560899, + "D->P": 3.541769886738501, + "D->Q": 3.6321995942385272, + "D->R": 4.125556147119299, + "D->S": 4.194813374630591, + "D->T": 6.123211387738817, + "D->V": 10.462457242314292, + "D->W": 11.582790030932431, + "D->Y": 9.94271706482523, + "E->A": 6.645123081893246, + "E->C": 10.19062536720303, + "E->D": 2.2073726972863037, + "E->E": 0.0, + "E->F": 9.049248081592172, + "E->G": 6.546658177551038, + "E->H": 3.9544410215949877, + "E->I": 9.93862275896961, + "E->K": 1.1525392080966916, + "E->L": 9.650451830548775, + "E->M": 7.9627558953165725, + "E->N": 1.344803035889008, + "E->P": 5.133917401663634, + "E->Q": 1.6024308032067456, + "E->R": 1.9341446277992693, + "E->S": 3.8943536035377697, + "E->T": 5.434504869409326, + "E->V": 9.408630590313022, + "E->W": 9.500721197130058, + "E->Y": 7.909628018608502, + "F->A": 8.316884322665505, + "F->C": 8.630471802181635, + "F->D": 10.981800176179599, + "F->E": 9.049248081592172, + "F->F": 0.0, + "F->G": 11.846828489539009, + "F->H": 5.937026904981663, + "F->I": 5.3482526666590955, + "F->K": 8.038690415955843, + "F->L": 4.434763767762229, + "F->M": 3.910000926335655, + "F->N": 9.44307888557397, + "F->P": 12.04888386013437, + "F->Q": 7.4504124864668855, + "F->R": 7.726353684727768, + "F->S": 8.273389470312676, + "F->T": 6.989589334998694, + "F->V": 5.581006381543492, + "F->W": 1.7732200005198253, + "F->Y": 1.4664550890160293, + "G->A": 4.054486363743763, + "G->C": 7.19790603239027, + "G->D": 5.410986374502014, + "G->E": 6.546658177551038, + 
"G->F": 11.846828489539009, + "G->G": 0.0, + "G->H": 9.471983845230948, + "G->I": 9.338753823510922, + "G->K": 7.26183507210032, + "G->L": 9.671860878370772, + "G->M": 8.605231281152383, + "G->N": 5.2094521066253945, + "G->P": 2.1336864685786554, + "G->Q": 6.849844582124883, + "G->R": 8.11982440582287, + "G->S": 3.8149127037464723, + "G->T": 4.891647465499837, + "G->V": 8.624023017473489, + "G->W": 13.112125954558945, + "G->Y": 11.408654021725596, + "H->A": 7.80927252686884, + "H->C": 10.433394720342184, + "H->D": 6.160365957017682, + "H->E": 3.9544410215949877, + "H->F": 5.937026904981663, + "H->G": 9.471983845230948, + "H->H": 0.0, + "H->I": 8.790891788166004, + "H->K": 2.803559283159802, + "H->L": 8.178080668130237, + "H->M": 6.557207973559407, + "H->N": 4.963283345859569, + "H->P": 8.625330025655463, + "H->Q": 2.7526237472532045, + "H->R": 2.110556470225403, + "H->S": 5.8134337062799055, + "H->T": 6.141639435657454, + "H->V": 8.503081322149713, + "H->W": 5.900535571611977, + "H->Y": 4.553137354727778, + "I->A": 5.287574981132023, + "I->C": 3.609796735529243, + "I->D": 11.07284940900755, + "I->E": 9.93862275896961, + "I->F": 5.3482526666590955, + "I->G": 9.338753823510922, + "I->H": 8.790891788166004, + "I->I": 0.0, + "I->K": 9.401132861352892, + "I->L": 0.9168820667305412, + "I->M": 2.234460175971931, + "I->N": 9.578851100030663, + "I->P": 10.4594587842663, + "I->Q": 8.569324927015641, + "I->R": 9.60380258068956, + "I->S": 7.042146794266393, + "I->T": 5.069101322156581, + "I->V": 0.7149005891484997, + "I->W": 7.120277378143191, + "I->Y": 6.194005385847643, + "K->A": 6.718712490316743, + "K->C": 10.05141434612365, + "K->D": 3.3568210278104593, + "K->E": 1.1525392080966916, + "K->F": 8.038690415955843, + "K->G": 7.26183507210032, + "K->H": 2.803559283159802, + "K->I": 9.401132861352892, + "K->K": 0.0, + "K->L": 9.02353822212172, + "K->M": 7.318426697404776, + "K->N": 2.264193709562544, + "K->P": 6.066093946206223, + "K->Q": 0.843068559055228, + "K->R": 0.8764534846716978, + "K->S": 4.118533574677532, + "K->T": 5.304308039139411, + "K->V": 8.926949401129649, + "K->W": 8.400879820164631, + "K->Y": 6.843465356607772, + "L->A": 5.629723949153348, + "L->C": 4.445004171745916, + "L->D": 10.943125305182395, + "L->E": 9.650451830548775, + "L->F": 4.434763767762229, + "L->G": 9.671860878370772, + "L->H": 8.178080668130237, + "L->I": 0.9168820667305412, + "L->K": 9.02353822212172, + "L->L": 0.0, + "L->M": 1.706026531134751, + "L->N": 9.4114041266561, + "L->P": 10.628485573776425, + "L->Q": 8.208621824427695, + "L->R": 9.15051125338444, + "L->S": 7.0547080789185435, + "L->T": 5.131243872079878, + "L->V": 1.3409264240426382, + "L->W": 6.205812582692975, + "L->Y": 5.3209200781931925, + "M->A": 4.709451886491528, + "M->C": 4.804920734998789, + "M->D": 9.331429674784955, + "M->E": 7.9627558953165725, + "M->F": 3.910000926335655, + "M->G": 8.605231281152383, + "M->H": 6.557207973559407, + "M->I": 2.234460175971931, + "M->K": 7.318426697404776, + "M->L": 1.706026531134751, + "M->M": 0.0, + "M->N": 7.781694348961095, + "M->P": 9.31144553586053, + "M->Q": 6.505803737707016, + "M->R": 7.449045677262714, + "M->S": 5.604048881547064, + "M->T": 3.8022989701449266, + "M->V": 2.0431917263232804, + "M->W": 5.615483656942405, + "M->Y": 4.350926772824042, + "N->A": 5.702888846341898, + "N->C": 9.384847912477928, + "N->D": 1.5731887789560899, + "N->E": 1.344803035889008, + "N->F": 9.44307888557397, + "N->G": 5.2094521066253945, + "N->H": 4.963283345859569, + "N->I": 9.578851100030663, + "N->K": 2.264193709562544, + 
"N->L": 9.4114041266561, + "N->M": 7.781694348961095, + "N->N": 0.0, + "N->P": 3.8190489015448565, + "N->Q": 2.2558324791172084, + "N->R": 3.133822704432521, + "N->S": 2.9331098323975704, + "N->T": 4.737598873449183, + "N->V": 8.988713876632943, + "N->W": 10.114658060508463, + "N->Y": 8.448967078194036, + "P->A": 5.307824326502934, + "P->C": 8.871919951434778, + "P->D": 3.541769886738501, + "P->E": 5.133917401663634, + "P->F": 12.04888386013437, + "P->G": 2.1336864685786554, + "P->H": 8.625330025655463, + "P->I": 10.4594587842663, + "P->K": 6.066093946206223, + "P->L": 10.628485573776425, + "P->M": 9.31144553586053, + "P->N": 3.8190489015448565, + "P->P": 0.0, + "P->Q": 5.876658706110999, + "P->R": 6.941611570958817, + "P->S": 3.799825059499118, + "P->T": 5.521921344947621, + "P->V": 9.760599201522258, + "P->W": 13.084866061501023, + "P->Y": 11.363260737036889, + "Q->A": 5.96092500006195, + "Q->C": 9.22876755094634, + "Q->D": 3.6321995942385272, + "Q->E": 1.6024308032067456, + "Q->F": 7.4504124864668855, + "Q->G": 6.849844582124883, + "Q->H": 2.7526237472532045, + "Q->I": 8.569324927015641, + "Q->K": 0.843068559055228, + "Q->L": 8.208621824427695, + "Q->M": 6.505803737707016, + "Q->N": 2.2558324791172084, + "Q->P": 5.876658706110999, + "Q->Q": 0.0, + "Q->R": 1.4063655296564266, + "Q->S": 3.4618946935227064, + "Q->T": 4.497357176081819, + "Q->V": 8.088323257269053, + "Q->W": 7.956101872064475, + "Q->Y": 6.3360736873501935, + "R->A": 7.355645531294955, + "R->C": 10.540008923700857, + "R->D": 4.125556147119299, + "R->E": 1.9341446277992693, + "R->F": 7.726353684727768, + "R->G": 8.11982440582287, + "R->H": 2.110556470225403, + "R->I": 9.60380258068956, + "R->K": 0.8764534846716978, + "R->L": 9.15051125338444, + "R->M": 7.449045677262714, + "R->N": 3.133822704432521, + "R->P": 6.941611570958817, + "R->Q": 1.4063655296564266, + "R->R": 0.0, + "R->S": 4.860552637259271, + "R->T": 5.853391669257117, + "R->V": 9.180357465640842, + "R->W": 7.918173418697156, + "R->Y": 6.444541795134483, + "S->A": 2.7768641441391444, + "S->C": 6.459557371202782, + "S->D": 4.194813374630591, + "S->E": 3.8943536035377697, + "S->F": 8.273389470312676, + "S->G": 3.8149127037464723, + "S->H": 5.8134337062799055, + "S->I": 7.042146794266393, + "S->K": 4.118533574677532, + "S->L": 7.0547080789185435, + "S->M": 5.604048881547064, + "S->N": 2.9331098323975704, + "S->P": 3.799825059499118, + "S->Q": 3.4618946935227064, + "S->R": 4.860552637259271, + "S->S": 0.0, + "S->T": 1.9807227383836576, + "S->V": 6.3893843494236595, + "S->W": 9.400205949574959, + "S->Y": 7.681563828806445, + "T->A": 1.6914472360189268, + "T->C": 4.766362069061748, + "T->D": 6.123211387738817, + "T->E": 5.434504869409326, + "T->F": 6.989589334998694, + "T->G": 4.891647465499837, + "T->H": 6.141639435657454, + "T->I": 5.069101322156581, + "T->K": 5.304308039139411, + "T->L": 5.131243872079878, + "T->M": 3.8022989701449266, + "T->N": 4.737598873449183, + "T->P": 5.521921344947621, + "T->Q": 4.497357176081819, + "T->R": 5.853391669257117, + "T->S": 1.9807227383836576, + "T->T": 0.0, + "T->V": 4.410060771483332, + "T->W": 8.360787981361641, + "T->Y": 6.705061937224871, + "V->A": 4.573483349970818, + "V->C": 3.124048817228584, + "V->D": 10.462457242314292, + "V->E": 9.408630590313022, + "V->F": 5.581006381543492, + "V->G": 8.624023017473489, + "V->H": 8.503081322149713, + "V->I": 0.7149005891484997, + "V->K": 8.926949401129649, + "V->L": 1.3409264240426382, + "V->M": 2.0431917263232804, + "V->N": 8.988713876632943, + "V->P": 9.760599201522258, + "V->Q": 
8.088323257269053, + "V->R": 9.180357465640842, + "V->S": 6.3893843494236595, + "V->T": 4.410060771483332, + "V->V": 0.0, + "V->W": 7.349765799987526, + "V->Y": 6.2779448590733224, + "W->A": 9.808973736427841, + "W->C": 10.380098238149005, + "W->D": 11.582790030932431, + "W->E": 9.500721197130058, + "W->F": 1.7732200005198253, + "W->G": 13.112125954558945, + "W->H": 5.900535571611977, + "W->I": 7.120277378143191, + "W->K": 8.400879820164631, + "W->L": 6.205812582692975, + "W->M": 5.615483656942405, + "W->N": 10.114658060508463, + "W->P": 13.084866061501023, + "W->Q": 7.956101872064475, + "W->R": 7.918173418697156, + "W->S": 9.400205949574959, + "W->T": 8.360787981361641, + "W->V": 7.349765799987526, + "W->W": 0.0, + "W->Y": 1.7222401310301723, + "Y->A": 8.209220399536266, + "Y->C": 9.151656483431939, + "Y->D": 9.94271706482523, + "Y->E": 7.909628018608502, + "Y->F": 1.4664550890160293, + "Y->G": 11.408654021725596, + "Y->H": 4.553137354727778, + "Y->I": 6.194005385847643, + "Y->K": 6.843465356607772, + "Y->L": 5.3209200781931925, + "Y->M": 4.350926772824042, + "Y->N": 8.448967078194036, + "Y->P": 11.363260737036889, + "Y->Q": 6.3360736873501935, + "Y->R": 6.444541795134483, + "Y->S": 7.681563828806445, + "Y->T": 6.705061937224871, + "Y->V": 6.2779448590733224, + "Y->W": 1.7222401310301723, + "Y->Y": 0.0 + }, + "blosum62_reg": 0.1 +} diff --git a/modules/msk/neoantigenediting/computefitness/tests/main.nf.test b/modules/msk/neoantigenediting/computefitness/tests/main.nf.test new file mode 100644 index 0000000..236520e --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/tests/main.nf.test @@ -0,0 +1,66 @@ +nextflow_process { + + name "Test Process NEOANTIGENEDITING_COMPUTEFITNESS" + script "../main.nf" + process "NEOANTIGENEDITING_COMPUTEFITNESS" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "neoantigenediting" + tag "neoantigenediting/computefitness" + + test("neoantigenediting_computefitness - json") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file(params.test_data_mskcc['neoantigen']['patient_data'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['iedb_alignments'], checkIfExists: true) + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions,file(process.out.annotated_output[0][1]).name).match() } + + ) + } + + } + + test("neoantigenediting_computefitness - json - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file('patient_data'), + file('alignment_file') + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions,process.out).match() } + + ) + } + + } + + +} diff --git a/modules/msk/neoantigenediting/computefitness/tests/main.nf.test.snap b/modules/msk/neoantigenediting/computefitness/tests/main.nf.test.snap new file mode 100644 index 0000000..bddbaf6 --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/tests/main.nf.test.snap @@ -0,0 +1,45 @@ +{ + "neoantigenediting_computefitness - json - stub": { + "content": [ + [ + "versions.yml:md5,d615e02977a234e01fe78e023a51362a" + ], + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "patient_data_annotated.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,d615e02977a234e01fe78e023a51362a" + ], + "annotated_output": [ + [ + { + "id": "test", + "single_end": false + }, + 
"patient_data_annotated.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,d615e02977a234e01fe78e023a51362a" + ] + } + ], + "timestamp": "2024-06-19T14:54:10.949451" + }, + "neoantigenediting_computefitness - json": { + "content": [ + [ + "versions.yml:md5,d615e02977a234e01fe78e023a51362a" + ], + "3_OLTS_primary_tumor_annotated.json" + ], + "timestamp": "2024-06-19T14:54:04.735114" + } +} \ No newline at end of file diff --git a/modules/msk/neoantigenediting/computefitness/tests/tags.yml b/modules/msk/neoantigenediting/computefitness/tests/tags.yml new file mode 100644 index 0000000..47db9d4 --- /dev/null +++ b/modules/msk/neoantigenediting/computefitness/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenediting/computefitness: + - "modules/msk/neoantigenediting/computefitness/**" diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/environment.yml b/modules/msk/neoantigenutils/formatnetmhcpan/environment.yml new file mode 100644 index 0000000..4f886c1 --- /dev/null +++ b/modules/msk/neoantigenutils/formatnetmhcpan/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenutils_formatnetmhcpan" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENUTILS" diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/main.nf b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf new file mode 100644 index 0000000..d6d67d1 --- /dev/null +++ b/modules/msk/neoantigenutils/formatnetmhcpan/main.nf @@ -0,0 +1,48 @@ +process NEOANTIGENUTILS_FORMATNETMHCPAN { + tag "$meta.id" + label 'process_single' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/neoantigen-utils-base:1.0.0': + 'docker.io/mskcc/neoantigen-utils-base:1.0.0' }" + + input: + tuple val(meta), path(netmhcPanOutput) + + output: + tuple val(meta), path("*.tsv"), emit: netMHCpanreformatted + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def netmhcOutputType = meta.typeMut ? "--type_MUT": "" + def netmhcOutputFrom = meta.fromStab ? "--from_STAB": "" + """ + format_netmhcpan_output.py \ + --netMHCpan_output ${netmhcPanOutput} \ + --id ${prefix} \ + ${netmhcOutputType} \ + ${netmhcOutputFrom} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + formatNetmhcpanOutput: \$(echo \$(format_netmhcpan_output.py -v)) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def netmhcOutputType = meta.typeMut ? "MUT": "WT" + def netmhcOutputFrom = meta.fromStab ? 
"STAB": "PAN" + """ + touch ${prefix}.${netmhcOutputType}.${netmhcOutputFrom}.tsv + cat <<-END_VERSIONS > versions.yml + "${task.process}": + formatNetmhcpanOutput: \$(echo \$(format_netmhcpan_output.py -v)) + END_VERSIONS + """ +} diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/meta.yml b/modules/msk/neoantigenutils/formatnetmhcpan/meta.yml new file mode 100644 index 0000000..b7776bc --- /dev/null +++ b/modules/msk/neoantigenutils/formatnetmhcpan/meta.yml @@ -0,0 +1,51 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "neoantigenutils_formatnetmhcpan" +description: Takes the standard out of netmhcpan tools and converts them to a tsv for downstream processing +keywords: + - neoantigen + - tsv + - peptides + - netmhc +tools: + - neoantigen_utils: + description: "Collection of helper scripts for neoantigen processing" + documentation: "https://github.com/mskcc-omics-workflows/modules" + licence: [""] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + typeMut indicated if a mutated fasta was used + fromStab indicates if the output was from netmhcstabpan + e.g. `[ id:'sample1', typeMut: false, fromStab: false ]` + + - netmhcOutput: + type: file + description: Maf outputtted by Tempo that was run through phyloWGS + pattern: "*.{output}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - netMHCpanreformatted: + type: file + description: A reformatted file of neoantigens and their binding affinities output by netmhcpan or netmhcstabpan. 
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
new file mode 100755
index 0000000..a11be09
--- /dev/null
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/resources/usr/bin/format_netmhcpan_output.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+
+import argparse
+
+VERSION = 1.0
+
+PAN_HEADER = [
+    "pos",
+    "MHC",
+    "peptide",
+    "core",
+    "OF",
+    "Gp",
+    "Gl",
+    "Ip",
+    "Il",
+    "icore",
+    "Identity",
+    "score_el",
+    "rank_el",
+    "score_ba",
+    "rank_ba",
+    "affinity",
+]
+STAB_PAN_HEADER = [
+    "pos",
+    "HLA",
+    "peptide",
+    "Identity",
+    "Pred",
+    "Thalf(h)",
+    "%Rank_Stab",
+]
+
+
+def netMHCpan_out_reformat(netMHCpanoutput, mut, stab, prefix):
+    file_li = []
+    stab_prefix = ""
+    type_prefix = "WT"
+    if stab:
+        stab_prefix = "stab"
+    if mut:
+        type_prefix = "MUT"
+    outfilename = "{}_netmHC{}panoutput.{}.tsv".format(prefix, stab_prefix, type_prefix)
+    with open(netMHCpanoutput, "r") as file:
+        for line in file:
+            # Remove leading whitespace
+            line = line.lstrip()
+            # Data lines start with a digit; header and footer lines do not
+            if line == "":
+                pass
+            elif line[0].isdigit():
+                match = (
+                    line.strip().replace(" <= WB", "").replace(" <= SB", "")
+                )  # strip surrounding whitespace and drop the binder tags
+                splititem = match.split()
+                tab_separated_line = "\t".join(splititem)
+                file_li.append(tab_separated_line)
+    if stab:
+        header = "\t".join(STAB_PAN_HEADER) + "\n"
+    else:
+        header = "\t".join(PAN_HEADER) + "\n"
+    with open(outfilename, "w") as file:
+        file.writelines(header)
+        for item in file_li:
+            file.writelines(item)
+            file.writelines("\n")
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description="Process input files and parameters")
+    parser.add_argument(
+        "--netMHCpan_output", required=True, help="Path to netMHC output"
+    )
+    parser.add_argument("--type_MUT", action="store_true", help="Output is a MUT type")
+    parser.add_argument(
+        "--from_STAB",
+        action="store_true",
+        help="Output is from netmhcstab",
+    )
+    parser.add_argument("--id", required=True, help="Prefix to label the output")
+    parser.add_argument(
+        "-v", "--version", action="version", version="%(prog)s {}".format(VERSION)
+    )
+
+    return parser.parse_args()
+
+
+def main(args):
+    netMHCpan_out_reformat(
+        args.netMHCpan_output, args.type_MUT, args.from_STAB, args.id
+    )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test
new file mode 100644
index 0000000..93e87c7
--- /dev/null
+++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test
@@ -0,0 +1,133 @@
+nextflow_process {
+
+    name "Test Process NEOANTIGENUTILS_FORMATNETMHCPAN"
+    script "../main.nf"
+    process "NEOANTIGENUTILS_FORMATNETMHCPAN"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "neoantigenutils"
+    tag "neoantigenutils/formatnetmhcpan"
+    tag "modules_msk"
+
+    test("neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv") {
+
+        when {
+
+            process {
+                """
+
+                input[0] = [
+                    [ id:'test', typeMut: true, fromStab: false ], // meta map
+                    file(params.test_data_mskcc['neoantigen']['MUTnetMHCpan'], checkIfExists: true)
+                ]
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert
process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("neoantigenutils_formatnetmhcpan - output(WT,netmhcpan) - tsv") { + + when { + + process { + """ + + input[0] = [ + [ id:'test', typeMut: false, fromStab: false], // meta map + file(params.test_data_mskcc['neoantigen']['WTnetMHCpan'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("neoantigenutils_formatnetmhcpan - output(MUT,netmhcpanstab) - tsv") { + + when { + + process { + """ + + input[0] = [ + [ id:'test', typeMut: true, fromStab: true ], // meta map + file(params.test_data_mskcc['neoantigen']['MUTnetMHCpanstab'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("neoantigenutils_formatnetmhcpan - output(WT,netmhcpanstab) - tsv") { + + when { + + process { + """ + + input[0] = [ + [ id:'test', typeMut: false, fromStab: false ], // meta map + file(params.test_data_mskcc['neoantigen']['WTnetMHCpanstab'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + +} + test("neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv - stub") { + + options "-stub" + + when { + + process { + """ + + input[0] = [ + [ id:'test', typeMut: true, fromStab: false ], // meta map + file('MUTnetMHCpan') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } +} diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap new file mode 100644 index 0000000..685825a --- /dev/null +++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/main.nf.test.snap @@ -0,0 +1,167 @@ +{ + "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": false + }, + "test_netmHCpanoutput.MUT.tsv:md5,7f00f2df190fe801700b626b72dfdb99" + ] + ], + "1": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ], + "netMHCpanreformatted": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": false + }, + "test_netmHCpanoutput.MUT.tsv:md5,7f00f2df190fe801700b626b72dfdb99" + ] + ], + "versions": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ] + } + ], + "timestamp": "2024-07-30T13:46:27.878268" + }, + "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpan) - tsv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": false + }, + "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ], + "netMHCpanreformatted": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": false + }, + "test.MUT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ] + } + ], + "timestamp": "2024-07-30T13:47:05.72509" + }, + "neoantigenutils_formatnetmhcpan - output(WT,netmhcpan) - tsv": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "typeMut": false, + "fromStab": false + }, + "test_netmHCpanoutput.WT.tsv:md5,a1d7db1b6f116e96457f2fa60660558e" + ] + ], + "1": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ], + "netMHCpanreformatted": [ + [ + { + "id": "test", + "typeMut": false, + 
"fromStab": false + }, + "test_netmHCpanoutput.WT.tsv:md5,a1d7db1b6f116e96457f2fa60660558e" + ] + ], + "versions": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ] + } + ], + "timestamp": "2024-07-30T13:46:37.183992" + }, + "neoantigenutils_formatnetmhcpan - output(MUT,netmhcpanstab) - tsv": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": true + }, + "test_netmHCstabpanoutput.MUT.tsv:md5,246eb723691371ad49bd080071475740" + ] + ], + "1": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ], + "netMHCpanreformatted": [ + [ + { + "id": "test", + "typeMut": true, + "fromStab": true + }, + "test_netmHCstabpanoutput.MUT.tsv:md5,246eb723691371ad49bd080071475740" + ] + ], + "versions": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ] + } + ], + "timestamp": "2024-07-30T13:46:47.110076" + }, + "neoantigenutils_formatnetmhcpan - output(WT,netmhcpanstab) - tsv": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "typeMut": false, + "fromStab": false + }, + "test_netmHCpanoutput.WT.tsv:md5,b95a6624d4010eb6517ca880a13e670d" + ] + ], + "1": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ], + "netMHCpanreformatted": [ + [ + { + "id": "test", + "typeMut": false, + "fromStab": false + }, + "test_netmHCpanoutput.WT.tsv:md5,b95a6624d4010eb6517ca880a13e670d" + ] + ], + "versions": [ + "versions.yml:md5,2c02f5f3103ee1532c27f7f3b873a578" + ] + } + ], + "timestamp": "2024-07-30T13:46:56.841519" + } +} \ No newline at end of file diff --git a/modules/msk/neoantigenutils/formatnetmhcpan/tests/tags.yml b/modules/msk/neoantigenutils/formatnetmhcpan/tests/tags.yml new file mode 100644 index 0000000..89a2d4b --- /dev/null +++ b/modules/msk/neoantigenutils/formatnetmhcpan/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenutils/formatnetmhcpan: + - "modules/msk/neoantigenutils/formatnetmhcpan/**" diff --git a/modules/msk/neoantigenutils/generatehlastring/environment.yml b/modules/msk/neoantigenutils/generatehlastring/environment.yml new file mode 100644 index 0000000..21ef0ea --- /dev/null +++ b/modules/msk/neoantigenutils/generatehlastring/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenutils_generatehlastring" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENUTILS" diff --git a/modules/msk/neoantigenutils/generatehlastring/main.nf b/modules/msk/neoantigenutils/generatehlastring/main.nf new file mode 100644 index 0000000..2362e69 --- /dev/null +++ b/modules/msk/neoantigenutils/generatehlastring/main.nf @@ -0,0 +1,43 @@ +process NEOANTIGENUTILS_GENERATEHLASTRING { + tag "$meta.id" + label 'process_single' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'docker://mskcc/neoantigen-utils-base:1.0.0': + 'docker.io/mskcc/neoantigen-utils-base:1.0.0' }" + + input: + tuple val(meta), path(inputHLA) + + output: + tuple val(meta), stdout, emit: hlastring + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + generateHLAString.sh -f ${inputHLA} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generateHLAstring: \$(echo \$(generateHLAString.sh -v)) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + echo "HLA-test:01,HLA-test2:02" + cat <<-END_VERSIONS > versions.yml + "${task.process}": + generateHLAstring: \$(echo \$(generateHLAString.sh -v)) + END_VERSIONS + """ +} diff --git a/modules/msk/neoantigenutils/generatehlastring/meta.yml b/modules/msk/neoantigenutils/generatehlastring/meta.yml new file mode 100644 index 0000000..08709c2 --- /dev/null +++ b/modules/msk/neoantigenutils/generatehlastring/meta.yml @@ -0,0 +1,49 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "neoantigenutils_generatehlastring" +description: Generate the hla string for netmhc tools +keywords: + - neoantigen + - string + - netmhc + - hla +tools: + - neoantigen_utils: + description: "Collection of helper scripts for neoantigen processing" + documentation: "https://github.com/mskcc-omics-workflows/modules" + licence: [""] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information. + e.g. `[ id:'sample1', single_end:false ]` + + - inputHLA: + type: file + description: Winners HLA file from polysolver + pattern: "*.{hla.txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - hlastring:
+      type: string
+      description: HLA string to use for netmhc tool input
+
+authors:
+  - "@johnoooh"
+  - "@nikhil"
+maintainers:
+  - "@johnoooh"
+  - "@nikhil"
diff --git a/modules/msk/neoantigenutils/generatehlastring/resources/usr/bin/generateHLAString.sh b/modules/msk/neoantigenutils/generatehlastring/resources/usr/bin/generateHLAString.sh
new file mode 100755
index 0000000..262494f
--- /dev/null
+++ b/modules/msk/neoantigenutils/generatehlastring/resources/usr/bin/generateHLAString.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+### Versioning
+
+VERSION=1.0.0
+
+get_help() { echo "USAGE: generateHLAString.sh -f [HLA_FILE]"; exit 0; }
+get_version() { echo $VERSION; exit 0; }
+
+while (( "$#" )); do
+    case $1 in
+        -h|--help) get_help ;;
+        -f) file=$2; shift ;;
+        -v) get_version ;;
+        *) get_help ;;
+    esac
+    shift
+done
+
+cat $file | tr "\t" "\n" | grep -v "HLA" | tr "\n" "," > massaged.winners.hla.txt
+
+input_string=`head -n 1 massaged.winners.hla.txt`
+
+IFS=',' read -ra items <<< "$input_string"
+
+for item in "${items[@]}"; do
+
+    # Keep only the first 11 characters, e.g. hla_a_24_02_01 -> hla_a_24_02
+    truncated_value=$(echo "$item" | cut -c 1-11)
+
+    # Upper-case, then replace the first '_' with '-', delete the second '_',
+    # and turn any remaining '_' into ':', e.g. hla_a_24_02 -> HLA-A24:02
+    modified_value=$(echo "$truncated_value" | tr '[:lower:]' '[:upper:]' | sed 's/_/-/; s/_//; s/_/:/g')
+
+    # Append the transformed item to the output string
+    output_hla+=",$modified_value"
+
+done
+
+# Remove leading comma
+output_hla="${output_hla:1}"
+echo $output_hla
diff --git a/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test b/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test
new file mode 100644
index 0000000..cfc3260
--- /dev/null
+++ b/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test
@@ -0,0 +1,66 @@
+nextflow_process {
+
+    name "Test Process NEOANTIGENUTILS_GENERATEHLASTRING"
+    script "../main.nf"
+    process "NEOANTIGENUTILS_GENERATEHLASTRING"
+
+    tag "modules"
+    tag "modules_nfcore"
+    tag "neoantigenutils"
+    tag "neoantigenutils/generatehlastring"
+    tag "modules_msk"
+
+    test("neoantigenutils_generatehlastring - hla - string") {
+
+        when {
+
+            process {
+                """
+
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.test_data_mskcc['neoantigen']['winners_hla_txt'], checkIfExists: true)
+                ]
+
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+
+    test("neoantigenutils_generatehlastring - hla - string - stub") {
+
+        options "-stub"
+
+        when {
+
+            process {
+                """
+
+                input[0] = [
+                    [ id:'test', single_end:false ], // meta map
+                    file('winners_hla_txt')
+                ]
+
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert process.success },
+                { assert snapshot(process.out).match() }
+            )
+        }
+
+    }
+
+}
diff --git a/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test.snap b/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test.snap
new file mode 100644
index 0000000..2095aa4
--- /dev/null
+++ b/modules/msk/neoantigenutils/generatehlastring/tests/main.nf.test.snap
@@ -0,0 +1,64 @@
+{
+    "neoantigenutils_generatehlastring - hla - string - stub": {
+        "content": [
+            {
+                "0": [
+                    [
+                        {
+                            "id": "test",
+                            "single_end": false
+                        },
"HLA-test:01,HLA-test2:02\n" + ] + ], + "versions": [ + "versions.yml:md5,85e2db7a93782eb39ae80340a9728de2" + ] + } + ], + "timestamp": "2024-04-30T09:36:22.811416" + }, + "neoantigenutils_generatehlastring - hla - string": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02\n" + ] + ], + "1": [ + "versions.yml:md5,85e2db7a93782eb39ae80340a9728de2" + ], + "hlastring": [ + [ + { + "id": "test", + "single_end": false + }, + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02\n" + ] + ], + "versions": [ + "versions.yml:md5,85e2db7a93782eb39ae80340a9728de2" + ] + } + ], + "timestamp": "2024-04-30T09:36:16.803412" + } +} \ No newline at end of file diff --git a/modules/msk/neoantigenutils/generatehlastring/tests/tags.yml b/modules/msk/neoantigenutils/generatehlastring/tests/tags.yml new file mode 100644 index 0000000..0912218 --- /dev/null +++ b/modules/msk/neoantigenutils/generatehlastring/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenutils/generatehlastring: + - "modules/msk/neoantigenutils/generatehlastring/**" diff --git a/modules/msk/neoantigenutils/generatemutfasta/environment.yml b/modules/msk/neoantigenutils/generatemutfasta/environment.yml new file mode 100644 index 0000000..3ab33c4 --- /dev/null +++ b/modules/msk/neoantigenutils/generatemutfasta/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenutils_generatemutfasta" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENUTILS" diff --git a/modules/msk/neoantigenutils/generatemutfasta/main.nf b/modules/msk/neoantigenutils/generatemutfasta/main.nf new file mode 100644 index 0000000..f9051a9 --- /dev/null +++ b/modules/msk/neoantigenutils/generatemutfasta/main.nf @@ -0,0 +1,53 @@ +process NEOANTIGENUTILS_GENERATEMUTFASTA { + tag "$meta.id" + label 'process_single' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'docker://mskcc/neoantigen-utils-base:1.0.0':
+        'docker.io/mskcc/neoantigen-utils-base:1.0.0' }"
+
+    input:
+    tuple val(meta), path(inputMaf)
+    tuple path(cds), path(cdna)
+
+    output:
+    tuple val(meta), path("*_out/*.MUT_sequences.fa"), emit: mut_fasta
+    tuple val(meta), path("*_out/*.WT_sequences.fa"), emit: wt_fasta
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    mkdir ${prefix}_out
+
+    generateMutFasta.py --sample_id ${prefix} \
+        --output_dir ${prefix}_out \
+        --maf_file ${inputMaf} \
+        --CDS_file ${cds} \
+        --CDNA_file ${cdna}
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        generateMutFasta: \$(echo \$(generateMutFasta.py -v))
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    """
+    mkdir ${prefix}_out
+    touch ${prefix}_out/${prefix}.MUT_sequences.fa
+    touch ${prefix}_out/${prefix}.WT_sequences.fa
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        generateMutFasta: \$(echo \$(generateMutFasta.py -v))
+    END_VERSIONS
+    """
+}
diff --git a/modules/msk/neoantigenutils/generatemutfasta/meta.yml b/modules/msk/neoantigenutils/generatemutfasta/meta.yml
new file mode 100644
index 0000000..5d319c9
--- /dev/null
+++ b/modules/msk/neoantigenutils/generatemutfasta/meta.yml
@@ -0,0 +1,69 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "neoantigenutils_generatemutfasta"
+description: Generate the mutation fasta for netmhc tools
+keywords:
+  - neoantigen
+  - fasta
+  - netmhc
+  - mutation
+tools:
+  - neoantigen_utils:
+      description: "Collection of helper scripts for neoantigen processing"
+      documentation: "https://github.com/mskcc-omics-workflows/modules"
+      licence: [""]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information.
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - inputMaf:
+      type: file
+      description: Maf outputted by Tempo that was run through phyloWGS
+      pattern: "*.{maf}"
+
+  - cds:
+      type: file
+      description: coding sequence resource fasta
+      pattern: "*.{cds.all.fa.gz}"
+
+  - cdna:
+      type: file
+      description: cDNA resource fasta
+      pattern: "*.{cdna.all.fa.gz}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
`[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - mut_fasta:
+      type: file
+      description: Mutated fasta sequence
+      pattern: "*.{MUT_sequences.fa}"
+
+  - wt_fasta:
+      type: file
+      description: Wildtype fasta sequence
+      pattern: "*.{WT_sequences.fa}"
+
+authors:
+  - "@johnoooh"
+  - "@nikhil"
+maintainers:
+  - "@johnoooh"
+  - "@nikhil"
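For orientation, the shape of the paired records the script below writes (the key and peptides here are hypothetical; real keys are the encoded chromosome/position strings built in mutation.__init__ further down):

# <prefix>.MUT_sequences.fa: mutant peptide window, header carries an _M suffix
# <prefix>.WT_sequences.fa: matched wild-type window, same key with a _W suffix
mut_record = ">10QBX_MC_M\nKLMNSIINFEKLV\n"
wt_record = ">10QBX_MC_W\nKLMNSIINFEKLT\n"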
Default: 9,10", + ) + optional_arguments.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) + + args = parser.parse_args() + + maf_file = str(args.maf_file) + output_dir = str(args.output_dir) + sample_id = str(args.sample_id) + reference_cds_file = str(args.CDS_file) + reference_cdna_file = str(args.CDNA_file) + peptide_lengths = [9, 10, 11] + sample_path_pfx = output_dir + "/" + sample_id + mutated_sequences_fa = sample_path_pfx + ".MUT_sequences.fa" + WT_sequences_fa = sample_path_pfx + ".WT_sequences.fa" + + mutations = [] + out_fa = open(mutated_sequences_fa, "w") + out_WT_fa = open(WT_sequences_fa, "w") + + # + # initialize loggers + # + logger = logging.getLogger("neoantigen") + logger.setLevel(logging.DEBUG) + console_formatter = logging.Formatter( + "%(asctime)s: %(levelname)s: %(message)s", datefmt="%m-%d-%Y %H:%M:%S" + ) + + # logfile handler + handler_file = logging.FileHandler(output_dir + "/neoantigen_run.log", mode="w") + handler_file.setLevel(logging.DEBUG) + handler_file.setFormatter(console_formatter) + logger.addHandler(handler_file) + + # stdout handler + handler_stdout = logging.StreamHandler(sys.stdout) + handler_stdout.setFormatter(console_formatter) + handler_stdout.setLevel(logging.INFO) + logger.addHandler(handler_stdout) + + logger.info("Starting neoantigen prediction run") + logger.info("\tLog file: " + output_dir + "/neoantigen_run.log") + logger.info("\t--maf_file: " + maf_file) + logger.info("\t--output_dir: " + output_dir) + + ## generate .debug.fa for debugging purposes. + debug_out_fa = open(sample_path_pfx + ".mutated_sequences.debug.fa", "w") + + try: + logger.info("Loading reference CDS/cDNA sequences...") + cds_seqs = load_transcript_fasta(reference_cds_file) + cdna_seqs = load_transcript_fasta(reference_cdna_file) + logger.info("Finished loading reference CDS/cDNA sequences...") + + logger.info("Reading MAF file and constructing mutated peptides...") + maf_df = skip_lines_start_with( + maf_file, "#", low_memory=False, header=0, sep="\t" + ) + n_muts = n_non_syn_muts = n_missing_tx_id = 0 + for index, row in maf_df.iterrows(): + cds_seq = "" + cdna_seq = "" + + n_muts += 1 + tx_id = row["Transcript_ID"] + if tx_id in cds_seqs: + cds_seq = cds_seqs[tx_id] + + if tx_id in cdna_seqs: + cdna_seq = cdna_seqs[tx_id] + + mut = mutation(row, cds_seq, cdna_seq) + + if mut.is_non_syn(): + n_non_syn_muts += 1 + + if cds_seq == "": + n_missing_tx_id += 1 + + if cds_seq != "" and mut.is_non_syn(): + mut.generate_translated_sequences(max(peptide_lengths)) + + if len(mut.mt_altered_aa) > 5: + out_fa.write(">" + mut.identifier_key + "_M\n") + out_fa.write(mut.mt_altered_aa + "\n") + out_WT_fa.write(">" + mut.identifier_key + "_W\n") + out_WT_fa.write(mut.wt_altered_aa + "\n") + + ### write out WT/MT CDS + AA for debugging purposes + debug_out_fa.write(">" + mut.identifier_key + "_M\n") + debug_out_fa.write("mt_altered_aa: " + mut.mt_altered_aa + "\n") + debug_out_fa.write("wt_full_cds: " + mut.wt_cds + "\n") + debug_out_fa.write("wt_full_aa: " + mut.wt_aa + "\n") + debug_out_fa.write("mt_full_cds: " + mut.mt_cds + "\n") + debug_out_fa.write("mt_full_aa: " + mut.mt_aa + "\n") + mutations.append(mut) + + out_fa.close() + debug_out_fa.close() + + logger.info("\tMAF mutations summary") + logger.info("\t\t# mutations: " + str(n_muts)) + logger.info( + "\t\t# non-syn: " + + str(n_non_syn_muts) + + " (# with missing CDS: " + + str(n_missing_tx_id) + + ")" + ) + + except Exception: + logger.error("Error while generating mutated 
peptides") + logger.error(traceback.format_exc()) + exit(1) + + +# skip the header lines that start with "#" +def skip_lines_start_with(fle, junk, **kwargs): + if os.stat(fle).st_size == 0: + raise ValueError("File is empty") + with open(fle) as f: + pos = 0 + cur_line = f.readline() + while cur_line.startswith(junk): + pos = f.tell() + cur_line = f.readline() + f.seek(pos) + return pd.read_csv(f, **kwargs) + + +def load_transcript_fasta(fa_file): + seqs = dict() + if fa_file[-3 : len(fa_file)] == ".gz": + lines = gzip.open(fa_file, "rb").readlines() + else: + lines = open(fa_file).readlines() + idx = 0 + while idx < len(lines): + line = lines[idx] + m = re.search("^>(ENST\d+)\s", line.decode("utf-8")) + transcript_id = "" + if not m: + sys.exit("Error parsing transcript file " + fa_file + " at line: " + line) + else: + transcript_id = m.group(1) + + idx = idx + 1 + seq_str = "" + while idx < len(lines) and not re.match("^>ENST", lines[idx].decode("utf-8")): + seq_str = seq_str + lines[idx].decode("utf-8").strip() + idx = idx + 1 + seqs[transcript_id] = seq_str + + return seqs + + +class neopeptide(object): + row = None + algorithm = "" + version = "" + hla_allele = "" + peptide = "" + core = "" + icore = "" + score_el = 0 + rank_el = 100 + score_ba = 0 + rank_ba = 0 + binding_affinity = 10000 + best_binder_for_icore_group = "" + binder_class = "" + is_in_wt_peptidome = False + + def __init__(self, row): + self.row = row + self.algorithm = row["algorithm"] + self.version = row["version"] + self.hla_allele = row["hla_allele"] + self.peptide = row["peptide"] + self.core = row["core"] + self.icore = row["icore"] + self.score_el = row["score_el"] + self.rank_el = row["rank_el"] + self.score_ba = row["score_ba"] + self.rank_ba = row["rank_ba"] + self.binding_affinity = row["affinity"] + self.binder_class = row["binder_class"] + self.best_binder_for_icore_group = row["best_binder_for_icore_group"] + self.is_in_wt_peptidome = row["is_in_wt_peptidome"] + + def is_strong_binder(self): + if self.binder_class == "Strong Binder": + return True + return False + + def is_weak_binder(self): + if self.binder_class == "Weak Binder": + return True + return False + + +# +# class to hold the list of neopeptides and helper functions to identify strong/weak binders +# +class binding_predictions(object): + neopeptides = None + + def __init__(self, neopeptides): + self.neopeptides = neopeptides + + def add_neopeptide(self, np): + self.neopeptides.append(np) + + def get_best_per_icore(self): + return [x for x in self.neopeptides if x.best_binder_for_icore_group] + + def get_strong_binders(self): + return [x for x in self.get_best_per_icore() if x.is_strong_binder()] + + def get_weak_binders(self): + return [x for x in self.get_best_per_icore() if x.is_weak_binder()] + + def get_all_binders(self): + return [ + x + for x in self.get_best_per_icore() + if x.is_strong_binder() or x.is_weak_binder() + ] + + def get_best_binder(self): + if len(self.get_best_per_icore()) == 0: + return None + return sorted( + self.get_best_per_icore(), key=lambda x: x.rank_el, reverse=False + )[0] + + +# +# mutation class holds each row in the maf and has +# +class mutation(object): + maf_row = None + cds_seq = "" + cdna_seq = "" + wt_cds = "" + wt_aa = "" + wt_altered_aa = "" + mt_cds = "" + mt_aa = "" + mt_altered_aa = "" + identifier_key = "" + predicted_neopeptides = None + + def __init__(self, maf_row, cds_seq, cdna_seq): + self.maf_row = maf_row + self.cds_seq = cds_seq + self.cdna_seq = cdna_seq + self.predicted_neopeptides = 
binding_predictions([])
+
+        ## ENCODING FASTA ID FOR USE IN MATCHING LATER
+        ALPHABET = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+
+        variant_type_map = {
+            "Missense_Mutation": "M",
+            "Nonsense_Mutation": "X",
+            "Silent_Mutation": "S",
+            "Silent": "S",
+            "Frame_shift_Ins": "I+",
+            "Frame_shift_Del": "I-",
+            "In_Frame_Ins": "If",
+            "In_Frame_Del": "Id",
+            "Splice_Site": "Sp",
+        }
+
+        position = int(str(self.maf_row["Start_Position"])[0:2])
+
+        if position < 26:
+            encoded_start = ALPHABET[position]
+        elif position < 100:
+            encoded_start = ALPHABET[position // 4]
+
+        position = int(str(self.maf_row["Start_Position"])[-2:])
+
+        if position < 26:
+            encoded_end = ALPHABET[position]
+        elif position < 100:
+            encoded_end = ALPHABET[position // 4]
+
+        sum_remaining = sum(int(d) for d in str(self.maf_row["Start_Position"])[2:-2])
+
+        encoded_position = encoded_start + ALPHABET[sum_remaining % 26] + encoded_end
+
+        if self.maf_row["Tumor_Seq_Allele2"] == "-":
+            # handles deletion
+            if len(self.maf_row["Reference_Allele"]) > 3:
+                Allele2code = self.maf_row["Reference_Allele"][0:3]
+            else:
+                Allele2code = self.maf_row["Reference_Allele"]
+
+        elif len(self.maf_row["Tumor_Seq_Allele2"]) > 1:
+            # handles INS and DNP
+            if len(self.maf_row["Tumor_Seq_Allele2"]) > 3:
+                Allele2code = self.maf_row["Tumor_Seq_Allele2"][0:3]
+            else:
+                Allele2code = self.maf_row["Tumor_Seq_Allele2"]
+
+        else:
+            # SNPs
+            Allele2code = self.maf_row["Tumor_Seq_Allele2"]
+
+        if self.maf_row["Variant_Classification"] in variant_type_map:
+            self.identifier_key = (
+                str(self.maf_row["Chromosome"])
+                + encoded_position
+                + "_"
+                + variant_type_map[self.maf_row["Variant_Classification"]]
+                + Allele2code
+            )
+        else:
+            self.identifier_key = (
+                str(self.maf_row["Chromosome"])
+                + encoded_position
+                + "_"
+                + "SY"
+                + Allele2code
+            )
+        print(self.identifier_key)
+
+    ### Check if the variant_classification is among those that can generate a neoantigen
+    def is_non_syn(self):
+        types = [
+            "Frame_Shift_Del",
+            "Frame_Shift_Ins",
+            "In_Frame_Del",
+            "In_Frame_Ins",
+            "Missense_Mutation",
+            "Nonstop_Mutation",
+        ]
+
+        return self.maf_row["Variant_Classification"] in types and not pd.isnull(
+            self.maf_row["HGVSp_Short"]
+        )
+
+    ### helper function  # source: stackoverflow.
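As an aside, here is a worked example of the identifier-key encoding above (an illustrative sketch, not part of the module; encode_position is a hypothetical helper that mirrors the constructor logic for a SNP):

def encode_position(start_position):
    # first two digits, digit-sum of the middle, last two digits
    alphabet = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
    s = str(start_position)

    def enc(p):
        return alphabet[p] if p < 26 else alphabet[p // 4]

    middle = sum(int(d) for d in s[2:-2])
    return enc(int(s[0:2])) + alphabet[middle % 26] + enc(int(s[-2:]))

# A missense SNP C>T on chr17 at 7,674,220 encodes as:
#   "17" + "TNU" + "_" + "M" + "T"  ->  "17TNU_MT"
assert encode_position(7674220) == "TNU"

generateMutFasta.py then appends _M or _W on the FASTA headers, and generate_input.py rebuilds the same key in makeID() so the two scripts can match records later.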
+ def reverse_complement(self, dna): + complement = {"A": "T", "C": "G", "G": "C", "T": "A"} + return "".join([complement[base] for base in dna[::-1]]) + + ### helper function to translate cDNA sequence + @staticmethod + def cds_to_aa(cds): + # https://www.geeksforgeeks.org/dna-protein-python-3/ + codon_table = { + "ATA": "I", + "ATC": "I", + "ATT": "I", + "ATG": "M", + "ACA": "T", + "ACC": "T", + "ACG": "T", + "ACT": "T", + "AAC": "N", + "AAT": "N", + "AAA": "K", + "AAG": "K", + "AGC": "S", + "AGT": "S", + "AGA": "R", + "AGG": "R", + "CTA": "L", + "CTC": "L", + "CTG": "L", + "CTT": "L", + "CCA": "P", + "CCC": "P", + "CCG": "P", + "CCT": "P", + "CAC": "H", + "CAT": "H", + "CAA": "Q", + "CAG": "Q", + "CGA": "R", + "CGC": "R", + "CGG": "R", + "CGT": "R", + "GTA": "V", + "GTC": "V", + "GTG": "V", + "GTT": "V", + "GCA": "A", + "GCC": "A", + "GCG": "A", + "GCT": "A", + "GAC": "D", + "GAT": "D", + "GAA": "E", + "GAG": "E", + "GGA": "G", + "GGC": "G", + "GGG": "G", + "GGT": "G", + "TCA": "S", + "TCC": "S", + "TCG": "S", + "TCT": "S", + "TTC": "F", + "TTT": "F", + "TTA": "L", + "TTG": "L", + "TAC": "Y", + "TAT": "Y", + "TAA": "_", + "TAG": "_", + "TGC": "C", + "TGT": "C", + "TGA": "_", + "TGG": "W", + } + protein = "" + + for i in range(0, len(cds), 3): + codon = cds[i : i + 3] + if len(codon) != 3: + ## This is unusual; in some cases in Ensembl the CDS length is not a multiple of 3. Eg: ENST00000390464 + ## For this reason, decided not to throw an error and just stop translating if the CDS ends with a non-triplet + # print 'CDS ends with non-triplet: ' + codon + ' ' + cds + break + if codon_table[codon] == "_": # stop codon reached + break + protein += codon_table[codon] + return protein + + # function that parses the HGVSc and constructs the WT and mutated coding sequences for the given mutation. + def generate_translated_sequences(self, pad_len=10): + if not self.is_non_syn(): + return None + + ## append the 3'UTR to the CDS -- to account for non stop mutations and indels that shift the canonical stop + if not self.cds_seq in self.cdna_seq: + print( + "Skipping because the CDS is not contained within cDNA. 
Note: only 2 transcripts/peptides are like this" + ) + return None + + hgvsc = self.maf_row["HGVSc"] + position, ref_allele, alt_allele, sequence, hgvsc_type = [-1, "", "", "", "ONP"] + + if re.match(r"^c\.(\d+).*([ATCG]+)>([ATCG]+)$", hgvsc): + position, ref_allele, alt_allele = re.match( + r"^c\.(\d+).*(\w+)>(\w+)", hgvsc + ).groups() + + elif re.match(r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc): + position, ref_allele, alt_allele = re.match( + r"^c\.(\d+).*del([ATCG]+)ins([ATCG]+)$", hgvsc + ).groups() + + elif re.match(r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc): + position, hgvsc_type, sequence = re.match( + r"^c\.(\d+).*(dup|ins|del|inv)([ATCG]+)$", hgvsc + ).groups() + + else: + sys.exit("Error: not one of the known HGVSc strings: " + hgvsc) + + position = int(position) - 1 + if hgvsc_type in "dup,ins": + alt_allele = sequence + elif hgvsc_type == "del": + ref_allele = sequence + elif hgvsc_type == "inv": + ref_allele = sequence + alt_allele = self.reverse_complement(sequence) + + ## start of mutated region in CDS + cds = re.search(self.cds_seq + ".*", self.cdna_seq).group() + + seq_5p = cds[0:position] + seq_3p = cds[position : len(cds)] + + # print self.hgvsp + '\t' + self.variant_class + '\t' + self.variant_type + '\t' + self.ref_allele + '\t' + self.alt_allele + \ + # '\t' + self.cds_position + '\nFull CDS: ' + self.cds_seq + '\nSeq_5: ' + seq_5p + '\nSeq_3' + seq_3p + '\n>mut_1--' + mut_cds_1 + '\n>mut_2--' + mut_cds_2 + '\n>mut_3--' + mut_cds_3 + self.wt_cds = seq_5p + ref_allele + seq_3p[len(ref_allele) : len(seq_3p)] + self.mt_cds = seq_5p + alt_allele + seq_3p[len(ref_allele) : len(seq_3p)] + wt = mutation.cds_to_aa(self.wt_cds) + mt = mutation.cds_to_aa(self.mt_cds) + + ### identify regions of mutation in WT and MT sequences. + ### logic is to match the wt and mt sequences first from the beginning until a mismatch is found; and, then, + ### start from the end of both sequences until a mismatch is found. the intervening sequence represents the WT and MT sequences + ### Note, aside from missenses, the interpretation of WT sequence is ambiguous. 
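The scan described in these comments can be read as a small standalone function; a minimal sketch with hypothetical names, using the same padded slicing as the code below:

def altered_window(wt, mt, pad=10):
    n = min(len(wt), len(mt))
    i = 0  # advance from the start while the two translations agree
    while i < n and wt[i] == mt[i]:
        i += 1
    j = 0  # advance from the end, never crossing the start scan
    while j < n - i and wt[len(wt) - 1 - j] == mt[len(mt) - 1 - j]:
        j += 1
    lo = max(0, i - pad + 1)
    return wt[lo : len(wt) - j + pad - 1], mt[lo : len(mt) - j + pad - 1]

# A single missense with pad=2 keeps one flanking residue on each side:
print(altered_window("AAAAKAAAA", "AAAARAAAA", pad=2))  # ('AKA', 'ARA')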
+ len_from_start = len_from_end = 0 + + ## from start + for i in range(0, min(len(wt), len(mt))): + len_from_start = i + if wt[i : i + 1] != mt[i : i + 1]: + break + + ## from end + wt_rev = wt[::-1] + mt_rev = mt[::-1] + for i in range(0, min(len(wt), len(mt))): + len_from_end = i + if ( + len_from_end + len_from_start >= min(len(wt), len(mt)) + or wt_rev[i : i + 1] != mt_rev[i : i + 1] + ): + break + + wt_start = len_from_start + wt_end = len(wt) - len_from_end + + mt_start = len_from_start + mt_end = len(mt) - len_from_end + + self.wt_aa = wt + self.mt_aa = mt + + self.wt_altered_aa = wt[ + max(0, wt_start - pad_len + 1) : min(len(wt), wt_end + pad_len - 1) + ] + self.mt_altered_aa = mt[ + max(0, mt_start - pad_len + 1) : min(len(mt), mt_end + pad_len - 1) + ] + + # function to iterate over all the the neopeptide predictions made in the entire MAF and identify + # which neopeptides are generated by this mutation object + def match_with_neopeptides(self, all_neopeptides): + for np in all_neopeptides: + # make sure the neopeptide is not a peptide fragment of the wild-type protein + if np.icore in self.mt_altered_aa and np.icore not in self.wt_aa: + self.predicted_neopeptides.add_neopeptide(copy.deepcopy(np)) + + # simply prints the original row in the MAF file along with some neoantigen prediction specific + # appended at the end + def get_maf_row_to_print(self): + row = self.maf_row + row["neo_maf_identifier_key"] = self.identifier_key + + if self.predicted_neopeptides.get_best_binder() is not None: + best_binder = self.predicted_neopeptides.get_best_binder() + + strong_binders = self.predicted_neopeptides.get_strong_binders() + weak_binders = self.predicted_neopeptides.get_weak_binders() + row["neo_best_icore_peptide"] = best_binder.icore + row["neo_best_score_el"] = best_binder.score_el + row["neo_best_rank_el"] = best_binder.rank_el + row["neo_best_score_ba"] = best_binder.score_ba + row["neo_best_rank_ba"] = best_binder.rank_ba + row["neo_best_binding_affinity"] = best_binder.binding_affinity + row["neo_best_binder_class"] = best_binder.binder_class + row["neo_best_is_in_wt_peptidome"] = best_binder.is_in_wt_peptidome + row["neo_best_algorithm"] = best_binder.algorithm + row["neo_best_version"] = best_binder.version + row["neo_best_hla_allele"] = best_binder.hla_allele + row["neo_n_peptides_evaluated"] = len( + self.predicted_neopeptides.get_best_per_icore() + ) + row["neo_n_strong_binders"] = len(strong_binders) + row["neo_n_weak_binders"] = len(weak_binders) + else: + row["neo_best_icore_peptide"] = "" + row["neo_best_score_el"] = "" + row["neo_best_rank_el"] = "" + row["neo_best_score_ba"] = "" + row["neo_best_rank_ba"] = "" + row["neo_best_binding_affinity"] = "" + row["neo_best_binder_class"] = "" + row["neo_best_is_in_wt_peptidome"] = "" + row["neo_best_algorithm"] = "" + row["neo_best_version"] = "" + row["neo_best_hla_allele"] = "" + row["neo_n_peptides_evaluated"] = 0 + row["neo_n_strong_binders"] = 0 + row["neo_n_weak_binders"] = 0 + return row + + # simply prints the original row of the 'combined_output' of neoantigen predictions along with additional columns + def get_predictions_rows_to_print(self): + rows = [] + for prediction in self.predicted_neopeptides.neopeptides: + prediction.row["neo_maf_identifier_key"] = self.identifier_key + rows.append(prediction.row) + return rows + + +if __name__ == "__main__": + main() diff --git a/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test b/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test new file mode 
100644 index 0000000..46e13ed --- /dev/null +++ b/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test @@ -0,0 +1,85 @@ +nextflow_process { + + name "Test Process NEOANTIGENUTILS_GENERATEMUTFASTA" + script "../main.nf" + process "NEOANTIGENUTILS_GENERATEMUTFASTA" + + tag "modules" + tag "modules_nfcore" + tag "neoantigenutils" + tag "neoantigenutils/generatemutfasta" + tag "modules_msk" + + test("neoantigenutils_generatemutfasta - maf - fasta") { + + when { + + process { + """ + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['temp_test_maf'], checkIfExists: true) + ] + + input[1] = [ + file(params.test_data_mskcc['neoantigen']['cds'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['cdna'], checkIfExists: true) + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("neoantigenutils_generatemutfasta - maf - fasta - stub") { + + options "-stub" + + when { + + process { + """ + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['temp_test_maf'], checkIfExists: true) + ] + + input[1] = [ + file(params.test_data_mskcc['neoantigen']['cds'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['cdna'], checkIfExists: true) + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test.snap b/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test.snap new file mode 100644 index 0000000..6f36db5 --- /dev/null +++ b/modules/msk/neoantigenutils/generatemutfasta/tests/main.nf.test.snap @@ -0,0 +1,108 @@ +{ + "neoantigenutils_generatemutfasta - maf - fasta": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.MUT_sequences.fa:md5,c236ca28e7c658b74377c19437d7c5ab" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.WT_sequences.fa:md5,b1a6ea2978a6624c98112e8118658495" + ] + ], + "2": [ + "versions.yml:md5,0fa44b299ab4ec385f2cc6831bbc6923" + ], + "mut_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.MUT_sequences.fa:md5,c236ca28e7c658b74377c19437d7c5ab" + ] + ], + "versions": [ + "versions.yml:md5,0fa44b299ab4ec385f2cc6831bbc6923" + ], + "wt_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.WT_sequences.fa:md5,b1a6ea2978a6624c98112e8118658495" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-11T15:34:00.546613" + }, + "neoantigenutils_generatemutfasta - maf - fasta - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test", + "single_end": false + }, + "test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,0fa44b299ab4ec385f2cc6831bbc6923" + ], + "mut_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + "test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,0fa44b299ab4ec385f2cc6831bbc6923" + ], + "wt_fasta": [ + [ + { + "id": "test", + "single_end": false + }, + 
"test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-11T15:34:11.407096" + } +} \ No newline at end of file diff --git a/modules/msk/neoantigenutils/generatemutfasta/tests/tags.yml b/modules/msk/neoantigenutils/generatemutfasta/tests/tags.yml new file mode 100644 index 0000000..4efe2b6 --- /dev/null +++ b/modules/msk/neoantigenutils/generatemutfasta/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenutils/generatemutfasta: + - "modules/msk/neoantigenutils/generatemutfasta/**" diff --git a/modules/msk/neoantigenutils/neoantigeninput/environment.yml b/modules/msk/neoantigenutils/neoantigeninput/environment.yml new file mode 100644 index 0000000..5cc66dd --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "neoantigenutils_neoantigeninput" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NEOANTIGENUTILS" diff --git a/modules/msk/neoantigenutils/neoantigeninput/main.nf b/modules/msk/neoantigenutils/neoantigeninput/main.nf new file mode 100644 index 0000000..747b1e5 --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/main.nf @@ -0,0 +1,66 @@ +process NEOANTIGENUTILS_NEOANTIGENINPUT { + tag "$meta.id" + label 'process_single' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/neoantigen-utils-base:1.1.0': + 'docker.io/mskcc/neoantigen-utils-base:1.1.0' }" + + input: + tuple val(meta), path(inputMaf), path(hlaFile) + tuple val(meta2), path(phyloWGSsumm), path(phyloWGSmut), path(phyloWGSfolder) + tuple val(meta3), path(mutNetMHCpan), path(wtNetMHCpan) + + output: + tuple val(meta), path("*_.json"), emit: json + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def id = task.ext.prefix ?: "${meta.id}" + def patientid = task.ext.cohort ?: "${meta.id}_patient" + def cohort = task.ext.cohort ?: "${meta.id}_cohort" + + """ + tree_folder_name=\$(basename -s .zip "${phyloWGSfolder}") + mkdir \$tree_folder_name + unzip ${phyloWGSfolder} -d \$tree_folder_name + gzip -d -c ${phyloWGSsumm} > ${id}.summ.json + gzip -d -c ${phyloWGSmut} > ${id}.mut.json + + + + generate_input.py --maf_file ${inputMaf} \ + --summary_file ${id}.summ.json \ + --mutation_file ${id}.mut.json \ + --tree_directory \$tree_folder_name \ + --id ${id} --patient_id ${patientid} \ + --cohort ${cohort} --HLA_genes ${hlaFile} \ + --netMHCpan_MUT_input ${mutNetMHCpan} \ + --netMHCpan_WT_input ${wtNetMHCpan} + ${args} + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigeninput: \$(echo \$(generate_input.py -v)) + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def id = task.ext.prefix ?: "${meta.id}" + def patientid =task.ext.cohort ?: "${meta.id}_patient" + def cohort =task.ext.cohort ?: "${meta.id}_cohort" + """ + + touch ${patientid}_${id}_.json + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + neoantigeninput: \$(echo \$(generate_input.py -v)) + END_VERSIONS + """ +} diff --git a/modules/msk/neoantigenutils/neoantigeninput/meta.yml b/modules/msk/neoantigenutils/neoantigeninput/meta.yml new file mode 100644 index 0000000..8f22679 --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/meta.yml @@ -0,0 +1,78 @@ +--- +# 
yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "neoantigenutils_neoantigeninput"
+description: This module takes several inputs to the Lukza neoantigen pipeline and combines them into a single JSON file ready for input into that pipeline
+keywords:
+  - neoantigen
+  - aggregate
+  - genomics
+tools:
+  - neoantigen_utils:
+      description: "Collection of helper scripts for neoantigen processing"
+      documentation: "https://github.com/mskcc-omics-workflows/modules"
+      licence: [""]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information (and optionally cohort and patient_id)
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - inputMaf:
+      type: file
+      description: MAF output by Tempo that was run through phyloWGS
+      pattern: "*.{maf}"
+
+  - phyloWGSsumm:
+      type: file
+      description: Summary JSON output by phyloWGS
+      pattern: "*.{json.gz}"
+
+  - phyloWGSmut:
+      type: file
+      description: Mutation JSON output by phyloWGS
+      pattern: "*.{json.gz}"
+
+  - phyloWGSfolder:
+      type: file
+      description: Folder of mutations in trees output by PhyloWGS
+      pattern: ".{zip}"
+
+  - mutNetMHCpan:
+      type: file
+      description: TSV-formatted output from netMHCpan with the mutated neoantigens
+      pattern: ".{tsv}"
+
+  - wtNetMHCpan:
+      type: file
+      description: TSV-formatted STDOUT file of netMHCpan, containing the wild-type antigens
+      pattern: ".{tsv}"
+
+  - hlaFile:
+      type: file
+      description: HLA TSV output by Polysolver
+      pattern: "winners.{tsv}"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+  - json:
+      type: file
+      description: Combined JSON ready for input into the neoantigen pipeline
+      pattern: "*.{json}"
+
+authors:
+  - "@johnoooh"
+  - "@nikhil"
+maintainers:
+  - "@johnoooh"
+  - "@nikhil"
diff --git a/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
new file mode 100755
index 0000000..5aa3e09
--- /dev/null
+++ b/modules/msk/neoantigenutils/neoantigeninput/resources/usr/bin/generate_input.py
@@ -0,0 +1,643 @@
+#!/usr/bin/env python3
+
+import json
+import pandas as pd
+import argparse
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+import numpy as np
+
+VERSION = 1.7
+
+
+def main(args):
+
+    def makeChild(subTree, start):
+        if start:
+            subTree = 0
+
+        newsubtree = {
+            "clone_id": int(subTree),
+            "clone_mutations": [],
+            "children": [],
+            "X": 0,
+            "x": 0,
+            "new_x": 0,
+        }
+
+        if str(subTree) in trees[tree]["structure"]:
+            for item in trees[tree]["structure"][str(subTree)]:
+
+                child_dict = makeChild(item, False)
+
+                newsubtree["children"].append(child_dict)
+
+            try:
+                ssmli = []
+                if start:
+                    pass
+                else:
+                    for ssm in treefile["mut_assignments"][str(subTree)]["ssms"]:
+                        ssmli.append(
+                            chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]
+                        )
+                newsubtree["clone_mutations"] = ssmli
+                newsubtree["X"] = trees[tree]["populations"][str(subTree)][
+                    "cellular_prevalence"
+                ][0]
+                newsubtree["x"] = trees[tree]["populations"][str(subTree)][
+                    "cellular_prevalence"
+                ][0]
+                newsubtree["new_x"] = 0.0
+            except Exception as e:
+                print("Error in adding new subtree.
Error not in base case**") + print(subTree) + print(e) + pass + + return newsubtree + + else: + # Base Case + # make childrendict and return it + ssmli = [] + + for ssm in treefile["mut_assignments"][str(subTree)]["ssms"]: + try: + ssmli.append(chrom_pos_dict[mut_data["ssms"][ssm]["name"]]["id"]) + except Exception as e: + print( + "Error in appending to mutation list. Error in base case appending ssm to ssmli" + ) + print(e) + # print(str(subTree)) + pass + + try: + newsubtree["clone_mutations"] = ssmli + newsubtree["X"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] + newsubtree["x"] = trees[tree]["populations"][str(subTree)][ + "cellular_prevalence" + ][0] + newsubtree["new_x"] = 0.0 + except Exception as e: + print("Error in adding new subtree. Error in base case") + print(e) + pass + return newsubtree + + with open(args.summary_file, "r") as f: + # Load the JSON data into a dictionary + summ_data = json.load(f) + + with open(args.mutation_file, "r") as f: + # Load the JSON data into a dictionary + mut_data = json.load(f) + + chrom_pos_dict = {} # Just used for mapping right now + mutation_list = [] # Used as the output for mutations + mutation_dict = ( + {} + ) # Used for matching mutation without the subsititution information from netMHCpan to phyloWGS output + + mafdf = pd.read_csv(args.maf_file, delimiter="\t") + + for index, row in mafdf.iterrows(): + if ( + #We + row["Variant_Type"] == "SNP" + or row["Variant_Type"] == "DEL" + or row["Variant_Type"] == "INS" + or row["Variant_Type"] == "DNP" + or row["Variant_Type"] == "TNP" + ): + if row["Variant_Classification"] == "Missense_Mutation": + missense = 1 + + else: + missense = 0 + print(row["Variant_Type"]) + if row["Variant_Type"] == "SNP" or row["Variant_Type"] == "DNP" or row["Variant_Type"] == "TNP": + chrom_pos_dict[ + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + row["Tumor_Seq_Allele2"] + ] = { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + row["Tumor_Seq_Allele2"], + "gene": row["Hugo_Symbol"], + "missense": missense, + } + + mutation_list.append( + { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + row["Tumor_Seq_Allele2"], + "gene": row["Hugo_Symbol"], + "missense": missense, + } + ) + + mutation_dict[makeID(row)] = ( + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + row["Tumor_Seq_Allele2"] + ) + + elif row["Variant_Type"] == "DEL": + chrom_pos_dict[ + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"] - 1) + + "_" + + row["Reference_Allele"] + + "_" + + "D" + ] = { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + "D", + "gene": row["Hugo_Symbol"], + "missense": missense, + } + + mutation_list.append( + { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + row["Reference_Allele"] + + "_" + + "D", + "gene": row["Hugo_Symbol"], + "missense": missense, + } + ) + mutation_dict[makeID(row)] = ( + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"] - 1) + + "_" + + row["Reference_Allele"] + + "_" + + "D" + ) + + elif row["Variant_Type"] == "INS": + print( + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + "I" + + "_" + + row["Tumor_Seq_Allele2"] + ) + chrom_pos_dict[ + str(row["Chromosome"]) + + 
"_" + + str(row["Start_Position"]) + + "_" + + "I" + + "_" + + row["Tumor_Seq_Allele2"] + ] = { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + "I" + + "_" + + row["Tumor_Seq_Allele2"], + "gene": row["Hugo_Symbol"], + "missense": missense, + } + + mutation_list.append( + { + "id": str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + "I" + + "_" + + row["Tumor_Seq_Allele2"], + "gene": row["Hugo_Symbol"], + "missense": missense, + } + ) + mutation_dict[makeID(row)] = ( + str(row["Chromosome"]) + + "_" + + str(row["Start_Position"]) + + "_" + + "I" + + "_" + + row["Tumor_Seq_Allele2"] + ) + + outer_dict = {"id": args.id, "sample_trees": []} + + trees = summ_data["trees"] + + for tree in trees: + + inner_sample_tree_dict = {"topology": [], "score": trees[tree]["llh"]} + with open("./" + args.tree_directory + "/" + str(tree) + ".json", "r") as f: + # Load the JSON data into a dictionary + treefile = json.load(f) + + bigtree = makeChild(tree, True) + + inner_sample_tree_dict["topology"] = bigtree + + outer_dict["sample_trees"].append(inner_sample_tree_dict) + + outer_dict["mutations"] = mutation_list + + # TODO format HLA_gene input data, depending on format inputted. They should look like this A*02:01 + # this will be setup for polysolver winners output + def convert_polysolver_hla(polyHLA): + allele = polyHLA[4] + shortHLA = polyHLA.split("_")[2:4] + return allele.upper() + "*" + shortHLA[0] + ":" + shortHLA[1] + + HLA_gene_li = [] + with open(args.HLA_genes, "r") as f: + for line in f: + line = line.split("\t") + + HLA_gene_li.append(convert_polysolver_hla(line[1])) + HLA_gene_li.append(convert_polysolver_hla(line[2])) + + outer_dict["HLA_genes"] = HLA_gene_li + + if args.patient_data_file: + # TODO format optional input data, depending on format of inputted file. I was imagining a tsv, but can be anything + status = 0 + OS_tmp = 0 + PFS = 0 + + outer_dict["status"] = status + outer_dict["OS"] = OS_tmp + outer_dict["PFS"] = PFS + else: + outer_dict["status"] = 0 + outer_dict["OS"] = 0 + outer_dict["PFS"] = 0 + + outer_dict["id"] = args.id + outer_dict["patient"] = args.patient_id + outer_dict["cohort"] = args.cohort + + outer_dict["neoantigens"] = [] + + # print(mutation_dict) + + neoantigen_mut_in = pd.read_csv(args.netMHCpan_MUT_input, sep="\t") + neoantigen_WT_in = pd.read_csv(args.netMHCpan_WT_input, sep="\t") + + def find_first_difference_index(str1, str2): + min_length = min(len(str1), len(str2)) + for i in range(min_length): + if str1[i] != str2[i]: + return i + # If no difference found in the common length, return the length of the shorter string + return min_length + + WTdict = {} + + for index_WT, row_WT in neoantigen_WT_in.iterrows(): + + id = ( + row_WT["Identity"][:-2] + + "_" + + str(len(row_WT["peptide"])) + + "_" + + row_WT["MHC"].split("-")[1].replace(":", "").replace("*", "") + + "_" + + str(row_WT["pos"]) + ) + + noposID = ( + row_WT["Identity"][:-2] + + "_" + + str(len(row_WT["peptide"])) + + "_" + + row_WT["MHC"].split("-")[1].replace(":", "").replace("*", "") + ) + WTdict[id] = {"affinity": row_WT["affinity"], "peptide": row_WT["peptide"]} + + # This is used as last resort for the matching. We will preferentially find the peptide matching in length as well as POS. 
Worst case we will default to the WT pos 0 + if noposID not in WTdict: + WTdict[noposID] = { + 'peptides' : {row_WT["peptide"]:id}, #This is a dict so we can match the peptide with the ID later + "affinity": row_WT["affinity"] + } + + else: + # print(WTdict[noposID]['peptides']) + WTdict[noposID]['peptides'][row_WT["peptide"]]=id + + def find_most_similar_string(target, strings): + max_score = -1 + max_score2 = -2 + most_similar_string = None + most_similar_string2 = None + first_AA_same = None + first_AA_same_score = -1 + + for s in strings: + alignments = pairwise2.align.globalxx(target, s) + score = alignments[0][2] # The third element is the score + + if score > max_score2: + + if score > max_score: + max_score2 = max_score + most_similar_string2 = most_similar_string + max_score = score + most_similar_string = s + + else: + max_score2 = score + most_similar_string2 = s + + if target[0]==s[0]: + if score > first_AA_same_score: + first_AA_same = s + first_AA_same_score = score + + return most_similar_string, most_similar_string2, first_AA_same, first_AA_same_score, max_score + + for index_mut, row_mut in neoantigen_mut_in.iterrows(): + IDsplit = row_mut["Identity"].split('_') + if row_mut["affinity"]< 500: + peplen = len(row_mut["peptide"]) + matchfound = False + IDsplit = row_mut["Identity"].split('_') + if (IDsplit[1][0] == "S" and IDsplit[1][1] != 'p') : + #If it is a silent mutation. Silent mutations can either be S or SY. These include intron mutations. Splices can be Sp + continue + # first find match in WT + WTid = ( + row_mut["Identity"][:-2] + + "_" + + str(peplen) + + "_" + + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "") + + "_" + + str(row_mut["pos"]) + ) + + noposID = ( + row_mut["Identity"][:-2] + + "_" + + str(peplen) + + "_" + + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "") + ) + + if WTid in WTdict and ('M' == IDsplit[1][0] and 'Sp' not in row_mut["Identity"]): + # match + matchfound = True + best_pepmatch = WTdict[WTid]["peptide"] + + else: + if "-" in row_mut["Identity"] or "+" in row_mut["Identity"] and WTid in WTdict: + # Means there is a frame shift and we don't need to do a analysis of 5' end and 3' end as 3' end is no longer recognizeable/comparable to the WT sequence at all + # We can just move the windows along together. There will likely be little to no match with the WT peptides. + matchfound = True + best_pepmatch = WTdict[WTid]["peptide"] + # print(mutation_dict[row_mut["Identity"]]) + + else: + best_pepmatch,best_pepmatch2 , first_AA_same, first_AA_same_score, match_score = find_most_similar_string(row_mut["peptide"],list(WTdict[noposID]['peptides'].keys())) + + if best_pepmatch == row_mut["peptide"]: + #it seems this can happen where the row_mut is actually the canonical sequence. + # In this case we don't want to report the peptide as a neoantigen, its not neo + continue + + elif (best_pepmatch[0] != row_mut["peptide"][0] and best_pepmatch2[0] == row_mut["peptide"][0]) or (best_pepmatch[-1] != row_mut["peptide"][-1] and best_pepmatch2[-1] == row_mut["peptide"][-1]): + # We should preferentially match the first AA if we can. I have found that the pairwise alignment isnt always the best at this. 
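One caveat worth noting: Bio.pairwise2 is deprecated in recent Biopython releases. The globalxx score used by find_most_similar_string above (match scores 1, no mismatch or gap penalties) can be reproduced with the newer API; a sketch, assuming a Biopython version that ships Bio.Align:

from Bio import Align

aligner = Align.PairwiseAligner()  # defaults: match 1, mismatch 0, gaps 0
score = aligner.score("SYFPEITHI", "SYFPEITHV")
print(score)  # 8.0: eight identical positions align, as with pairwise2 globalxx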
+ # It will also do this when the last AA of the best match doesnt match but the last A of the second best match does + best_pepmatch = best_pepmatch2 + + WTid = WTdict[noposID]['peptides'][best_pepmatch] + matchfound=True + + if matchfound == True: + mut_pos = ( + find_first_difference_index( + row_mut["peptide"], best_pepmatch #WTdict[WTid]["peptide"] + ) + + 1 + ) + + neo_dict = { + "id": row_mut["Identity"] + + "_" + + str(peplen) + + "_" + + row_mut["MHC"].split("-")[1].replace(":", "").replace("*", "") + + "_" + + str(row_mut["pos"]), + "mutation_id": mutation_dict[row_mut["Identity"]], + "HLA_gene_id": row_mut["MHC"], + "sequence": row_mut["peptide"], + "WT_sequence": best_pepmatch ,#WTdict[WTid]["peptide"], + "mutated_position": mut_pos, + "Kd": float(row_mut["affinity"]), + "KdWT": float(WTdict[WTid]["affinity"]), + } + outer_dict["neoantigens"].append(neo_dict) + + outjson = args.patient_id + "_" + args.id + "_" + ".json" + with open(outjson, "w") as tstout: + json.dump(outer_dict, tstout, indent=1) + # tstout.write(json.dumps(outer_dict)) + + +def makeID(maf_row): + ##ENCODING FASTA ID FOR USE IN MATCHING LATER + ALPHABET = [ + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + "I", + "J", + "K", + "L", + "M", + "N", + "O", + "P", + "Q", + "R", + "S", + "T", + "U", + "V", + "W", + "X", + "Y", + "Z", + ] + + variant_type_map = { + "Missense_Mutation": "M", + "Nonsense_Mutation": "X", + "Silent_Mutation": "S", + "Silent": "S", + "Frame_shift_Ins": "I+", + "Frame_shift_Del": "I-", + "In_Frame_Ins": "If", + "In_Frame_Del": "Id", + "Splice_Site": "Sp" + } + + position = int(str(maf_row["Start_Position"])[0:2]) + + if position < 26: + encoded_start = ALPHABET[position] + elif position < 100: + encoded_start = ALPHABET[position // 4] + + position = int(str(maf_row["Start_Position"])[-2:]) + + if position < 26: + encoded_end = ALPHABET[position] + elif position < 100: + encoded_end = ALPHABET[position // 4] + + sum_remaining = sum(int(d) for d in str(maf_row["Start_Position"])[2:-2]) + + encoded_position = encoded_start + ALPHABET[sum_remaining % 26] + encoded_end + + if maf_row["Tumor_Seq_Allele2"] == "-": + # handles deletion + if len(maf_row["Reference_Allele"]) > 3: + Allele2code = maf_row["Reference_Allele"][0:3] + else: + Allele2code = maf_row["Reference_Allele"] + + elif len(maf_row["Tumor_Seq_Allele2"]) > 1: + # handles INS and DNP + if len(maf_row["Tumor_Seq_Allele2"]) > 3: + Allele2code = maf_row["Tumor_Seq_Allele2"][0:3] + else: + Allele2code = maf_row["Tumor_Seq_Allele2"] + + else: + # SNPs + Allele2code = maf_row["Tumor_Seq_Allele2"] + + if maf_row["Variant_Classification"] in variant_type_map: + identifier_key = ( + str(maf_row["Chromosome"]) + + encoded_position + + "_" + + variant_type_map[maf_row["Variant_Classification"]] + + Allele2code + + "_M" # This indicates mutated. It is added in the generateMutFasta script as well but not in this function. 
+ ) + else: + + identifier_key = ( + str(maf_row["Chromosome"]) + + encoded_position + + "_" + + "SY" + + Allele2code + + "_M" + ) + return identifier_key + + +def parse_args(): + parser = argparse.ArgumentParser(description="Process input files and parameters") + parser.add_argument("--maf_file", required=True, help="Path to the MAF file") + parser.add_argument( + "--summary_file", required=True, help="Path to the summary file" + ) + parser.add_argument( + "--mutation_file", required=True, help="Path to the mutation file" + ) + parser.add_argument( + "--tree_directory", + required=True, + help="Path to the tree directory containing json files", + ) + parser.add_argument("--id", required=True, help="ID") + parser.add_argument("--patient_id", required=True, help="Patient ID") + parser.add_argument("--cohort", required=True, help="Cohort") + parser.add_argument( + "--HLA_genes", required=True, help="Path to the file containing HLA genes" + ) + parser.add_argument( + "--netMHCpan_MUT_input", + required=True, + help="Path to the file containing MUT netmhcpan results", + ) + parser.add_argument( + "--netMHCpan_WT_input", + required=True, + help="Path to the file containing WT netmhcpan results", + ) + + parser.add_argument( + "--patient_data_file", + help="Path to the optional file containing status, overall survival, and PFS", + ) + parser.add_argument( + "-v", "--version", action="version", version="%(prog)s {}".format(VERSION) + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = parse_args() + print("MAF File:", args.maf_file) + print("Summary File:", args.summary_file) + print("Mutation File:", args.mutation_file) + print("Tree directory:", args.tree_directory) + print("ID:", args.id) + print("Patient ID:", args.patient_id) + print("Cohort:", args.cohort) + print("HLA Genes File:", args.HLA_genes) + print("netMHCpan Files:", args.netMHCpan_MUT_input, args.netMHCpan_WT_input) + if args.patient_data_file: + print("patient_data_file File:", args.patient_data_file) + + main(args) diff --git a/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test b/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test new file mode 100644 index 0000000..778654f --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test @@ -0,0 +1,100 @@ +nextflow_process { + + name "Test Process NEOANTIGENUTILS_NEOANTIGENINPUT" + script "../main.nf" + process "NEOANTIGENUTILS_NEOANTIGENINPUT" + + tag "modules" + tag "modules_nfcore" + tag "neoantigenutils" + tag "neoantigenutils/neoantigeninput" + tag "modules_msk" + + test("neoantigenutils_neoantigeninput - json,tsv") { + + when { + params { + folderPath = 'tst' + } + + process { + """ + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['temp_test_maf'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['winners_hla_txt'], checkIfExists: true) + ] + + input[1] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['test4_summ_json'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['test4_muts_json'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['test4_mutass_zip'], checkIfExists: true) + ] + + input[2] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['MUTnetMHCpan_tsv'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['WTnetMHCpan_tsv'], checkIfExists: true) + ] + + """ + } + } + + then { + assertAll( + 
{ assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + + test("neoantigenutils_neoantigeninput - json,tsv - stub") { + + options "-stub" + + when { + params { + folderPath = 'tst' + } + + process { + """ + + input[0] = [ + [ id:'test', single_end:false ], // meta map + file('temp_test_maf'), + file('winners_hla_txt') + ] + + input[1] = [ + [ id:'test', single_end:false ], // meta map + file('test4_summ_json'), + file('test4_muts_json'), + file(params.folderPath) + ] + + input[2] = [ + [ id:'test', single_end:false ], // meta map + file('MUTnetMHCpan.tsv'), + file('WTnetMHCpan.tsv') + ] + + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test.snap b/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test.snap new file mode 100644 index 0000000..48c12c3 --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/tests/main.nf.test.snap @@ -0,0 +1,72 @@ +{ + "neoantigenutils_neoantigeninput - json,tsv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_patient_test_.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,4815d54ee3ae0778628a1e51b4b21d9b" + ], + "json": [ + [ + { + "id": "test", + "single_end": false + }, + "test_patient_test_.json:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,4815d54ee3ae0778628a1e51b4b21d9b" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-01T16:08:47.586409668" + }, + "neoantigenutils_neoantigeninput - json,tsv": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test_patient_test_.json:md5,a4442316ba2b6f404b8eb42dd6558eae" + ] + ], + "1": [ + "versions.yml:md5,4815d54ee3ae0778628a1e51b4b21d9b" + ], + "json": [ + [ + { + "id": "test", + "single_end": false + }, + "test_patient_test_.json:md5,a4442316ba2b6f404b8eb42dd6558eae" + ] + ], + "versions": [ + "versions.yml:md5,4815d54ee3ae0778628a1e51b4b21d9b" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-07-01T16:08:31.862365529" + } +} \ No newline at end of file diff --git a/modules/msk/neoantigenutils/neoantigeninput/tests/tags.yml b/modules/msk/neoantigenutils/neoantigeninput/tests/tags.yml new file mode 100644 index 0000000..2e9b3d6 --- /dev/null +++ b/modules/msk/neoantigenutils/neoantigeninput/tests/tags.yml @@ -0,0 +1,2 @@ +neoantigenutils/neoantigeninput: + - "modules/msk/neoantigenutils/neoantigeninput/**" diff --git a/modules/msk/netmhcpan/environment.yml b/modules/msk/netmhcpan/environment.yml new file mode 100644 index 0000000..dcbf0e4 --- /dev/null +++ b/modules/msk/netmhcpan/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "netmhcpan" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "NETMHCPAN" diff --git a/modules/msk/netmhcpan/main.nf b/modules/msk/netmhcpan/main.nf new file mode 100644 index 0000000..bc54431 --- /dev/null +++ b/modules/msk/netmhcpan/main.nf @@ -0,0 +1,67 @@ +process NETMHCPAN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'docker://mskcc/netmhctools:1.0.0':
+        'docker.io/mskcc/netmhctools:1.0.0' }"
+
+    input:
+    tuple val(meta), path(inputFasta), val(hlaString), val(inputType)
+
+    output:
+    tuple val(output_meta), path("*.xls"), emit: xls
+    tuple val(output_meta), path("*.netmhcpan.output"), emit: netmhcpanoutput
+    path "versions.yml", emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def hla = hlaString.trim()
+    output_meta = meta.clone()
+    output_meta.typeMut = inputType == "MUT" ? true : false
+    output_meta.fromStab = false
+    def NETMHCPAN_VERSION = "4.1"
+
+    """
+    /usr/local/bin/netMHCpan-${NETMHCPAN_VERSION}/netMHCpan \
+        -s 0 \
+        -BA 1 \
+        -f ${inputFasta} \
+        -a ${hla} \
+        -l 9,10 \
+        -inptype 0 \
+        -xls \
+        ${args} \
+        -xlsfile \
+        ${prefix}.${inputType}.xls > ${prefix}.${inputType}.netmhcpan.output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        netmhcpan: v${NETMHCPAN_VERSION}
+    END_VERSIONS
+
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    def NETMHCPAN_VERSION = "4.1"
+    output_meta = meta.clone()
+    output_meta.typeMut = inputType == "MUT" ? true : false
+    output_meta.fromStab = false
+    """
+    touch ${prefix}.MUT.netmhcpan.output
+    touch ${prefix}.MUT.xls
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        netmhcpan: v${NETMHCPAN_VERSION}
+    END_VERSIONS
+    """
+}
diff --git a/modules/msk/netmhcpan/meta.yml b/modules/msk/netmhcpan/meta.yml
new file mode 100644
index 0000000..5c0885e
--- /dev/null
+++ b/modules/msk/netmhcpan/meta.yml
@@ -0,0 +1,67 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "netmhcpan"
+description: Runs netMHCpan on mutated or wild-type peptide FASTAs and outputs an XLS/TSV table plus the raw STDOUT
+keywords:
+  - immune
+  - netmhcpan
+  - genomics
+tools:
+  - "netmhcpan":
+      description: "Runs netMHCpan and outputs TSVs and STDOUT for mutated and wild-type neoantigens"
+      homepage: "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/"
+      documentation: "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/"
+      licence: ["MIT"]
+
+input:
+  # Only when we have meta
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - inputFasta:
+      type: file
+      description: Peptide FASTA (mutated or wild-type) to score
+      pattern: "*.{fa,fasta}"
+
+  - hlaString:
+      type: string
+      description: HLA in string format, e.g. HLA-A24:02
+
+  - inputType:
+      type: string
+      description: Either 'MUT' or 'WT'. Feed a Channel.of('MUT', 'WT') from outside the module to run both in parallel; any other value runs them in series.
+      pattern: "*"
+
+output:
+  # Only when we have meta
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - xls:
+      type: file
+      description: TSV/XLS file of netMHCpan. A poorly formatted file of neoantigens. This contains the MUT or WT antigens
+      pattern: "*.xls"
+
+  - netmhcpanoutput:
+      type: file
+      description: STDOUT file of netMHCpan. A poorly formatted file of neoantigens. This contains either the MUT or WT neoantigens. Neoantigenutils contains a parser for this file.
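Since the meta.yml above warns that these outputs are poorly formatted, a rough sketch of reading the -xls table may help. It assumes the netMHCpan 4.1 layout, where the first row spreads the HLA alleles and the second row holds the per-column field names; column names can vary between versions, so treat this as a sketch rather than a contract:

import pandas as pd

# Skip the allele banner row and use the field-name row as the header.
# Duplicate field names (one block per allele) get .1/.2 suffixes from pandas.
xls = pd.read_csv("test.MUT.xls", sep="\t", skiprows=1)
print(xls.columns.tolist())  # e.g. Pos, Peptide, ID, core, icore, EL-score, EL_Rank, ...
strong = xls[xls["EL_Rank"] < 0.5]  # conventional strong-binder %rank cutoff
print(strong[["Peptide", "ID", "EL_Rank"]].head())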
+ pattern: "*.WT.netmhcpan.output,*.MUT.netmhcpan.output" + +authors: + - "@johnoooh" + - "@nikhil" +maintainers: + - "@johnoooh" + - "@nikhil" diff --git a/modules/msk/netmhcpan/tests/main.nf.test b/modules/msk/netmhcpan/tests/main.nf.test new file mode 100644 index 0000000..092e7c5 --- /dev/null +++ b/modules/msk/netmhcpan/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process NETMHCPAN" + script "../main.nf" + process "NETMHCPAN" + + tag "modules" + tag "modules_nfcore" + tag "netmhcpan" + tag "modules_msk" + + test("netmhcpan - MUT - xls,output,fa") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(file(params.test_data_mskcc['neoantigen']['MUT_sequence_fa']), checkIfExists: true), + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02", + "MUT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcpanoutput[0][0], + file(process.out.xls[0][1]).name, + file(process.out.netmhcpanoutput[0][1]).name + ).match() + } + ) + } + + } + + test("netmhcpan - WT - xls,output,fa") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(file(params.test_data_mskcc['neoantigen']['WT_sequence_fa']), checkIfExists: true), + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02", + "WT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcpanoutput[0][0], + file(process.out.xls[0][1]).name, + file(process.out.netmhcpanoutput[0][1]).name + ).match() + } + ) + } + + } + + + + test("netmhcpan - xls,output,fa - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file('MUT_sequence_fa'), + "HLA", + "MUT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcpanoutput[0][0], + file(process.out.xls[0][1]).name, + file(process.out.netmhcpanoutput[0][1]).name + ).match() + } + ) + } + + } + +} diff --git a/modules/msk/netmhcpan/tests/main.nf.test.snap b/modules/msk/netmhcpan/tests/main.nf.test.snap new file mode 100644 index 0000000..d8f9f0e --- /dev/null +++ b/modules/msk/netmhcpan/tests/main.nf.test.snap @@ -0,0 +1,62 @@ +{ + "netmhcpan - WT - xls,output,fa": { + "content": [ + [ + "versions.yml:md5,35ec563839ee27410cf9f8d134c6e8e5" + ], + { + "id": "test", + "single_end": false, + "typeMut": false, + "fromStab": false + }, + "test.WT.xls", + "test.WT.netmhcpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:02:31.338253" + }, + "netmhcpan - xls,output,fa - stub": { + "content": [ + [ + "versions.yml:md5,35ec563839ee27410cf9f8d134c6e8e5" + ], + { + "id": "test", + "single_end": false, + "typeMut": true, + "fromStab": false + }, + "test.MUT.xls", + "test.MUT.netmhcpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:02:35.582061" + }, + "netmhcpan - MUT - xls,output,fa": { + "content": [ + [ + "versions.yml:md5,35ec563839ee27410cf9f8d134c6e8e5" + ], + { + "id": "test", + "single_end": false, + "typeMut": true, + "fromStab": false + }, + "test.MUT.xls", + "test.MUT.netmhcpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:02:12.753672" + } +} \ No newline at end of file diff --git 
a/modules/msk/netmhcpan/tests/tags.yml b/modules/msk/netmhcpan/tests/tags.yml new file mode 100644 index 0000000..b66af53 --- /dev/null +++ b/modules/msk/netmhcpan/tests/tags.yml @@ -0,0 +1,2 @@ +netmhcpan: + - "modules/msk/netmhcpan/**" diff --git a/modules/msk/netmhcstabpan/environment.yml b/modules/msk/netmhcstabpan/environment.yml new file mode 100644 index 0000000..b87b3c7 --- /dev/null +++ b/modules/msk/netmhcstabpan/environment.yml @@ -0,0 +1,10 @@ +name: netmhcstabpan + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - netmhcpan + - netmhcstabpan diff --git a/modules/msk/netmhcstabpan/main.nf b/modules/msk/netmhcstabpan/main.nf new file mode 100644 index 0000000..316195f --- /dev/null +++ b/modules/msk/netmhcstabpan/main.nf @@ -0,0 +1,68 @@ +process NETMHCSTABPAN { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/netmhctools:1.0.0': + 'docker.io/mskcc/netmhctools:1.0.0' }" + + input: + tuple val(meta), path(inputFasta), val(hlaString), val(inputType) + + + output: + tuple val(output_meta), path("*.netmhcstabpan.output"), emit: netmhcstabpanoutput + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def hla = hlaString.trim() + output_meta = meta.clone() + output_meta.typeMut = inputType == "MUT" ? true : false + output_meta.fromStab = true + + def NETMHCPAN_VERSION = "4.1" + def NETMHCSTABPAN_VERSION = "1.0" + + """ + + /usr/local/bin/netMHCstabpan-${NETMHCSTABPAN_VERSION}/netMHCstabpan \ + -s -1 \ + -f ${inputFasta} \ + -a ${hla} \ + -l 9,10 \ + -inptype 0 > ${prefix}.${inputType}.netmhcstabpan.output + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + netmhcpan: v${NETMHCPAN_VERSION} + netmhcstabpan: v${NETMHCSTABPAN_VERSION} + END_VERSIONS + + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + output_meta = meta.clone() + output_meta.typeMut = inputType == "MUT" ? 
true : false
+    output_meta.fromStab = true
+    def NETMHCPAN_VERSION = "4.1"
+    def NETMHCSTABPAN_VERSION = "1.0"
+
+    """
+    touch ${prefix}.MUT.netmhcstabpan.output
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        netmhcpan: v${NETMHCPAN_VERSION}
+        netmhcstabpan: v${NETMHCSTABPAN_VERSION}
+    END_VERSIONS
+    """
+}
diff --git a/modules/msk/netmhcstabpan/meta.yml b/modules/msk/netmhcstabpan/meta.yml
new file mode 100644
index 0000000..ec5e22c
--- /dev/null
+++ b/modules/msk/netmhcstabpan/meta.yml
@@ -0,0 +1,66 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
+name: "netmhcstabpan"
+description: Runs netMHCpan and netMHCstabpan and outputs STDOUT for mutated and wild-type neoantigens
+keywords:
+  - immune
+  - netmhcstabpan
+  - netMHCstabpan
+  - genomics
+tools:
+  - "netmhcstabpan":
+      description: "Runs netMHCstabpan and netMHCpan then outputs TSVs and STDOUT for mutated and wild-type neoantigens"
+      homepage: "https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/"
+      documentation: "https://services.healthtech.dtu.dk/services/NetMHCstabpan-1.0/"
+      licence: ["MIT"]
+
+  - "netmhcpan":
+      description: "Runs netMHCpan and outputs TSVs and STDOUT for mutated and wild-type neoantigens"
+      homepage: "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/"
+      documentation: "https://services.healthtech.dtu.dk/services/NetMHCpan-4.1/"
+      licence: ["MIT"]
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - inputFasta:
+      type: file
+      description: Peptide FASTA (mutated or wild-type) to score
+      pattern: "*.{fa,fasta}"
+
+  - hlaString:
+      type: string
+      description: HLA in string format, e.g. HLA-A24:02
+
+  - inputType:
+      type: string
+      description: Either 'MUT' or 'WT'. Feed a Channel.of('MUT', 'WT') from outside the module to run both in parallel; any other value runs them in series.
+      pattern: "*"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+
+  - netmhcstabpanoutput:
+      type: file
+      description: STDOUT file of netMHCstabpan runs for MUT and WT. A poorly formatted file of neoantigens.
Neoantigenutils contains a parser for this file + pattern: "*.WT.netmhcstabpan.output,*.MUT.netmhcstabpan.output" + +authors: + - "@johnoooh" + - "@nikhil" +maintainers: + - "@johnoooh" + - "@nikhil" diff --git a/modules/msk/netmhcstabpan/tests/main.nf.test b/modules/msk/netmhcstabpan/tests/main.nf.test new file mode 100644 index 0000000..390f061 --- /dev/null +++ b/modules/msk/netmhcstabpan/tests/main.nf.test @@ -0,0 +1,103 @@ +nextflow_process { + + name "Test Process NETMHCSTABPAN" + script "../main.nf" + process "NETMHCSTABPAN" + + tag "modules" + tag "modules_nfcore" + tag "netmhcstabpan" + tag "modules_msk" + + test("netmhcstabpan - MUT - xls,output,fa") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(file(params.test_data_mskcc['neoantigen']['MUT_sequence_fa']), checkIfExists: true), + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02", + "MUT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcstabpanoutput[0][0], + file(process.out.netmhcstabpanoutput[0][1]).name + ).match() + } + ) + } + + } + + test("netmhcstabpan - WT - xls,output,fa") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(file(params.test_data_mskcc['neoantigen']['WT_sequence_fa']), checkIfExists: true), + "HLA-A24:02,HLA-A24:02,HLA-B39:01,HLA-B39:01,HLA-C07:01,HLA-C06:02", + "WT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcstabpanoutput[0][0], + file(process.out.netmhcstabpanoutput[0][1]).name + ).match() + } + ) + } + + } + + test("netmhcstabpan - xls,output,fa - stub") { + + options "-stub" + + when { + process { + + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file('MUT_sequence_fa'), + "HLA", + "MUT" + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.versions, + process.out.netmhcstabpanoutput[0][0], + file(process.out.netmhcstabpanoutput[0][1]).name + ).match() + } + ) + } + + } + + +} diff --git a/modules/msk/netmhcstabpan/tests/main.nf.test.snap b/modules/msk/netmhcstabpan/tests/main.nf.test.snap new file mode 100644 index 0000000..0c95eca --- /dev/null +++ b/modules/msk/netmhcstabpan/tests/main.nf.test.snap @@ -0,0 +1,59 @@ +{ + "netmhcstabpan - MUT - xls,output,fa": { + "content": [ + [ + "versions.yml:md5,d1b3afd365748a44098c5642fad6c94a" + ], + { + "id": "test", + "single_end": false, + "typeMut": true, + "fromStab": true + }, + "test.MUT.netmhcstabpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:11:05.586612" + }, + "netmhcstabpan - xls,output,fa - stub": { + "content": [ + [ + "versions.yml:md5,d1b3afd365748a44098c5642fad6c94a" + ], + { + "id": "test", + "single_end": false, + "typeMut": true, + "fromStab": true + }, + "test.MUT.netmhcstabpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:12:06.099529" + }, + "netmhcstabpan - WT - xls,output,fa": { + "content": [ + [ + "versions.yml:md5,d1b3afd365748a44098c5642fad6c94a" + ], + { + "id": "test", + "single_end": false, + "typeMut": false, + "fromStab": true + }, + "test.WT.netmhcstabpan.output" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.04.2" + }, + "timestamp": "2024-06-13T11:12:01.701587" + } +} \ No newline at end of file diff --git 
a/modules/msk/netmhcstabpan/tests/tags.yml b/modules/msk/netmhcstabpan/tests/tags.yml
new file mode 100644
index 0000000..c5803ff
--- /dev/null
+++ b/modules/msk/netmhcstabpan/tests/tags.yml
@@ -0,0 +1,2 @@
+netmhcstabpan:
+  - "modules/msk/netmhcstabpan/**"
diff --git a/modules/nf-core/fastqc/environment.yml b/modules/msk/phylowgs/createinput/environment.yml
similarity index 57%
rename from modules/nf-core/fastqc/environment.yml
rename to modules/msk/phylowgs/createinput/environment.yml
index 1787b38..0d609eb 100644
--- a/modules/nf-core/fastqc/environment.yml
+++ b/modules/msk/phylowgs/createinput/environment.yml
@@ -1,7 +1,8 @@
-name: fastqc
+---
+name: "phylowgs_createinput"
 channels:
   - conda-forge
   - bioconda
   - defaults
 dependencies:
-  - bioconda::fastqc=0.12.1
+  - "PHYLOWGS"
diff --git a/modules/msk/phylowgs/createinput/main.nf b/modules/msk/phylowgs/createinput/main.nf
new file mode 100644
index 0000000..332665b
--- /dev/null
+++ b/modules/msk/phylowgs/createinput/main.nf
@@ -0,0 +1,47 @@
+process PHYLOWGS_CREATEINPUT {
+    tag "$meta.id"
+    label 'process_low'
+
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'docker://mskcc/phylowgs:v1.5-msk':
+        'docker.io/mskcc/phylowgs:v1.5-msk' }"
+
+    input:
+    tuple val(meta), path(unfilteredmaf), path(cnv)
+
+    output:
+    tuple val(meta), path("cnv_data.txt"), path("ssm_data.txt"), emit: phylowgsinput
+    path "versions.yml" , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    create_phylowgs_inputs.py \\
+        --cnvs S1=${cnv} \\
+        ${args} \\
+        --vcf-type S1=maf S1=${unfilteredmaf}
+
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        phylowgs: \$PHYLOWGS_TAG
+    END_VERSIONS
+    """
+
+    stub:
+    def args = task.ext.args ?: ''
+    def prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    touch cnv_data.txt
+    touch ssm_data.txt
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        phylowgs: \$PHYLOWGS_TAG
+    END_VERSIONS
+    """
+}
diff --git a/modules/msk/phylowgs/createinput/meta.yml b/modules/msk/phylowgs/createinput/meta.yml
new file mode 100644
index 0000000..9b8f5f9
--- /dev/null
+++ b/modules/msk/phylowgs/createinput/meta.yml
@@ -0,0 +1,47 @@
+---
+name: "phylowgs_createinput"
+description: Create input files for phylowgs
+keywords:
+  - phylowgs
+  - cnvs
+  - parser
+tools:
+  - "phylowgs_createinput":
+      description: "Creates the ssm_data.txt and cnv_data.txt inputs expected by phylowgs"
+      homepage: "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8"
+      tool_dev_url: "https://github.com/mskcc/phylowgs"
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. `[ id:'sample1', single_end:false ]`
+  - unfilteredmaf:
+      type: file
+      description: unfiltered MAF used to generate ssm_data.txt
+      pattern: "*.{maf}"
+  - cnv:
+      type: file
+      description: converted CNV file for phylowgs
+      pattern: "*.txt"
+
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
`[ id:'sample1', single_end:false ]` + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - phylowgsinput: + type: file + description: cnv_data.txt and ssm_data.txt + pattern: "*.txt" + +authors: + - "@pintoa1-mskcc" +maintainers: + - "@pintoa1-mskcc" diff --git a/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py new file mode 100755 index 0000000..e15092a --- /dev/null +++ b/modules/msk/phylowgs/createinput/resources/usr/bin/create_phylowgs_inputs.py @@ -0,0 +1,1771 @@ +#!/usr/bin/env python2 + +from __future__ import print_function + +# Requires PyVCF. To install: pip2 install pyvcf +import vcf +import argparse +import csv +from collections import defaultdict, namedtuple, OrderedDict +import random +import sys +import numpy as np +import numpy.ma as ma +import json +from scipy.stats.mstats import gmean + +VariantId = namedtuple("VariantId", ["CHROM", "POS", "ID"]) + + +class ReadCountsUnavailableError(Exception): + pass + + +class MafVariant(object): + def __init__( + self, chromosome, position, filter, t_ref, t_alt, ref_allele, alt_allele + ): + self.CHROM = chromosome + self.POS = position + self.FILTER = filter + self.T_REF = t_ref + self.T_ALT = t_alt + self.REF = ref_allele + self.ALT = alt_allele + self.ID = self.generate_id() + + def generate_id(self): + if self.REF == "-": + self.REF = "I" + if self.ALT == "-": + self.ALT = "D" + id = "{}_{}_{}_{}".format(self.CHROM, self.POS, self.REF, self.ALT) + return id + + +class VariantParser(object): + def __init__(self): + # Child classes must give the following variables sensible values in + # constructor so that list_variants() works subsequently. + self._cnvs = None + self._vcf_filename = None + + def list_variants(self): + variants = self._filter(self._vcf_filename) + variants_and_reads = [] + for variant in variants: + try: + ref_reads, total_reads = self._calc_read_counts(variant) + except ReadCountsUnavailableError as exc: + log("Read counts unavailable for %s_%s" % (variant.CHROM, variant.POS)) + continue + variants_and_reads.append((variant, ref_reads, total_reads)) + return variants_and_reads + + def _calc_read_counts(self, variant): + raise Exception("Not implemented -- use child class") + + def _parse_vcf(self, vcf_filename): + vcfr = vcf.Reader(filename=vcf_filename) + records = [] + for variant in vcfr: + variant.CHROM = variant.CHROM.upper() + # Some VCF dialects prepend "chr", some don't. Remove the prefix to + # standardize. + if variant.CHROM.startswith("CHR"): + variant.CHROM = variant.CHROM[3:] + records.append(variant) + return records + + def _does_variant_pass_filters(self, variant): + if variant.FILTER is None: + return True + if variant.FILTER == "PASS": + return True + if len(variant.FILTER) > 0: + # Variant failed one or more filters. + return False + return True + + def _filter(self, vcf_filename): + variants = [] + + all_variants = self._parse_vcf(vcf_filename) + + for variant in all_variants: + if not is_good_chrom(variant.CHROM): + continue + if not self._does_variant_pass_filters(variant): + continue + variants.append(variant) + return variants + + def _get_tumor_index(self, variant, tumor_sample=None): + """Find the index of the tumor sample. + + Currently hardcodes tumour sample as the last column if name not specified. 
+        This might not always be true.
+        """
+        if self._tumor_sample:
+            tumor_is = [
+                i for i, s in enumerate(variant.samples) if s.sample == tumor_sample
+            ]
+            assert len(tumor_is) == 1, (
+                "Did not find tumor name %s in samples" % tumor_sample
+            )
+            return tumor_is[0]
+        else:
+            # Don't make this -1, as some code assumes it will be >= 0.
+            return len(variant.samples) - 1
+
+
+class SangerParser(VariantParser):
+    """
+    Works with PCAWG variant calls from the Sanger.
+    """
+
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _find_ref_and_variant_nt(self, variant):
+        assert len(variant.REF) == len(variant.ALT) == 1
+        return (str(variant.REF[0]), str(variant.ALT[0]))
+
+    def _calc_read_counts(self, variant):
+        normal = variant.genotype("NORMAL")
+        tumor = variant.genotype("TUMOUR")
+
+        reference_nt, variant_nt = self._find_ref_and_variant_nt(variant)
+        tumor_reads = {
+            "forward": {
+                "A": int(tumor["FAZ"]),
+                "C": int(tumor["FCZ"]),
+                "G": int(tumor["FGZ"]),
+                "T": int(tumor["FTZ"]),
+            },
+            "reverse": {
+                "A": int(tumor["RAZ"]),
+                "C": int(tumor["RCZ"]),
+                "G": int(tumor["RGZ"]),
+                "T": int(tumor["RTZ"]),
+            },
+        }
+
+        ref_reads = (
+            tumor_reads["forward"][reference_nt] + tumor_reads["reverse"][reference_nt]
+        )
+        # For now, variant reads are defined as only the non-reference nucleotide in
+        # the inferred tumor SNP. We ignore reads of a third or fourth base.
+        variant_reads = (
+            tumor_reads["forward"][variant_nt] + tumor_reads["reverse"][variant_nt]
+        )
+        total_reads = ref_reads + variant_reads
+
+        return (ref_reads, total_reads)
+
+
+class PcawgConsensusParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _find_ref_and_variant_nt(self, variant):
+        assert len(variant.REF) == len(variant.ALT) == 1
+        return (str(variant.REF[0]), str(variant.ALT[0]))
+
+    def _calc_read_counts(self, variant):
+        if not ("t_alt_count" in variant.INFO and "t_ref_count" in variant.INFO):
+            raise ReadCountsUnavailableError()
+        assert len(variant.INFO["t_alt_count"]) == len(variant.INFO["t_ref_count"]) == 1
+
+        alt_reads = int(variant.INFO["t_alt_count"][0])
+        ref_reads = int(variant.INFO["t_ref_count"][0])
+        total_reads = alt_reads + ref_reads
+        # Some variants have zero alt and ref reads.
+        if total_reads == 0:
+            raise ReadCountsUnavailableError()
+        return (ref_reads, total_reads)
+
+
+class MuseParser(VariantParser):
+    def __init__(self, vcf_filename, tier=0, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tier = tier
+        self._tumor_sample = tumor_sample
+
+    def _get_normal_genotype(self, variant):
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        assert tumor_i in (0, 1), "Tumor index %s is not 0 or 1" % tumor_i
+        normal_i = 1 - tumor_i
+        return set([int(t) for t in variant.samples[normal_i]["GT"].split("/")])
+
+    def _calc_read_counts(self, variant):
+        normal_gt = self._get_normal_genotype(variant)
+        assert len(normal_gt) == 1
+        normal_gt = normal_gt.pop()
+
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        total_reads = int(variant.samples[tumor_i]["DP"])
+        ref_reads = int(variant.samples[tumor_i]["AD"][normal_gt])
+
+        return (ref_reads, total_reads)
+
+    def _does_variant_pass_filters(self, variant):
+        # Ignore heterozygous normal variants.
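+        # (A normal-sample genotype with two distinct alleles, e.g. "0/1", most
+        # likely marks a germline het site rather than a somatic SNV, and it
+        # would also make the AD-based reference-read lookup in
+        # _calc_read_counts above ambiguous.)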
+        if len(self._get_normal_genotype(variant)) != 1:
+            return False
+        if variant.FILTER is None or len(variant.FILTER) == 0:
+            return True
+        if int(variant.FILTER[0][-1]) <= self._tier:
+            # Variant failed one or more filters, but we still accept it.
+            return True
+        return False
+
+
+class StrelkaParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _does_variant_pass_filters(self, variant):
+        if len(variant.REF) != 1 or len(variant.ALT[0]) != 1:
+            return False
+        else:
+            return super(StrelkaParser, self)._does_variant_pass_filters(variant)
+
+    def _calc_read_counts(self, variant):
+        alt = variant.ALT[0]
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        total_reads = int(variant.samples[tumor_i]["DP"])
+
+        if alt is None:
+            total_reads = 0
+            variant_reads = 0
+        else:
+            variant_reads = int(
+                getattr(variant.samples[tumor_i].data, str(alt) + "U")[0]
+            )
+
+        ref_reads = total_reads - variant_reads
+        return (ref_reads, total_reads)
+
+
+class SomSnipParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _calc_read_counts(self, variant):
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        highqual_reads = variant.samples[tumor_i]["DP4"]
+        assert len(highqual_reads) == 4
+
+        ref_reads = int(highqual_reads[0]) + int(highqual_reads[1])
+        variant_reads = int(highqual_reads[2]) + int(highqual_reads[3])
+
+        return (ref_reads, ref_reads + variant_reads)
+
+
+class MutectTcgaParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _calc_read_counts(self, variant):
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        # TD: Tumor allelic depths for the ref and alt alleles in the order listed
+        ref_reads, variant_reads = variant.samples[tumor_i]["TD"]
+        total_reads = ref_reads + variant_reads
+        return (ref_reads, total_reads)
+
+
+class MutectPcawgParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _calc_read_counts(self, variant):
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        ref_reads = int(variant.samples[tumor_i].data.ref_count)
+        variant_reads = int(variant.samples[tumor_i].data.alt_count)
+        total_reads = ref_reads + variant_reads
+
+        return (ref_reads, total_reads)
+
+
+class MutectSmchetParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _calc_read_counts(self, variant):
+        tumor_i = self._get_tumor_index(variant, self._tumor_sample)
+        ref_reads = int(variant.samples[tumor_i]["AD"][0])
+        variant_reads = int(variant.samples[tumor_i]["AD"][1])
+        total_reads = ref_reads + variant_reads
+
+        return (ref_reads, total_reads)
+
+
+class MafParser(MutectSmchetParser):
+
+    def _calc_read_counts(self, variant):
+        total_reads = variant.T_REF + variant.T_ALT
+        return (variant.T_REF, total_reads)
+
+    def _does_variant_pass_filters(self, variant):
+        if variant.FILTER and variant.FILTER == "TRUE":
+            return False
+        return True
+
+    def _parse_maf(self, maf_filename):
+        variant_list = []
+        with open(maf_filename) as maf_file:
+            for single_line in csv.DictReader(maf_file, dialect="excel-tab"):
+                chrom = single_line["Chromosome"]
+                variant_type = single_line["Variant_Type"]
+                start = int(single_line["Start_Position"])
+                filter = single_line["FILTER"]
+                ref_reads = int(single_line["t_ref_count"])
+                variant_reads = int(single_line["t_alt_count"])
+                ref_allele = single_line["Reference_Allele"]
+                alt_allele = single_line["Tumor_Seq_Allele2"]
+                pos = start
+                if variant_type == "DEL":
+                    pos = pos - 1
+                variant = MafVariant(
+                    chrom, pos, filter, ref_reads, variant_reads, ref_allele, alt_allele
+                )
+                variant_list.append(variant)
+        return variant_list
+
+    def _filter(self, maf_filename):
+        variants = []
+
+        all_variants = self._parse_maf(maf_filename)
+
+        for variant in all_variants:
+            if not is_good_chrom(variant.CHROM):
+                continue
+            if not self._does_variant_pass_filters(variant):
+                continue
+            variants.append(variant)
+        return variants
+
+    def _parse_vcf(self, maf_filename):
+        return self._parse_maf(maf_filename)
+
+
+class VarDictParser(MutectSmchetParser):
+    """Support VarDict somatic variant caller.
+
+    https://github.com/AstraZeneca-NGS/VarDictJava
+    https://github.com/AstraZeneca-NGS/VarDict
+
+    Uses the same read-extraction logic as MuTect (SMC-Het).
+    """
+
+    pass
+
+
+class DKFZParser(VariantParser):
+    def __init__(self, vcf_filename, tumor_sample=None):
+        self._vcf_filename = vcf_filename
+        self._tumor_sample = tumor_sample
+
+    def _calc_read_counts(self, variant):
+        # This doesn't handle multisample correctly, as I don't know how to get the
+        # DP4 attribute on multiple DKFZ samples currently.
+        for_ref_reads = int(variant.INFO["DP4"][0])
+        back_ref_reads = int(variant.INFO["DP4"][1])
+        for_variant_reads = int(variant.INFO["DP4"][2])
+        back_variant_reads = int(variant.INFO["DP4"][3])
+        ref_reads = for_ref_reads + back_ref_reads
+        var_reads = for_variant_reads + back_variant_reads
+        total_reads = ref_reads + var_reads
+
+        return (ref_reads, total_reads)
+
+
+class CnvFormatter(object):
+    def __init__(self, read_depth, sampidxs, hetsnp_rate):
+        self._read_depth = read_depth
+        self._sampidxs = sampidxs
+        self._hetsnp_rate = hetsnp_rate
+
+    def _find_overlapping_variants(self, chrom, cnv, variants):
+        overlapping = []
+
+        start = cnv["start"]
+        end = cnv["end"]
+        for variant in variants:
+            if chrom.upper() == variant["chrom"].upper():
+                if start <= variant["pos"] <= end:
+                    overlapping.append(variant["ssm_id"])
+        return overlapping
+
+    def _calc_ref_reads(self, cellular_prev, total_reads):
+        ref_reads = np.zeros(len(self._sampidxs), dtype=np.int64)
+        for sampidx in self._sampidxs:
+            vaf = cellular_prev[sampidx] / 2
+            ref_reads[sampidx] = int((1 - vaf) * total_reads[sampidx])
+        return ref_reads
+
+    def _calc_total_reads(self, locus_start, locus_end):
+        def _calc(samp_read_depth):
+            # We estimate 7 heterozygous SNPs per 10 kb, which goes as input to CNA
+            # algorithms. Thus, we determine how many SNPs are equivalent to a region
+            # of the given size, then weight accordingly.
+            assert locus_start < locus_end
+            # Figure out approximately equivalent number of SSMs to this region.
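+            # As a worked example under the default --het-snp-rate of 7e-4
+            # (7 SNPs per 10 kb): a 1 Mb region is treated as roughly
+            # 1e6 * 7e-4 = 700 SSM equivalents, so at 50x read depth it gets
+            # D = round(700 * 50) = 35000 total reads (capped later in _cap_cnv_D).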
+            equiv_ssms = (locus_end - locus_start) * self._hetsnp_rate
+            return int(np.round(equiv_ssms * samp_read_depth))
+
+        D = [_calc(self._read_depth[sampidx]) for sampidx in self._sampidxs]
+        return self._cap_cnv_D(D)
+
+    def _format_overlapping_variants(self, variants, maj_cn, min_cn):
+        assert len(set(maj_cn)) == len(set(min_cn)) == 1
+        variants = [(ssm_id, str(min_cn[0]), str(maj_cn[0])) for ssm_id in variants]
+        return variants
+
+    def _cap_cnv_D(self, D):
+        # Average tumour has ~3k SSMs, so say that a CNA region should be
+        # equivalent to no more than this.
+        avg_ssms_in_tumour = 3000
+        D_max = np.round(avg_ssms_in_tumour * self._read_depth).astype(np.int)
+        D_min = 1
+
+        D = np.minimum(D_max, D)
+        D = np.maximum(D_min, D)
+        return D
+
+    def _format_cnvs(self, cnvs, variants):
+        log("Estimated read depth: %s" % self._read_depth)
+
+        for chrom, chrom_cnvs in cnvs.items():
+            for cnv in chrom_cnvs:
+                overlapping_variants = self._find_overlapping_variants(
+                    chrom, cnv, variants
+                )
+                total_reads = self._calc_total_reads(cnv["start"], cnv["end"])
+                ref_reads = self._calc_ref_reads(cnv["cell_prev"], total_reads)
+                yield {
+                    "chrom": chrom,
+                    "start": cnv["start"],
+                    "end": cnv["end"],
+                    "major_cn": cnv["major_cn"],
+                    "minor_cn": cnv["minor_cn"],
+                    "cellular_prevalence": cnv["cell_prev"],
+                    "ref_reads": ref_reads,
+                    "total_reads": total_reads,
+                    "overlapping_variants": self._format_overlapping_variants(
+                        overlapping_variants, cnv["major_cn"], cnv["minor_cn"]
+                    ),
+                }
+
+    def _merge_variants(self, cnv1, cnv2):
+        cnv1_variant_names = set([v[0] for v in cnv1["overlapping_variants"]])
+        for variant in cnv2["overlapping_variants"]:
+            variant_name = variant[0]
+            if variant_name not in cnv1_variant_names:
+                cnv1["overlapping_variants"].append(variant)
+            else:
+                # If variant already in cnv1's list, ignore it. This should only occur
+                # if two subclonal CNVs have close to 0.5 frequency each. In this case,
+                # we lose information about major/minor status of the cnv2 relative to
+                # its SSMs.
+                log("%s already in %s" % (variant, cnv1["cnv_id"]))
+
+    # CNVs with similar a/d values should not be free to move around the
+    # phylogeny independently, and so we merge them into a single entity. We may
+    # do the same with SNVs bearing similar frequencies later on.
+    def format_and_merge_cnvs(self, cnvs, variants, cellularity):
+        formatted = list(self._format_cnvs(cnvs, variants))
+        formatted.sort(key=lambda f: f["cellular_prevalence"][0], reverse=True)
+        if len(formatted) == 0:
+            return []
+
+        for cnv in formatted:
+            physical_cnvs = OrderedDict()
+            for K in ("chrom", "start", "end", "major_cn", "minor_cn"):
+                physical_cnvs[K] = cnv[K]
+
+            assert (
+                len(set(physical_cnvs["major_cn"]))
+                == len(set(physical_cnvs["minor_cn"]))
+                == 1
+            )
+            physical_cnvs["major_cn"] = physical_cnvs["major_cn"][0]
+            physical_cnvs["minor_cn"] = physical_cnvs["minor_cn"][0]
+
+            physical_cnvs["cell_prev"] = "|".join(
+                [str(C) for C in cnv["cellular_prevalence"]]
+            )
+            cnv["physical_cnvs"] = ",".join(
+                ["%s=%s" % (K, physical_cnvs[K]) for K in physical_cnvs.keys()]
+            )
+
+        merged, formatted = formatted[:1], formatted[1:]
+        merged[0]["cnv_id"] = "c0"
+        counter = 1
+
+        for current in formatted:
+            last = merged[-1]
+            assert np.all(current["cellular_prevalence"] <= cellularity) and np.all(
+                last["cellular_prevalence"] <= cellularity
+            )
+
+            # Only merge CNVs if they're clonal. If they're subclonal, leave them
+            # free to move around the tree.
+ if np.array_equal( + current["cellular_prevalence"], last["cellular_prevalence"] + ) and np.array_equal(last["cellular_prevalence"], cellularity): + # Merge the CNVs. + log( + "Merging %s_%s and %s_%s" + % (current["chrom"], current["start"], last["chrom"], last["start"]) + ) + last["total_reads"] = self._cap_cnv_D( + current["total_reads"] + last["total_reads"] + ) + last["ref_reads"] = self._calc_ref_reads( + last["cellular_prevalence"], last["total_reads"] + ) + last["physical_cnvs"] += ";" + current["physical_cnvs"] + self._merge_variants(last, current) + else: + # Do not merge the CNVs. + current["cnv_id"] = "c%s" % counter + merged.append(current) + counter += 1 + + return merged + + +class VariantFormatter(object): + def __init__(self): + self._counter = 0 + + def _split_types(self, genotype): + types = [int(e) for e in genotype.split("/")] + if len(types) != 2: + raise Exception("Not diploid: %s" % types) + return types + + def _calc_ref_freq(self, ref_genotype, error_rate): + types = self._split_types(ref_genotype) + num_ref = len([t for t in types if t == 0]) + freq = (num_ref / 2) - error_rate + if freq < 0: + freq = 0.0 + if freq > 1: + raise Exception("Nonsensical frequency: %s" % freq) + return freq + + def format_variants( + self, variants, ref_read_counts, total_read_counts, error_rate, sex + ): + for variant_idx, variant in enumerate(variants): + ssm_id = "s%s" % self._counter + if hasattr(variant, "ID") and variant.ID is not None: + # This field will be defined by PyVCF, but not by our VariantId named + # tuple that we have switched to, so this code will never actually run. + # TODO: fix that. + variant_name = variant.ID + else: + variant_name = "%s_%s" % (variant.CHROM, variant.POS) + + # TODO: switch back to using calc_ref_freq() when we no longer want mu_r + # and mu_v fixed. + # This is mu_r in PhyloWGS. + expected_ref_freq = 1 - error_rate + if variant.CHROM in ("Y", "M") or (variant.CHROM == "X" and sex == "male"): + # Haploid, so should only see non-variants when sequencing error + # occurred. Note that chrY and chrM are always haploid; chrX is haploid + # only in men, so script must know sex of patient to choose correct + # value. Currently, I just assume that all data comes from men. + # + # This is mu_v in PhyloWGS. + expected_var_freq = error_rate + else: + # Diploid, so should see variants in (0.5 - error_rate) proportion of + # reads. + # + # This is mu_v in PhyloWGS. 
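+                # With the default error rate of 0.001 this evaluates to
+                # mu_v = 0.499: a clonal diploid het variant is expected in just
+                # under half of the reads.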
+ expected_var_freq = 0.5 - error_rate + + yield { + "ssm_id": ssm_id, + "chrom": variant.CHROM, + "pos": variant.POS, + "variant_name": variant_name, + "ref_reads": list(ref_read_counts[variant_idx, :]), + "total_reads": list(total_read_counts[variant_idx, :]), + "expected_ref_freq": expected_ref_freq, + "expected_var_freq": expected_var_freq, + } + self._counter += 1 + + +def restricted_float(x): + x = float(x) + if x < 0.0 or x > 1.0: + raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]" % x) + return x + + +def chrom_key(chrom): + if chrom.isdigit(): + return int(chrom) + elif chrom == "X": + return 100 + elif chrom == "Y": + return 101 + else: + raise Exception("Unknown chrom: %s" % chrom) + + +def variant_key(var): + chrom = chrom_key(var.CHROM) + return (chrom, var.POS, None) + + +class Segmenter(object): + def _organize_cnvs(self, cnv_set): + organized = defaultdict(list) + + for sampidx, cnvs in enumerate(cnv_set): + for chrom, chrom_cnvs in cnvs.items(): + for cnv in chrom_cnvs: + organized[chrom].append( + { + "sample": sampidx, + "start": cnv["start"], + "end": cnv["end"], + "major_cn": cnv["major_cn"], + "minor_cn": cnv["minor_cn"], + "cell_prev": cnv["cellular_prevalence"], + } + ) + + for chrom, cnvs in organized.items(): + # Intervals may not be sorted in input file. + cnvs.sort(key=lambda c: c["start"]) + + return organized + + def _create_intervals(self, cnv_set): + # intervals[chrom][(major, minor)] + intervals = defaultdict(list) + min_size_for_inclusion = 1 + + for chrom, cnvs in cnv_set.items(): + for cnv in cnvs: + # We sorted above to place start coordinates after end coordinates. But + # if a CNV was listed with *the same* start and end position (meaning a + # zero-length record, assuming intervals that are left-closed but + # right-open), we will encounter the end for that record before its + # start. As such, the "open_samples.remove()" call below will fail, as + # the given intervals will not have been opened when we encounter its + # end. + # + # Note the above assumes a half-open interpretation of intervals. I + # don't think I implemented this -- if I recall, the code dealing with + # CNVs (such as determining SSM overlap) assumes fully-closed intervals + # (i.e., it doesn't check if cnv.start <= ssm.locus <= (cnv.end + 1)). + # Normally this doesn't matter, given the low resolution of CNV calls + # -- we should never encounter such small intervals. But a pathological + # case in which CNV inputs had the same start & end coordinates for + # some intervals revealed that the code crashes on this input. We + # should provide a more informative error in such cases, which the + # following assertion does. + assert cnv["start"] < cnv["end"], ( + "In CNV %s, start position occurs at or after the end position" + % cnv + ) + + start_pos = [ + ( + c["start"], + "start", + (c["sample"], c["cell_prev"], c["major_cn"], c["minor_cn"]), + ) + for c in cnvs + ] + end_pos = [ + ( + c["end"], + "end", + (c["sample"], c["cell_prev"], c["major_cn"], c["minor_cn"]), + ) + for c in cnvs + ] + + # True > False, so this sorting will place start positions after end + # positions if both have same coordinate. + positions = sorted( + start_pos + end_pos, key=lambda e: (e[0], e[1] == "start") + ) + assert len(positions) >= 2, "Fewer than two positions in %s" % positions + + # prev_pos is updated each time we move to a new coordinate on the + # chromosome. Multiple start or end points may be associated with any + # given coordinate. 
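+            # This loop is a sweep line over the interval endpoints: walk the
+            # sorted breakpoints, track which samples' intervals are currently
+            # open, and emit a segment for every stretch between consecutive
+            # loci where at least one interval is open.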
+            prev_pos = None
+            open_samples = []
+            idx = 0
+
+            while idx < len(positions):
+                points_at_locus = [positions[idx]]
+                locus = points_at_locus[0][0]
+
+                # Gather all interval breakpoints at this locus.
+                while True:
+                    assert positions[idx][0] >= locus
+                    idx += 1
+                    if idx == len(positions) or positions[idx][0] > locus:
+                        break
+                    points_at_locus.append(positions[idx])
+
+                if prev_pos is None:
+                    assert len(open_samples) == 0
+
+                if len(open_samples) > 0:
+                    # If some samples are already open from previous loci (such that
+                    # prev_pos will not be None), add this interval.
+                    assert locus > prev_pos
+                    interval = (prev_pos, locus)
+                    if interval[1] - interval[0] > min_size_for_inclusion:
+                        intervals[chrom].append(
+                            (interval[0], interval[1], sorted(list(set(open_samples))))
+                        )
+                else:
+                    # All points should be start points.
+                    assert set([i[1] for i in points_at_locus]) == set(["start"])
+
+                prev_pos = locus
+
+                # Update open_samples in accordance with whether each breakpoint at
+                # this locus starts or ends an interval.
+                for (
+                    pos,
+                    pt_type,
+                    (sampidx, cell_prev, major_cn, minor_cn),
+                ) in points_at_locus:
+                    if pt_type == "start":
+                        log(
+                            "Adding ",
+                            (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn),
+                        )
+                        open_samples.append((sampidx, cell_prev, major_cn, minor_cn))
+                    elif pt_type == "end":
+                        log(
+                            "Removing ",
+                            (pos, pt_type, sampidx, cell_prev, major_cn, minor_cn),
+                        )
+                        open_samples.remove((sampidx, cell_prev, major_cn, minor_cn))
+                    else:
+                        raise Exception("Unknown point type: %s" % pt_type)
+
+            assert len(open_samples) == 0
+
+        return intervals
+
+    def _merge_adjacent(self, cncalls, allowed_gap=0):
+        cncalls.sort(key=lambda c: (chrom_key(c["chrom"]), c["start"]))
+        merged = []
+        idx = 0
+        while idx < len(cncalls):
+            adjacent = [cncalls[idx]]
+            idx += 1
+
+            while (
+                idx < len(cncalls)
+                and cncalls[idx]["chrom"] == adjacent[-1]["chrom"]
+                and cncalls[idx]["major_cn"] == adjacent[-1]["major_cn"]
+                and cncalls[idx]["minor_cn"] == adjacent[-1]["minor_cn"]
+                and 0 <= cncalls[idx]["start"] - adjacent[-1]["end"] <= allowed_gap
+            ):
+                adjacent.append(cncalls[idx])
+                idx += 1
+
+            if len(adjacent) > 1:
+                log("Merging ", adjacent)
+                copy = dict(adjacent[0])
+                copy["end"] = adjacent[-1]["end"]
+                merged.append(copy)
+            else:
+                merged.append(adjacent[0])
+
+        return merged
+
+    def segment(self, cn_calls):
+        # Merge adjacent CNVs here rather than when data loaded, as what can be
+        # merged will be determined by what tetraploidy correction, if any, is
+        # applied to the data.
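+        # Note the merge call below is left commented out, so segmentation
+        # currently runs on the unmerged per-sample calls.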
+ # for sampidx, cnvs in enumerate(cn_calls): + # cn_calls[sampidx] = self._merge_adjacent(cnvs) + organized = self._organize_cnvs(cn_calls) + return self._create_intervals(organized) + + +class MultisampleCnvCombiner(object): + def __init__(self, cn_regions, cellularity, sex): + self.sampidxs = set(range(len(cn_regions))) + segments = Segmenter().segment(cn_regions) + self._cnvs = self._reformat_segments_as_cnvs(segments) + self._cellularity = cellularity + self._sex = sex + + def _reformat_segments_as_cnvs(self, segments): + reformatted = defaultdict(list) + _retrieve_val = lambda idx: np.array(zip(*open_samples)[idx]) + + for chrom, chrom_cnvs in segments.items(): + for start, end, open_samples in chrom_cnvs: + sampidx = _retrieve_val(0) + cell_prev = _retrieve_val(1) + major_cn = _retrieve_val(2) + minor_cn = _retrieve_val(3) + cnv = { + "start": start, + "end": end, + "cell_prev": cell_prev, + "major_cn": major_cn, + "minor_cn": minor_cn, + "sampidx": sampidx, + } + reformatted[chrom].append(cnv) + + return reformatted + + def _ensure_no_overlap(self, cnvs): + for chrom, chrom_cnvs in cnvs.items(): + for idx in range(len(chrom_cnvs) - 1): + current, next = chrom_cnvs[idx], chrom_cnvs[idx + 1] + assert current["start"] < current["end"] <= next["start"] < next["end"] + + def _is_region_normal_cn(self, chrom, major, minor): + return self._is_multisample_region_normal_cn(chrom, [major], [minor]) + + def _is_multisample_region_normal_cn(self, chrom, major, minor): + normal_major = set([1]) + if self._sex == "male" and chrom in (("X", "Y")): + normal_minor = set([0]) + else: + normal_minor = set([1]) + return set(major) == normal_major and set(minor) == normal_minor + + def _get_abnormal_state_for_all_samples(self, chrom, cnv): + """On a per-sample basis, record which samples report the CNA is abnormal + CN, and which report it is normal CN. If multiple different abnormal states + occur in different samples, return None.""" + # All samples must have at least one record for this region, or don't + # include it. + if set(cnv["sampidx"]) != self.sampidxs: + return None + + abnormal_state = None + filtered = [] + + for sampidx, cell_prev, major, minor in zip( + cnv["sampidx"], cnv["cell_prev"], cnv["major_cn"], cnv["minor_cn"] + ): + # Region may be (clonal or subclonal) normal in a sample, so ignore such records. + if self._is_region_normal_cn(chrom, major, minor): + continue + + # Either we haven't observed an abnormal CN state in this region before, + # or the observed abnormal state matches what we've already seen. + if abnormal_state is None or abnormal_state == (major, minor): + abnormal_state = (major, minor) + filtered.append( + { + "sampidx": sampidx, + "cell_prev": cell_prev, + "major_cn": major, + "minor_cn": minor, + } + ) + continue + # The abnormal state (i.e., major & minor alleles) is *different* from + # what we've seen before. The PWGS model doesn't currently account for + # such cases, so ignore the region. + else: + return None + + # None of the observed records were abnormal -- i.e., all samples report + # the region is normal. Reject the region. + if abnormal_state is None: + return None + + retained_sampidxs = [F["sampidx"] for F in filtered] + # Sanity check: when we originally parsed the CNVs, the samples should have + # been added in order, and that ought not to have changed. + assert retained_sampidxs == sorted(retained_sampidxs) + # Sanity check: we should have no duplicate samples. 
While a given sample + # may report any number of records for a region, above we discarded normal + # regions, and ensured that only one abnormal state exists in all samples. + # Thus, we should have no more than one record per sample for this region. + assert len(retained_sampidxs) == len(set(retained_sampidxs)) + + # Add a record for all samples that reported this region as clonal normal. + cell_prev_when_absent = 0 + for missing_sampidx in self.sampidxs - set(retained_sampidxs): + filtered.append( + { + "sampidx": missing_sampidx, + "cell_prev": cell_prev_when_absent, + "major_cn": abnormal_state[0], + "minor_cn": abnormal_state[1], + } + ) + # Sort by sampidx. + filtered.sort(key=lambda F: F["sampidx"]) + # Ensure all samples have one record. + assert len(filtered) == len(self.sampidxs) + + return filtered + + def load_single_abnormal_state_cnvs(self): + """ + Return all regions that possess at most one abnormal state across samples. + E.g., given three samples, S_1 and S_3 report the region as (2, 1) (with + potentially different cellular prevalences), while S_2 lists it as clonal + (1, 1). In such an instance, the record for S_2 will *not* indicate the + region is normal. Instead, the S_2 record will show a state of (2, 1) with + a cellular prevalence of zero. This is done so that we can calculate + sensible `a` and `d` values for cnv_data.txt. + """ + # In Battenberg, either one region is normal and the other abnormal, + # or both are abnormal. + # In TITAN, only one abnormal region will be listed, without a + # corresponding normal region. + abnormal_cnvs = defaultdict(list) + + for chrom, chrom_cnvs in self._cnvs.items(): + if not is_good_chrom(chrom): + continue + for cnv in chrom_cnvs: + states_for_all_samples = self._get_abnormal_state_for_all_samples( + chrom, cnv + ) + if states_for_all_samples is None: + continue + + combined_states = { + K: np.array([S[K] for S in states_for_all_samples]) + for K in states_for_all_samples[0].keys() + } + cnv.update(combined_states) + abnormal_cnvs[chrom].append(cnv) + abnormal_cnvs[chrom].sort(key=lambda C: C["start"]) + + self._ensure_no_overlap(abnormal_cnvs) + return abnormal_cnvs + + def load_normal_cnvs(self): + """ + Return all regions that are clonal normal across all samples. + """ + normal_cnvs = defaultdict(list) + + for chrom, chrom_cnvs in self._cnvs.items(): + if not is_good_chrom(chrom): + continue + for cnv in chrom_cnvs: + if not self._is_multisample_region_normal_cn( + chrom, cnv["major_cn"], cnv["minor_cn"] + ): + continue + if not set(cnv["sampidx"]) == self.sampidxs: + continue + if not np.array_equal(cnv["cell_prev"], self._cellularity): + # The region must be clonal normal to be retained. This check + # shouldn't be necessary, as we've already ensured all calls have + # major = minor = 1, but we perform it just to be thorough. + continue + normal_cnvs[chrom].append(cnv) + normal_cnvs[chrom].sort(key=lambda C: C["start"]) + + self._ensure_no_overlap(normal_cnvs) + return normal_cnvs + + def load_cnvs(self): + """ + Return both normal and abnormal regions. 
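+        Regions from the two sources are concatenated per chromosome, sorted by
+        start coordinate, and re-checked to ensure no overlap.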
+ """ + combined = defaultdict(list) + + normal_cnvs = self.load_normal_cnvs() + abnormal_cnvs = self.load_single_abnormal_state_cnvs() + for chrom in set(normal_cnvs.keys()) | set(abnormal_cnvs.keys()): + combined[chrom] = normal_cnvs[chrom] + abnormal_cnvs[chrom] + combined[chrom].sort(key=lambda C: C["start"]) + self._ensure_no_overlap(combined) + + return combined + + +class VariantAndCnvGroup(object): + def __init__(self, hetsnp_rate): + self._multisamp_cnv = None + self._cellularity = None + self._hetsnp_rate = hetsnp_rate + + def add_variants(self, variants, ref_read_counts, total_read_counts): + self._variants = variants + # Ensure no duoplicates. + assert len(variants) == len(set(variants)) + # Note that self._variant_idxs will change as we filter out variants, + # reflecting only the remaining valid variants. self._variants, however, + # will not change. + self._variant_idxs = list(range(len(variants))) + self._ref_read_counts = ref_read_counts + self._total_read_counts = total_read_counts + # Estimate read depth before any filtering of variants is performed, in + # case no SSMs remain afterward. + self._estimated_read_depth = self._estimate_read_depth() + + def _find_cellularity(self, cnvs): + max_cellular_prevs = np.zeros(len(cnvs)) + + for sampidx, sample_cnvs in enumerate(cnvs): + for chrom_regions in sample_cnvs.values(): + for cnr in chrom_regions: + if cnr["cellular_prevalence"] > max_cellular_prevs[sampidx]: + max_cellular_prevs[sampidx] = cnr["cellular_prevalence"] + + return max_cellular_prevs + + def add_cnvs(self, cn_regions, sex): + self._cellularity = self._find_cellularity(cn_regions) + self._multisamp_cnv = MultisampleCnvCombiner(cn_regions, self._cellularity, sex) + self._sampidxs = self._multisamp_cnv.sampidxs + + def has_cnvs(self): + return self._multisamp_cnv is not None + + def _filter_variants_outside_regions(self, regions, before_label, after_label): + def _is_pos_in_regions(chrom, pos): + for cnv in regions[chrom]: + if cnv["start"] <= pos <= cnv["end"]: + return True + return False + + filtered = [] + + for vidx in self._variant_idxs: + variant = self._variants[vidx] + if _is_pos_in_regions(variant.CHROM, variant.POS): + filtered.append(vidx) + + self._print_variant_differences( + [self._variants[idx] for idx in self._variant_idxs], + [self._variants[idx] for idx in filtered], + before_label, + after_label, + ) + self._variant_idxs = filtered + + def _print_variant_differences(self, before, after, before_label, after_label): + before = set(before) + after = set(after) + log( + "%s=%s %s=%s delta=%s" + % ( + before_label, + len(before), + after_label, + len(after), + len(before) - len(after), + ) + ) + + assert after.issubset(before) + removed = list(before - after) + removed.sort(key=variant_key) + + def _print_region(var): + var_name = "%s_%s" % (var.CHROM, var.POS) + region_type = None + containing_cnv = None + + for cnv in self._multisamp_cnv.load_normal_cnvs()[var.CHROM]: + if cnv["start"] <= var.POS <= cnv["end"]: + region_type = "normal" + containing_cnv = cnv + break + for cnv in self._multisamp_cnv.load_single_abnormal_state_cnvs()[var.CHROM]: + if cnv["start"] <= var.POS <= cnv["end"]: + assert region_type is None and containing_cnv is None + region_type = "abnormal" + containing_cnv = cnv + break + + if containing_cnv is not None: + log( + "%s\t[in %s-CN region chr%s(%s, %s)]" + % ( + var_name, + region_type, + var.CHROM, + containing_cnv["start"], + containing_cnv["end"], + ) + ) + else: + log("%s\t[outside all regions]" % var_name) + + for 
var in removed: + _print_region(var) + + def retain_only_variants_in_normal_cn_regions(self): + if not self.has_cnvs(): + raise Exception("CN regions not yet provided") + + normal_cn = self._multisamp_cnv.load_normal_cnvs() + filtered = self._filter_variants_outside_regions( + normal_cn, "all_variants", "only_normal_cn" + ) + + def exclude_variants_in_multiple_abnormal_or_unlisted_regions(self): + # Battenberg: + # Five possible placements for variant in Battenberg according to CN records: + # 1 record: + # That record has normal CN: include + # That record has abnormal CN: include + # 2 records: + # One record is normal CN, one record is abnormal CN: include + # Both records are abnormal CN: exclude (as we don't know what order the CN events occurred in) + # TITAN: + # In output seen to date, TITAN will only list one record per region. If + # the CN state is abnormal and clonal_frac < 1, this implies the + # remainder of the region will be normal CN. Multiple abnormal records + # for the same region are likely possible, but I haven't yet seen any. + # Regardless, when they occur, they should be properly handled by the + # code. + if not self.has_cnvs(): + raise Exception("CN regions not yet provided") + + # If variant isn't listed in *any* region: exclude (as we suspect CNV + # caller didn't know what to do with the region). + self._filter_variants_outside_regions( + self._multisamp_cnv.load_cnvs(), "all_variants", "within_cn_regions" + ) + + def format_variants( + self, sample_size, error_rate, priority_ssms, only_priority, sex + ): + if sample_size is None: + sample_size = len(self._variant_idxs) + random.shuffle(self._variant_idxs) + + subsampled, nonsubsampled = [], [] + variant_idx_map = {self._variants[idx]: idx for idx in self._variant_idxs} + used_variant_idxs = set() # Use a set for O(1) testing of membership. 
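+        # Priority SSMs claim subsample slots first (up to sample_size); the
+        # remaining shuffled variants then fill whatever slots are left, and
+        # everything else lands in the nonsubsampled set.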
+ + for prissm in priority_ssms: + if prissm not in variant_idx_map: + continue + if len(subsampled) >= sample_size: + break + log("%s_%s in priority" % (prissm.CHROM, prissm.POS)) + varidx = variant_idx_map[prissm] + used_variant_idxs.add(varidx) + subsampled.append(varidx) + + for variant_idx in self._variant_idxs: + if variant_idx in used_variant_idxs: + continue + used_variant_idxs.add(variant_idx) + variant = self._variants[variant_idx] + if (not only_priority) and len(subsampled) < sample_size: + subsampled.append(variant_idx) + else: + nonsubsampled.append(variant_idx) + + assert ( + len(used_variant_idxs) + == len(self._variant_idxs) + == len(subsampled) + len(nonsubsampled) + ) + + subsampled.sort(key=lambda idx: variant_key(self._variants[idx])) + subsampled_variants = get_elements_at_indices(self._variants, subsampled) + subsampled_ref_counts = self._ref_read_counts[subsampled, :] + subsampled_total_counts = self._total_read_counts[subsampled, :] + + nonsubsampled.sort(key=lambda idx: variant_key(self._variants[idx])) + nonsubsampled_variants = get_elements_at_indices(self._variants, nonsubsampled) + nonsubsampled_ref_counts = self._ref_read_counts[nonsubsampled, :] + nonsubsampled_total_counts = self._total_read_counts[nonsubsampled, :] + + formatter = VariantFormatter() + subsampled_formatted = list( + formatter.format_variants( + subsampled_variants, + subsampled_ref_counts, + subsampled_total_counts, + error_rate, + sex, + ) + ) + nonsubsampled_formatted = list( + formatter.format_variants( + nonsubsampled_variants, + nonsubsampled_ref_counts, + nonsubsampled_total_counts, + error_rate, + sex, + ) + ) + + return (subsampled_formatted, nonsubsampled_formatted) + + def write_variants(self, variants, outfn): + with open(outfn, "w") as outf: + print("\t".join(("id", "gene", "a", "d", "mu_r", "mu_v")), file=outf) + for variant in variants: + variant["ref_reads"] = ",".join([str(v) for v in variant["ref_reads"]]) + variant["total_reads"] = ",".join( + [str(v) for v in variant["total_reads"]] + ) + vals = ( + "ssm_id", + "variant_name", + "ref_reads", + "total_reads", + "expected_ref_freq", + "expected_var_freq", + ) + vals = [variant[k] for k in vals] + print("\t".join([str(v) for v in vals]), file=outf) + + def _estimate_read_depth(self): + read_sum = 0 + if len(self._variants) == 0: + default_read_depth = 50 + log( + "No variants available, so fixing read depth at %s." 
+                % default_read_depth
+            )
+            return default_read_depth
+        else:
+            return np.nanmedian(self._total_read_counts, axis=0)
+
+    def write_cnvs(self, variants, outfn):
+        with open(outfn, "w") as outf:
+            print("\t".join(("cnv", "a", "d", "ssms", "physical_cnvs")), file=outf)
+            formatter = CnvFormatter(
+                self._estimated_read_depth, self._sampidxs, self._hetsnp_rate
+            )
+            for cnv in formatter.format_and_merge_cnvs(
+                self._multisamp_cnv.load_single_abnormal_state_cnvs(),
+                variants,
+                self._cellularity,
+            ):
+                overlapping = [",".join(o) for o in cnv["overlapping_variants"]]
+                vals = (
+                    cnv["cnv_id"],
+                    ",".join([str(V) for V in cnv["ref_reads"]]),
+                    ",".join([str(V) for V in cnv["total_reads"]]),
+                    ";".join(overlapping),
+                    cnv["physical_cnvs"],
+                )
+                print("\t".join(vals), file=outf)
+
+
+def log(*msgs):
+    if log.verbose:
+        print(*msgs, file=sys.stderr)
+
+
+log.verbose = False
+
+
+class CnvParser(object):
+    def __init__(self, cn_filename):
+        self._cn_filename = cn_filename
+
+    def parse(self):
+        cn_regions = defaultdict(list)
+
+        with open(self._cn_filename) as cnf:
+            reader = csv.DictReader(cnf, delimiter="\t")
+            for record in reader:
+                chrom = record["chromosome"].upper()
+                del record["chromosome"]
+                for key in ("start", "end", "major_cn", "minor_cn"):
+                    # Some records from Battenberg have major and minor listed as, e.g.,
+                    # "1.0", so cast to float before int.
+                    assert float(record[key]) == int(float(record[key]))
+                    record[key] = int(float(record[key]))
+                record["cellular_prevalence"] = float(record["cellular_prevalence"])
+                cn_regions[chrom].append(record)
+
+        # Ensure CN regions are properly sorted, which we later rely on when
+        # filtering out regions with multiple abnormal CN states.
+        for chrom, regions in cn_regions.items():
+            cn_regions[chrom] = sorted(regions, key=lambda r: r["start"])
+
+        return cn_regions
+
+
+def get_elements_at_indices(L, indices):
+    elem = []
+    for idx in indices:
+        elem.append(L[idx])
+    return elem
+
+
+def parse_priority_ssms(priority_ssm_filename):
+    if priority_ssm_filename is None:
+        return []
+    priority_ssms = []
+    already_seen = set()
+
+    with open(priority_ssm_filename) as priof:
+        for line in priof:
+            chrom, pos = line.strip().split("_", 1)
+            variant = VariantId(CHROM=chrom.upper(), POS=int(pos), ID=None)
+            # Prevent duplicates -- otherwise, we'll add the variant to our
+            # subsampled list of variants twice. This manifested as a problem in the
+            # PCAWG 6cfce053-bfd6-4ca0-b74b-b2e4549e4f1f sample.
+            if variant in already_seen:
+                continue
+            priority_ssms.append(variant)
+            already_seen.add(variant)
+
+    return priority_ssms
+
+
+def impute_missing_total_reads(total_reads, missing_variant_confidence):
+    # Change NaNs to masked values via NumPy's masked-array module (numpy.ma).
+    masked_total_reads = ma.fix_invalid(total_reads)
+
+    # Going forward, suppose you have v variants and s samples in a v*s matrix of
+    # read counts. Missing values are masked.
+
+    # Calculate geometric mean of variant read depth in each sample. Result: s*1
+    sample_means = gmean(masked_total_reads, axis=0)
+    assert np.sum(sample_means <= 0) == np.sum(np.isnan(sample_means)) == 0
+    # Divide every variant's read count by its mean sample read depth to get read
+    # depth enrichment relative to other variants in sample. Result: v*s
+    normalized_to_sample = np.dot(masked_total_reads, np.diag(1.0 / sample_means))
+    # For each variant, calculate geometric mean of its read depth enrichment
+    # across samples. Result: v*1
+    variant_mean_reads = gmean(normalized_to_sample, axis=1)
+    assert np.sum(variant_mean_reads <= 0) == np.sum(np.isnan(variant_mean_reads)) == 0
+
+    # Convert 1D arrays to vectors to permit matrix multiplication.
+    imputed_counts = np.dot(
+        variant_mean_reads.reshape((-1, 1)), sample_means.reshape((1, -1))
+    )
+    nan_coords = np.where(np.isnan(total_reads))
+    total_reads[nan_coords] = imputed_counts[nan_coords]
+    assert np.sum(total_reads <= 0) == np.sum(np.isnan(total_reads)) == 0
+
+    total_reads[nan_coords] *= missing_variant_confidence
+    return np.floor(total_reads).astype(np.int)
+
+
+def impute_missing_ref_reads(ref_reads, total_reads):
+    ref_reads = np.copy(ref_reads)
+
+    assert np.sum(np.isnan(total_reads)) == 0
+    nan_coords = np.where(np.isnan(ref_reads))
+    ref_reads[nan_coords] = total_reads[nan_coords]
+    assert np.sum(np.isnan(ref_reads)) == 0
+
+    return ref_reads.astype(np.int)
+
+
+def is_good_chrom(chrom):
+    # Ignore the following:
+    #   * Variants unmapped ('chrUn') or mapped to fragmented chromosome ('_random')
+    #   * Weird chromosomes from Mutect (e.g., "chr17_ctg5_hap1").
+    #   * Mitochondrial ("mt" or "m"), which are weird
+    #   * Sex chromosomes difficult to deal with, as expected frequency depends on
+    #     whether patient is male or female, so ignore them for now. TODO: fix this.
+    if chrom in [str(i) for i in range(1, 23)] + ["X", "Y"]:
+        return True
+    else:
+        return False
+
+
+def parse_variants(
+    samples, vcf_files, vcf_types, tumor_sample, missing_variant_confidence, muse_tier=0
+):
+    parsed_variants = []
+    all_variant_ids = []
+    num_samples = len(samples)
+
+    for sample in samples:
+        vcf_fn, vcf_type = vcf_files[sample], vcf_types[sample]
+
+        if vcf_type == "sanger":
+            variant_parser = SangerParser(vcf_fn, tumor_sample)
+        elif vcf_type == "mutect_pcawg":
+            variant_parser = MutectPcawgParser(vcf_fn, tumor_sample)
+        elif vcf_type == "mutect_smchet":
+            variant_parser = MutectSmchetParser(vcf_fn, tumor_sample)
+        elif vcf_type == "mutect_tcga":
+            variant_parser = MutectTcgaParser(vcf_fn, tumor_sample)
+        elif vcf_type == "muse":
+            variant_parser = MuseParser(vcf_fn, muse_tier, tumor_sample)
+        elif vcf_type == "dkfz":
+            variant_parser = DKFZParser(vcf_fn, tumor_sample)
+        elif vcf_type == "strelka":
+            variant_parser = StrelkaParser(vcf_fn, tumor_sample)
+        elif vcf_type == "vardict":
+            variant_parser = VarDictParser(vcf_fn, tumor_sample)
+        elif vcf_type == "pcawg_consensus":
+            variant_parser = PcawgConsensusParser(vcf_fn, tumor_sample)
+        elif vcf_type == "somsnip":
+            variant_parser = SomSnipParser(vcf_fn, tumor_sample)
+        elif vcf_type == "maf":
+            variant_parser = MafParser(vcf_fn, tumor_sample)
+        else:
+            raise Exception("Unknown variant type: %s" % vcf_type)
+
+        parsed_variants.append(variant_parser.list_variants())
+        variant_ids = []
+        for single_variant in parsed_variants[-1]:
+            if hasattr(single_variant[0], "ID"):
+                variant_ids.append(
+                    VariantId(
+                        str(single_variant[0].CHROM),
+                        int(single_variant[0].POS),
+                        single_variant[0].ID,
+                    )
+                )
+            else:
+                variant_ids.append(
+                    VariantId(
+                        str(single_variant[0].CHROM), int(single_variant[0].POS), None
+                    )
+                )
+        all_variant_ids += variant_ids
+
+    all_variant_ids = list(set(all_variant_ids))  # Eliminate duplicates.
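+    # Sorting here makes the variant order -- and therefore the row layout of
+    # the read-count matrices built below -- deterministic across runs.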
+    all_variant_ids.sort(key=variant_key)
+    num_variants = len(all_variant_ids)
+    variant_positions = dict(zip(all_variant_ids, range(num_variants)))
+
+    total_read_counts = np.zeros((num_variants, num_samples))
+    total_read_counts.fill(np.nan)
+    ref_read_counts = np.copy(total_read_counts)
+
+    for sample_idx, parsed in enumerate(parsed_variants):
+        for variant, ref_reads, total_reads in parsed:
+            if hasattr(variant, "ID"):
+                variant_id = VariantId(str(variant.CHROM), int(variant.POS), variant.ID)
+            else:
+                variant_id = VariantId(str(variant.CHROM), int(variant.POS), None)
+            variant_idx = variant_positions[variant_id]
+            ref_read_counts[variant_idx, sample_idx] = ref_reads
+            total_read_counts[variant_idx, sample_idx] = total_reads
+
+    total_read_counts = impute_missing_total_reads(
+        total_read_counts, missing_variant_confidence
+    )
+    ref_read_counts = impute_missing_ref_reads(ref_read_counts, total_read_counts)
+    return (all_variant_ids, ref_read_counts, total_read_counts)
+
+
+def infer_sex(variant_ids):
+    num_y_variants = len([V for V in variant_ids if V.CHROM == "Y"])
+    if num_y_variants > 0:
+        return "male"
+    else:
+        return "female"
+
+
+def extract_sample_data(
+    vcf_files_and_samples, vcf_types_and_samples, cnv_files_and_samples
+):
+    vcf_files = {}
+    vcf_types = {}
+    cnv_files = {}
+
+    assert len(vcf_files_and_samples) == len(
+        vcf_types_and_samples
+    ), "Must specify same number of VCF files and VCF types"
+    srcs_and_dsts = [
+        (vcf_files_and_samples, vcf_files),
+        (vcf_types_and_samples, vcf_types),
+    ]
+
+    should_use_cnvs = cnv_files_and_samples is not None
+    if should_use_cnvs:
+        assert len(cnv_files_and_samples) == len(
+            vcf_files_and_samples
+        ), "Must specify same number of VCF and CNV files"
+        srcs_and_dsts.append((cnv_files_and_samples, cnv_files))
+
+    for src, dst in srcs_and_dsts:
+        for combined in src:
+            assert "=" in combined, "%s should be in format <sample>=<value>" % combined
+            sample, val = combined.split("=", 1)
+            dst[sample] = val
+
+    # Sample order will dictate eventual output order.
+    common_samps = reduce(
+        lambda s1, s2: s1 & s2, [set(D[1].keys()) for D in srcs_and_dsts]
+    )
+    ordered_samps = [S.split("=", 1)[0] for S in vcf_files_and_samples]
+    assert len(ordered_samps) == len(common_samps)  # Ensure no duplicates.
+
+    assert (
+        set(vcf_files.keys()) == common_samps
+    ), "VCF file samples (%s) differ from common samples (%s)" % (
+        vcf_files.keys(),
+        common_samps,
+    )
+    assert (
+        set(vcf_types.keys()) == common_samps
+    ), "VCF type samples (%s) differ from common samples (%s)" % (
+        vcf_types.keys(),
+        common_samps,
+    )
+    if should_use_cnvs:
+        assert (
+            set(cnv_files.keys()) == common_samps
+        ), "CNV file samples (%s) differ from common samples (%s)" % (
+            cnv_files.keys(),
+            common_samps,
+        )
+
+    return (ordered_samps, vcf_files, vcf_types, cnv_files)
+
+
+def main():
+    all_vcf_types = set(
+        (
+            "sanger",
+            "mutect_pcawg",
+            "mutect_smchet",
+            "mutect_tcga",
+            "muse",
+            "dkfz",
+            "strelka",
+            "vardict",
+            "pcawg_consensus",
+            "maf",
+        )
+    )
+
+    parser = argparse.ArgumentParser(
+        description="Create ssm_data.txt and cnv_data.txt input files for PhyloWGS from VCF and CNV data.",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+    parser.add_argument(
+        "--vcf-type",
+        dest="vcf_types",
+        action="append",
+        required=True,
+        help="Type of VCF file for each sample, specified as <sample>=<vcf_type>. Valid VCF types are %s."
+ % ",".join(all_vcf_types), + ) + parser.add_argument( + "-e", + "--error-rate", + dest="error_rate", + type=restricted_float, + default=0.001, + help="Expected error rate of sequencing platform", + ) + parser.add_argument( + "--missing-variant-confidence", + dest="missing_variant_confidence", + type=restricted_float, + default=1.0, + help="Confidence in range [0, 1] that SSMs missing from a sample are indeed not present in that sample", + ) + parser.add_argument( + "-s", + "--sample-size", + dest="sample_size", + type=int, + help="Subsample SSMs to reduce PhyloWGS runtime", + ) + parser.add_argument( + "-P", + "--priority-ssms", + dest="priority_ssm_filename", + help='File containing newline-separated list of SSMs in "_" format to prioritize for inclusion', + ) + parser.add_argument( + "--only-priority", + dest="only_priority", + action="store_true", + help="Only sample variants provided on priority list", + ) + parser.add_argument( + "--cnvs", + dest="cnv_files", + action="append", + help="Path to CNV file created with parse_cnvs.py for each sample. Specified as =.", + ) + parser.add_argument( + "--regions", + dest="regions", + choices=("normal_cn", "normal_and_abnormal_cn", "all"), + default="normal_and_abnormal_cn", + help="Which regions to use variants from. Refer to the parser README for more details.", + ) + parser.add_argument( + "--output-cnvs", + dest="output_cnvs", + default="cnv_data.txt", + help="Output destination for CNVs", + ) + parser.add_argument( + "--output-variants", + dest="output_variants", + default="ssm_data.txt", + help="Output destination for variants", + ) + parser.add_argument( + "--output-params", + dest="output_params", + default="params.json", + help="Output destination for run parameters", + ) + parser.add_argument( + "--tumor-sample", + dest="tumor_sample", + help="Name of the tumor sample in the input VCF file. Defaults to last sample if not specified.", + ) + parser.add_argument( + "--muse-tier", + dest="muse_tier", + type=int, + default=0, + help="Maximum MuSE tier to include", + ) + parser.add_argument( + "--nonsubsampled-variants", + dest="output_nonsubsampled_variants", + help="If subsampling, write nonsubsampled variants to separate file, in addition to subsampled variants", + ) + parser.add_argument( + "--nonsubsampled-variants-cnvs", + dest="output_nonsubsampled_variants_cnvs", + help="If subsampling, write CNVs for nonsubsampled variants to separate file", + ) + parser.add_argument( + "--sex", + dest="sex", + default="auto", + choices=("auto", "male", "female"), + help="Sex of patient. Used to adjust expected variant frequencies on sex chromosomes. " + + "If auto, patient is set to male if any variants are provided on the Y chromosome, and female otherwise.", + ) + parser.add_argument( + "--het-snp-rate", + dest="hetsnp_rate", + default=7e-4, + type=float, + help="Average number of heterozygous SNPs per base used to call copy " + + "number. This determines how heavily we weight somatic CNAs relative to " + + "SNVs. Defaults to 7 SNPs per 10 kb, as per Battenberg.", + ) + parser.add_argument("--verbose", dest="verbose", action="store_true") + parser.add_argument( + "vcf_files", + nargs="+", + help="Path to VCF file for each sample. 
+    )
+    args = parser.parse_args()
+
+    log.verbose = args.verbose
+    params = {}
+
+    samples, vcf_files, vcf_types, cnv_files = extract_sample_data(
+        args.vcf_files, args.vcf_types, args.cnv_files
+    )
+    params["samples"], params["vcf_files"], params["vcf_types"], params["cnv_files"] = (
+        samples,
+        vcf_files,
+        vcf_types,
+        cnv_files,
+    )
+    num_samples = len(samples)
+    variant_ids, ref_read_counts, total_read_counts = parse_variants(
+        samples,
+        vcf_files,
+        vcf_types,
+        args.tumor_sample,
+        args.missing_variant_confidence,
+        args.muse_tier,
+    )
+
+    # Fix random seed to ensure same set of SSMs chosen when subsampling on each
+    # invocation.
+    random.seed(1)
+
+    if args.sex == "auto":
+        sex = infer_sex(variant_ids)
+    else:
+        sex = args.sex
+
+    grouper = VariantAndCnvGroup(args.hetsnp_rate)
+    grouper.add_variants(variant_ids, ref_read_counts, total_read_counts)
+
+    if len(cnv_files) > 0:
+        # Load CNV files in same order as sample order given for VCFs.
+        cn_regions = [CnvParser(cnv_files[S]).parse() for S in samples]
+        grouper.add_cnvs(cn_regions, sex)
+
+    if not grouper.has_cnvs():
+        assert (
+            args.regions == "all"
+        ), "If you do not provide CNA data, you must specify --regions=all"
+
+    if args.regions == "normal_cn":
+        grouper.retain_only_variants_in_normal_cn_regions()
+    elif args.regions == "normal_and_abnormal_cn":
+        grouper.exclude_variants_in_multiple_abnormal_or_unlisted_regions()
+    elif args.regions == "all":
+        pass
+    else:
+        raise Exception("Unknown --regions value: %s" % args.regions)
+
+    priority_ssms = parse_priority_ssms(args.priority_ssm_filename)
+
+    subsampled_vars, nonsubsampled_vars = grouper.format_variants(
+        args.sample_size, args.error_rate, priority_ssms, args.only_priority, sex
+    )
+    if len(subsampled_vars) == 0:
+        print("No variants to write", file=sys.stderr)
+        sys.exit(0)
+    grouper.write_variants(subsampled_vars, args.output_variants)
+    if args.output_nonsubsampled_variants:
+        grouper.write_variants(nonsubsampled_vars, args.output_nonsubsampled_variants)
+
+    if grouper.has_cnvs() and args.regions != "normal_cn":
+        # Write CNVs.
+        grouper.write_cnvs(subsampled_vars, args.output_cnvs)
+        if (
+            args.output_nonsubsampled_variants
+            and args.output_nonsubsampled_variants_cnvs
+        ):
+            grouper.write_cnvs(
+                nonsubsampled_vars, args.output_nonsubsampled_variants_cnvs
+            )
+    else:
+        # Write empty CNV file.
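+        # An empty cnv_data.txt is still written, presumably so downstream
+        # steps that expect the file to exist do not fail.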
+ with open(args.output_cnvs, "w"): + pass + + with open(args.output_params, "w") as F: + json.dump(params, F) + + +if __name__ == "__main__": + main() diff --git a/modules/msk/phylowgs/createinput/tests/main.nf.test b/modules/msk/phylowgs/createinput/tests/main.nf.test new file mode 100644 index 0000000..44de107 --- /dev/null +++ b/modules/msk/phylowgs/createinput/tests/main.nf.test @@ -0,0 +1,61 @@ +nextflow_process { + + name "Test Process PHYLOWGS_CREATEINPUT" + script "../main.nf" + process "PHYLOWGS_CREATEINPUT" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "phylowgs" + tag "phylowgs/createinput" + + test("PHYLOWGS_CREATEINPUT - txt") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['unfilteredmaf'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['cnvs_txt'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("PHYLOWGS_CREATEINPUT - txt - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file('unfilteredmaf'), + file('cnvs_txt') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/msk/phylowgs/createinput/tests/main.nf.test.snap b/modules/msk/phylowgs/createinput/tests/main.nf.test.snap new file mode 100644 index 0000000..377f422 --- /dev/null +++ b/modules/msk/phylowgs/createinput/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "PHYLOWGS_CREATEINPUT - txt - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "cnv_data.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "ssm_data.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c1620c9a82e06c6ccd1e8465cc928648" + ], + "phylowgsinput": [ + [ + { + "id": "test", + "single_end": false + }, + "cnv_data.txt:md5,d41d8cd98f00b204e9800998ecf8427e", + "ssm_data.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c1620c9a82e06c6ccd1e8465cc928648" + ] + } + ], + "timestamp": "2024-06-11T18:00:07.554362" + }, + "PHYLOWGS_CREATEINPUT - txt": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "cnv_data.txt:md5,9228c0a7bce478e06db5d4304be3dbf7", + "ssm_data.txt:md5,055b7b5359a50eefd86fc85fcc023c6f" + ] + ], + "1": [ + "versions.yml:md5,c1620c9a82e06c6ccd1e8465cc928648" + ], + "phylowgsinput": [ + [ + { + "id": "test", + "single_end": false + }, + "cnv_data.txt:md5,9228c0a7bce478e06db5d4304be3dbf7", + "ssm_data.txt:md5,055b7b5359a50eefd86fc85fcc023c6f" + ] + ], + "versions": [ + "versions.yml:md5,c1620c9a82e06c6ccd1e8465cc928648" + ] + } + ], + "timestamp": "2024-06-11T18:00:00.11222" + } +} \ No newline at end of file diff --git a/modules/msk/phylowgs/createinput/tests/tags.yml b/modules/msk/phylowgs/createinput/tests/tags.yml new file mode 100644 index 0000000..c7a0c24 --- /dev/null +++ b/modules/msk/phylowgs/createinput/tests/tags.yml @@ -0,0 +1,2 @@ +phylowgs/createinput: + - "modules/msk/phylowgs/createinput/**" diff --git a/modules/msk/phylowgs/multievolve/environment.yml b/modules/msk/phylowgs/multievolve/environment.yml new file mode 100644 index 0000000..d6a0ba5 --- /dev/null +++ b/modules/msk/phylowgs/multievolve/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "phylowgs_multievolve" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "PHYLOWGS" diff --git a/modules/msk/phylowgs/multievolve/main.nf b/modules/msk/phylowgs/multievolve/main.nf new file mode 100644 index 0000000..d7f6e9c --- /dev/null +++ b/modules/msk/phylowgs/multievolve/main.nf @@ -0,0 +1,50 @@ +process PHYLOWGS_MULTIEVOLVE { + tag "$meta.id" + label 'process_high' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/phylowgs:v1.5-msk': + 'docker.io/mskcc/phylowgs:v1.5-msk' }" + + input: + tuple val(meta), path(cnv_data), path(ssm_data) + + output: + tuple val(meta), path("chains/trees.zip") , emit: trees + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def threads = task.cpus * 2 + + """ + python2 \\ + /usr/bin/multievolve.py \\ + ${args} \\ + --num-chains ${threads} \\ + --ssms ${ssm_data} \\ + --cnvs ${cnv_data} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir chains + touch chains/trees.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ +} diff --git a/modules/msk/phylowgs/multievolve/meta.yml b/modules/msk/phylowgs/multievolve/meta.yml new file mode 100644 index 0000000..0ccb999 --- /dev/null +++ b/modules/msk/phylowgs/multievolve/meta.yml @@ -0,0 +1,48 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "phylowgs_multievolve" +description: Create trees from input from phylowgs_createinput +keywords: + - phylowgs + - CNVs + - FACETs +tools: + - "phylowgs_multievolve": + description: "Program to create trees from input from phylowgs_createinput" + homepage: "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8" + tool_dev_url: "https://github.com/mskcc/phylowgs" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - cnv_data: + type: file + description: copy number input data from phylowgs_createinput + pattern: "*.{txt}" + - ssm_data: + type: file + description: mutation input data from phylowgs_createinput + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - trees: + type: file + description: Zip file containing the completed trees + pattern: "trees.zip" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@nikhil" +maintainers: + - "@nikhil" diff --git a/modules/msk/phylowgs/multievolve/tests/main.nf.test b/modules/msk/phylowgs/multievolve/tests/main.nf.test new file mode 100644 index 0000000..c7d4c61 --- /dev/null +++ b/modules/msk/phylowgs/multievolve/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process PHYLOWGS_MULTIEVOLVE" + script "../main.nf" + process "PHYLOWGS_MULTIEVOLVE" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "phylowgs" + tag "phylowgs/multievolve" + + test("phylowgs_multievolve - zip") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file(params.test_data_mskcc['neoantigen']['cnv_data'], checkIfExists: true), + file(params.test_data_mskcc['neoantigen']['ssm_data'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.trees[0][1]).name).match() } + + ) + } + + } + + test("phylowgs_multievolve - zip - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file('cnv_data'), + file('ssm_data') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + + } + + +} diff --git a/modules/msk/phylowgs/multievolve/tests/main.nf.test.snap b/modules/msk/phylowgs/multievolve/tests/main.nf.test.snap new file mode 100644 index 0000000..b7c9029 --- /dev/null +++ b/modules/msk/phylowgs/multievolve/tests/main.nf.test.snap @@ -0,0 +1,39 @@ +{ + "phylowgs_multievolve - zip": { + "content": [ + "trees.zip" + ], + "timestamp": "2024-06-11T18:01:56.174868" + }, + "phylowgs_multievolve - zip - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "trees.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,2eaafdf0e85898fdc2e5d40dbbfee2f7" + ], + "trees": [ + [ + { + "id": "test", + "single_end": false + }, + "trees.zip:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,2eaafdf0e85898fdc2e5d40dbbfee2f7" + ] + } + ], + "timestamp": "2024-06-11T18:02:03.678335" + } +} \ No newline at end of file diff --git a/modules/msk/phylowgs/multievolve/tests/nextflow.config b/modules/msk/phylowgs/multievolve/tests/nextflow.config new file mode 100644 index 0000000..43986ff --- /dev/null +++ b/modules/msk/phylowgs/multievolve/tests/nextflow.config @@ -0,0 +1,10 @@ +params { + enable_conda = false +} + +process { + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + withName: 'PHYLOWGS_MULTIEVOLVE' { + ext.args = '--burnin-samples 2 --mcmc-samples 2' + } +} diff --git a/modules/msk/phylowgs/multievolve/tests/tags.yml b/modules/msk/phylowgs/multievolve/tests/tags.yml new file mode 100644 index 0000000..da6189d --- /dev/null +++ b/modules/msk/phylowgs/multievolve/tests/tags.yml @@ -0,0 +1,2 @@ +phylowgs/multievolve: + - "modules/msk/phylowgs/multievolve/**" diff --git a/modules/msk/phylowgs/parsecnvs/environment.yml b/modules/msk/phylowgs/parsecnvs/environment.yml new file mode 100644 index 0000000..f54f5d8 --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/environment.yml @@ -0,0 +1,9 @@ 
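For orientation: the vendored parse_cnvs.py a little further down converts FACETS segment tables into the CNV records PhyloWGS expects. A minimal sketch of that conversion, using made-up values and the column names from the FacetsParser code below:

# Sketch of the FACETS -> PhyloWGS CNV conversion (illustrative values only).
record = {"chrom": "1", "loc.start": "1000", "loc.end": "5000",
          "tcn.em": "3", "lcn.em": "1", "cf.em": "0.8"}

minor_cn = int(record["lcn.em"])             # lesser (minor) copy number
major_cn = int(record["tcn.em"]) - minor_cn  # total copy number minus minor
cnv = {
    "start": int(record["loc.start"]),
    "end": int(float(record["loc.end"])),
    "major_cn": major_cn,
    "minor_cn": minor_cn,
    "cellular_prevalence": float(record["cf.em"]),
}
assert cnv == {"start": 1000, "end": 5000, "major_cn": 2,
               "minor_cn": 1, "cellular_prevalence": 0.8}

TITAN and Battenberg inputs are reduced to the same record shape; only the source columns differ.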
+--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "phylowgs_parsecnvs" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "PHYLOWGS" diff --git a/modules/msk/phylowgs/parsecnvs/main.nf b/modules/msk/phylowgs/parsecnvs/main.nf new file mode 100644 index 0000000..d7c7618 --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/main.nf @@ -0,0 +1,45 @@ +process PHYLOWGS_PARSECNVS { + tag "$meta.id" + label 'process_low' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/phylowgs:v1.5-msk': + 'docker.io/mskcc/phylowgs:v1.5-msk' }" + + input: + tuple val(meta), path(facetsgenelevel) + + output: + tuple val(meta), path("cnvs.txt"), emit: cnv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + parse_cnvs.py \\ + ${args} \\ + ${facetsgenelevel} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch cnvs.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ +} diff --git a/modules/msk/phylowgs/parsecnvs/meta.yml b/modules/msk/phylowgs/parsecnvs/meta.yml new file mode 100644 index 0000000..43e1b64 --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/meta.yml @@ -0,0 +1,45 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "phylowgs_parsecnvs" +description: parse cnvs from FACETS for input to phylowgs +keywords: + - phylowgs + - CNVs + - FACETs +tools: + - "phylowgs_parsecnvs": + description: "parser to convert FACETs output to phylowgs expected input" + homepage: "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8" + tool_dev_url: "https://github.com/mskcc/phylowgs" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + + - facetsgenelevel: + type: file + description: single sample facets gene level output + pattern: "*.{txt}" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - cnv: + type: file + description: converted cnv file for phylowgs upstream processing + pattern: "*.txt" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@pintoa1-mskcc" +maintainers: + - "@pintoa1-mskcc" diff --git a/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py new file mode 100755 index 0000000..e1fdbdc --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/resources/usr/bin/parse_cnvs.py @@ -0,0 +1,293 @@ +#!/usr/bin/env python2 + +from __future__ import print_function +import argparse +import csv +import sys +from collections import defaultdict + + +def chrom_key(chrom): + chrom = chrom.lower() + if chrom == "x": + chrom = 100 + elif chrom == "y": + chrom = 101 + elif chrom.isdigit(): + chrom = int(chrom) + else: + chrom = 999 + return chrom + + +class CopyNumberWriter(object): + def __init__(self, cn_output_fn): + self._cn_output_fn = cn_output_fn + # cellular_prevalence represents fraction of *all* cells that are affected + # by CNVs, *not* just tumor cells. + self._keys = ( + "chromosome", + "start", + "end", + "copy_number", + "minor_cn", + "major_cn", + "cellular_prevalence", + ) + + def _write_header(self): + self._cn_output.write("\t".join(self._keys) + "\n") + + def _write_cn_record(self, region): + vals = [str(region[k]) for k in self._keys] + self._cn_output.write("\t".join(vals) + "\n") + + def write_cnvs(self, cn_regions): + self._cn_output = open(self._cn_output_fn, "w") + self._write_header() + + chroms = sorted(cn_regions.keys(), key=chrom_key) + for chrom in chroms: + chrom_regions = cn_regions[chrom] + chrom_regions.sort(key=lambda r: r["start"]) + for region in chrom_regions: + # Insert chromosome into record, as including it originally would have + # duplicated the dictionary key corresponding to per-chromosome CNVs. 
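+                # Illustrative example (values assumed): a region {"start": 1000,
+                # "end": 5000, "major_cn": 2, "minor_cn": 1,
+                # "cellular_prevalence": 0.8} on chrom "1" is written out with
+                # copy_number = 2 + 1 = 3.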
+ region["chromosome"] = chrom + region["copy_number"] = region["major_cn"] + region["minor_cn"] + self._write_cn_record(region) + + self._cn_output.close() + + +class CnvParser(object): + def parse(self): + raise Exception("Not implemented") + + +class TitanParser(CnvParser): + def __init__(self, titan_filename, cellularity): + self._titan_filename = titan_filename + self._cellularity = cellularity + + def parse(self): + cn_regions = defaultdict(list) + + with open(self._titan_filename) as titanf: + reader = csv.DictReader(titanf, delimiter="\t") + for record in reader: + chrom = record["Chromosome"].lower() + cnv = {} + cnv["start"] = int(record["Start_Position(bp)"]) + cnv["end"] = int(record["End_Position(bp)"]) + cnv["major_cn"] = int(record["MajorCN"]) + cnv["minor_cn"] = int(record["MinorCN"]) + + clonal_freq = record["Clonal_Frequency"] + if clonal_freq == "NA": + cnv["cellular_prevalence"] = self._cellularity + else: + cnv["cellular_prevalence"] = float(clonal_freq) * self._cellularity + + cn_regions[chrom].append(cnv) + + return cn_regions + + +class FacetsParser(CnvParser): + def __init__(self, fc_filename, cellularity): + self._fc_filename = fc_filename + if cellularity < 1: + self._cellularity = cellularity + else: + self.calc_cellularity() + + def calc_cellularity(self): + max_cellularity = 0 + with open(self._fc_filename) as facetf: + reader = csv.DictReader(facetf, dialect="excel-tab") + for record in reader: + if record["cf.em"] != "NA": + cellularity = float(record["cf.em"]) + if cellularity < 1: + max_cellularity = max(max_cellularity, cellularity) + if max_cellularity > 0: + self._cellularity = max_cellularity + else: + self._cellularity = 1 + + def parse(self): + cn_regions = defaultdict(list) + + with open(self._fc_filename) as facetf: + reader = csv.DictReader(facetf, dialect="excel-tab") + for record in reader: + cnv = {} + if ( + record["tcn.em"] != "NA" + and record["lcn.em"] != "NA" + and record["cf.em"] != "NA" + and str.isdigit(record["tcn.em"]) + and str.isdigit(record["lcn.em"]) + and str.isdigit(record["cf.em"].replace(".", "", 1)) + ): + cnv["cellular_prevalence"] = ( + float(record["cf.em"])) + cnv["minor_cn"] = int(record["lcn.em"]) + cnv["major_cn"] = int(record["tcn.em"]) - cnv["minor_cn"] + chrom = record["chrom"] + cnv["start"] = int(record["loc.start"]) + cnv["end"] = int(float(record["loc.end"])) + cn_regions[chrom].append(cnv) + else: + next + + return cn_regions + + +class BattenbergParser(CnvParser): + def __init__(self, bb_filename, cellularity): + self._bb_filename = bb_filename + self._cellularity = cellularity + # Used by SMC-Het parser, which has fields shifted by 1. + self._field_offset = 0 + + def _compute_cn(self, cnv1, cnv2): + """ + This code isn't used, but is retained for reference. 
+ """ + cn1 = (cnv1["nmaj"] + cnv1["nmin"]) * cnv1["frac"] + if cnv2: + cn2 = (cnv2["nmaj"] + cnv2["nmin"]) * cnv2["frac"] + else: + cn2 = 0 + total_cn = cn1 + cn2 + return total_cn + + def parse(self): + cn_regions = defaultdict(list) + pval_threshold = 0.05 + + with open(self._bb_filename) as bbf: + header = bbf.next() + for line in bbf: + fields = line.strip().split() + chrom = fields[1 + self._field_offset].lower() + start = int(fields[2 + self._field_offset]) + end = int(fields[3 + self._field_offset]) + pval = float(fields[5 + self._field_offset]) + + cnv1 = {} + cnv1["start"] = start + cnv1["end"] = end + cnv1["major_cn"] = int(fields[8 + self._field_offset]) + cnv1["minor_cn"] = int(fields[9 + self._field_offset]) + cnv1["cellular_prevalence"] = ( + float(fields[10 + self._field_offset]) * self._cellularity + ) + + cnv2 = None + # Stefan's comment on p values: The p-values correspond "to whether a + # segment should be clonal or subclonal copynumber. We first fit a + # clonal copynumber profile for the whole sample and then perform a + # simple two-sided t-test twhere the null hypothesis is: A particular + # segment is clonal. And the alternative: It is subclonal." + # + # Thus: if t-test falls below significance threshold, we push cnv1 to + # clonal frequency. + if pval <= pval_threshold: + cnv2 = {} + cnv2["start"] = start + cnv2["end"] = end + cnv2["major_cn"] = int(fields[11 + self._field_offset]) + cnv2["minor_cn"] = int(fields[12 + self._field_offset]) + cnv2["cellular_prevalence"] = ( + float(fields[13 + self._field_offset]) * self._cellularity + ) + else: + cnv1["cellular_prevalence"] = self._cellularity + + if cnv1["start"] >= cnv1["end"] or ( + cnv2 is not None and cnv2["start"] >= cnv2["end"] + ): + continue + + cn_regions[chrom].append(cnv1) + if cnv2 is not None: + cn_regions[chrom].append(cnv2) + return cn_regions + + +class BattenbergSmchetParser(BattenbergParser): + def __init__(self, bb_filename, cellularity): + super(BattenbergSmchetParser, self).__init__(bb_filename, cellularity) + # SMC-Het Battenberg files lack the initial index column. + self._field_offset = -1 + + +def restricted_float(x): + x = float(x) + if x < 0.0 or x > 1.0: + raise argparse.ArgumentTypeError("%r not in range [0.0, 1.0]" % x) + return x + + +def main(): + parser = argparse.ArgumentParser( + description="Create CNV input file for parser from Battenberg or TITAN data", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "-f", + "--cnv-format", + dest="input_type", + required=True, + choices=("battenberg", "battenberg-smchet", "titan", "facets"), + help="Type of CNV input", + ) + parser.add_argument( + "-c", + "--cellularity", + dest="cellularity", + type=float, + required=False, + default=1, + help="Fraction of sample that is cancerous rather than somatic. Used only for estimating CNV confidence -- if no CNVs, need not specify argument.", + ) + parser.add_argument( + "--cnv-output", + dest="cnv_output_filename", + default="cnvs.txt", + help="Output destination for parsed CNVs", + ) + parser.add_argument("cnv_file") + args = parser.parse_args() + + if args.cellularity > 1.0: + print( + "Cellularity for %s is %s. Setting to 1.0." 
+ % (args.cnv_file, args.cellularity), + file=sys.stderr, + ) + cellularity = 1.0 + else: + cellularity = args.cellularity + + if args.input_type == "battenberg": + parser = BattenbergParser(args.cnv_file, cellularity) + elif args.input_type == "battenberg-smchet": + parser = BattenbergSmchetParser(args.cnv_file, cellularity) + elif args.input_type == "titan": + parser = TitanParser(args.cnv_file, cellularity) + elif args.input_type == "facets": + parser = FacetsParser(args.cnv_file, cellularity) + else: + raise Exception("Unknown input type") + + writer = CopyNumberWriter(args.cnv_output_filename) + regions = parser.parse() + writer.write_cnvs(regions) + + +if __name__ == "__main__": + main() diff --git a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test new file mode 100644 index 0000000..210112a --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test @@ -0,0 +1,63 @@ +nextflow_process { + + name "Test Process PHYLOWGS_PARSECNVS" + script "../main.nf" + process "PHYLOWGS_PARSECNVS" + config "./nextflow.config" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "phylowgs" + tag "phylowgs/parsecnvs" + + test("phylowgs_parsecnvs - txt") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file(params.test_data_mskcc['neoantigen']['facets_hisens_cncf_txt'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + + } + + test("phylowgs_parsecnvs - txt - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map + file('facets_hisens_cncf_txt') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + + ) + } + +} + + +} diff --git a/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap new file mode 100644 index 0000000..7d9d8ad --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/tests/main.nf.test.snap @@ -0,0 +1,64 @@ +{ + "phylowgs_parsecnvs - txt": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7" + ] + ], + "1": [ + "versions.yml:md5,3fad79b7762271d747946e18f665089c" + ], + "cnv": [ + [ + { + "id": "test", + "single_end": false + }, + "cnvs.txt:md5,2f7b4d8ceff8a9505c4b69e2b1e9ccc7" + ] + ], + "versions": [ + "versions.yml:md5,3fad79b7762271d747946e18f665089c" + ] + } + ], + "timestamp": "2024-07-08T12:15:00.257251" + }, + "phylowgs_parsecnvs - txt - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "cnvs.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,3fad79b7762271d747946e18f665089c" + ], + "cnv": [ + [ + { + "id": "test", + "single_end": false + }, + "cnvs.txt:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,3fad79b7762271d747946e18f665089c" + ] + } + ], + "timestamp": "2024-07-08T12:15:19.111149" + } +} \ No newline at end of file diff --git a/modules/msk/phylowgs/parsecnvs/tests/nextflow.config b/modules/msk/phylowgs/parsecnvs/tests/nextflow.config new file mode 100644 index 0000000..31236ee --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/tests/nextflow.config @@ -0,0 +1,10 @@ +params { + enable_conda = false +} + +process { + publishDir = { 
"${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + withName: 'PHYLOWGS_PARSECNVS' { + ext.args = '-f facets' + } +} diff --git a/modules/msk/phylowgs/parsecnvs/tests/tags.yml b/modules/msk/phylowgs/parsecnvs/tests/tags.yml new file mode 100644 index 0000000..291c657 --- /dev/null +++ b/modules/msk/phylowgs/parsecnvs/tests/tags.yml @@ -0,0 +1,2 @@ +phylowgs/parsecnvs: + - "modules/msk/phylowgs/parsecnvs/**" diff --git a/modules/msk/phylowgs/writeresults/environment.yml b/modules/msk/phylowgs/writeresults/environment.yml new file mode 100644 index 0000000..8aed198 --- /dev/null +++ b/modules/msk/phylowgs/writeresults/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +name: "phylowgs_writeresults" +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - "PHYLOWGS" diff --git a/modules/msk/phylowgs/writeresults/main.nf b/modules/msk/phylowgs/writeresults/main.nf new file mode 100644 index 0000000..33b98a1 --- /dev/null +++ b/modules/msk/phylowgs/writeresults/main.nf @@ -0,0 +1,54 @@ +process PHYLOWGS_WRITERESULTS { + tag "$meta.id" + label 'process_medium' + + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'docker://mskcc/phylowgs:v1.5-msk': + 'docker.io/mskcc/phylowgs:v1.5-msk' }" + + input: + tuple val(meta), path(trees) + + output: + tuple val(meta), path("*.summ.json.gz") , emit: summ + tuple val(meta), path("*.muts.json.gz") , emit: muts + tuple val(meta), path("*.mutass.zip") , emit: mutass + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + + """ + python2 \\ + /usr/bin/write_results.py \\ + ${args} \\ + --include-ssm-names \\ + ${prefix} \\ + ${trees} \\ + ${prefix}.summ.json.gz \\ + ${prefix}.muts.json.gz \\ + ${prefix}.mutass.zip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.summ.json.gz + touch ${prefix}.muts.json.gz + touch ${prefix}.mutass.zip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + phylowgs: \$PHYLOWGS_TAG + END_VERSIONS + """ +} diff --git a/modules/msk/phylowgs/writeresults/meta.yml b/modules/msk/phylowgs/writeresults/meta.yml new file mode 100644 index 0000000..8829447 --- /dev/null +++ b/modules/msk/phylowgs/writeresults/meta.yml @@ -0,0 +1,52 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "phylowgs_writeresults" +description: Write results from trees from phylowgs_multievolve +keywords: + - phylowgs + - CNVs + - FACETs +tools: + - "phylowgs_writeresults": + description: "Write results from trees from phylowgs_multievolve" + homepage: "https://genomebiology.biomedcentral.com/articles/10.1186/s13059-015-0602-8" + tool_dev_url: "https://github.com/mskcc/phylowgs" + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - trees: + type: file + description: zip folder containing tree data from multievolve + pattern: "*.zip" + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - summ: + type: file + description: Output file for JSON-formatted tree summaries + pattern: "*.summ.json.gz" + - muts: + type: file + description: Output file for JSON-formatted list of mutations + pattern: "*.muts.json.gz" + - mutass: + type: file + description: Output file for JSON-formatted list of SSMs and CNVs + pattern: "*.mutass.zip" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@nikhil" +maintainers: + - "@nikhil" diff --git a/modules/msk/phylowgs/writeresults/tests/main.nf.test b/modules/msk/phylowgs/writeresults/tests/main.nf.test new file mode 100644 index 0000000..8d2449f --- /dev/null +++ b/modules/msk/phylowgs/writeresults/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + + name "Test Process PHYLOWGS_WRITERESULTS" + script "../main.nf" + process "PHYLOWGS_WRITERESULTS" + + tag "modules" + tag "modules_nfcore" + tag "modules_msk" + tag "phylowgs" + tag "phylowgs/writeresults" + + test("phylowgs_writeresults - zip") { + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file(params.test_data_mskcc['neoantigen']['trees'], checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions, + file(process.out.summ[0][1]).name, + file(process.out.muts[0][1]).name, + file(process.out.mutass[0][1]).name).match() + } + + ) + } + + } + + test("phylowgs_writeresults - zip - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test', single_end:false ], // meta map, + file('tree_data') + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions, + file(process.out.summ[0][1]).name, + file(process.out.muts[0][1]).name, + file(process.out.mutass[0][1]).name).match() + } + + ) + } + + } + + +} diff --git a/modules/msk/phylowgs/writeresults/tests/main.nf.test.snap b/modules/msk/phylowgs/writeresults/tests/main.nf.test.snap new file mode 100644 index 0000000..61518b0 --- /dev/null +++ b/modules/msk/phylowgs/writeresults/tests/main.nf.test.snap @@ -0,0 +1,24 @@ +{ + "phylowgs_writeresults - zip - stub": { + "content": [ + [ + "versions.yml:md5,2f91a4391e3a459ac03c94123651b6ed" + ], + "test.summ.json.gz", + "test.muts.json.gz", + "test.mutass.zip" + ], + "timestamp": "2024-06-11T18:02:20.644108" + }, + "phylowgs_writeresults - zip": { + "content": [ + [ + "versions.yml:md5,2f91a4391e3a459ac03c94123651b6ed" + ], + "test.summ.json.gz", + "test.muts.json.gz", + "test.mutass.zip" + ], + "timestamp": "2024-06-11T18:02:13.457864" + } +} \ No newline at end of file diff --git a/modules/msk/phylowgs/writeresults/tests/tags.yml b/modules/msk/phylowgs/writeresults/tests/tags.yml new file mode 100644 index 0000000..245c702 --- /dev/null +++ b/modules/msk/phylowgs/writeresults/tests/tags.yml @@ -0,0 +1,2 @@ +phylowgs/writeresults: + - "modules/msk/phylowgs/writeresults/**" diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index 9e19a74..0000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,55 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? [[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') - """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - $args \\ - --threads $task.cpus \\ - $renamed_files - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index ee5507e..0000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,57 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. - It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - html: - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index 70edae4..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,212 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("sarscov2 single-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
<div id="header_filename">Mon 2 Oct 2023</div><br/>test.gz
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_single") } - ) - } - } - - test("sarscov2 paired-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_paired") } - ) - } - } - - test("sarscov2 interleaved [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_interleaved") } - ) - } - } - - test("sarscov2 paired-end [bam]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_bam") } - ) - } - } - - test("sarscov2 multiple [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert 
process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, - { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, - { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_multiple") } - ) - } - } - - test("sarscov2 custom_prefix") { - - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - - { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - - { assert snapshot(process.out.versions).match("fastqc_versions_custom_prefix") } - ) - } - } - - test("sarscov2 single-end [fastq] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out.html.collect { file(it[1]).getName() } + - process.out.zip.collect { file(it[1]).getName() } + - process.out.versions ).match("fastqc_stub") } - ) - } - } - -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 86f7c31..0000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,88 +0,0 @@ -{ - "fastqc_versions_interleaved": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:07.293713" - }, - "fastqc_stub": { - "content": [ - [ - "test.html", - "test.zip", - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:31:01.425198" - }, - "fastqc_versions_multiple": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:55.797907" - }, - "fastqc_versions_bam": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:40:26.795862" - }, - "fastqc_versions_single": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:27.043675" - }, - "fastqc_versions_paired": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" 
- ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:39:47.584191" - }, - "fastqc_versions_custom_prefix": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.8.4", - "nextflow": "23.10.1" - }, - "timestamp": "2024-01-31T17:41:14.576531" - } -} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294..0000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/multiqc/environment.yml index ca39fb6..2121492 100644 --- a/modules/nf-core/multiqc/environment.yml +++ b/modules/nf-core/multiqc/environment.yml @@ -4,4 +4,4 @@ channels: - bioconda - defaults dependencies: - - bioconda::multiqc=1.21 + - bioconda::multiqc=1.23 diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf index 47ac352..459dfea 100644 --- a/modules/nf-core/multiqc/main.nf +++ b/modules/nf-core/multiqc/main.nf @@ -3,14 +3,16 @@ process MULTIQC { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.21--pyhdfd78af_0' : - 'biocontainers/multiqc:1.21--pyhdfd78af_0' }" + 'https://depot.galaxyproject.org/singularity/multiqc:1.23--pyhdfd78af_0' : + 'biocontainers/multiqc:1.23--pyhdfd78af_0' }" input: path multiqc_files, stageAs: "?/*" path(multiqc_config) path(extra_multiqc_config) path(multiqc_logo) + path(replace_names) + path(sample_names) output: path "*multiqc_report.html", emit: report @@ -26,6 +28,8 @@ process MULTIQC { def config = multiqc_config ? "--config $multiqc_config" : '' def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' def logo = multiqc_logo ? /--cl-config 'custom_logo: "${multiqc_logo}"'/ : '' + def replace = replace_names ? "--replace-names ${replace_names}" : '' + def samples = sample_names ? "--sample-names ${sample_names}" : '' """ multiqc \\ --force \\ @@ -33,6 +37,8 @@ process MULTIQC { $config \\ $extra_config \\ $logo \\ + $replace \\ + $samples \\ . cat <<-END_VERSIONS > versions.yml diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml index 45a9bc3..382c08c 100644 --- a/modules/nf-core/multiqc/meta.yml +++ b/modules/nf-core/multiqc/meta.yml @@ -29,6 +29,19 @@ input: type: file description: Optional logo file for MultiQC pattern: "*.{png}" + - replace_names: + type: file + description: | + Optional two-column sample renaming file. First column a set of + patterns, second column a set of corresponding replacements. Passed via + MultiQC's `--replace-names` option. + pattern: "*.{tsv}" + - sample_names: + type: file + description: | + Optional TSV file with headers, passed to the MultiQC --sample_names + argument. 
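+        (passed on the command line as `--sample-names`, per the main.nf change above)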
+ pattern: "*.{tsv}" output: - report: type: file diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test index f1c4242..6aa27f4 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ b/modules/nf-core/multiqc/tests/main.nf.test @@ -17,6 +17,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -41,6 +43,8 @@ nextflow_process { input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } @@ -66,6 +70,8 @@ nextflow_process { input[1] = [] input[2] = [] input[3] = [] + input[4] = [] + input[5] = [] """ } } diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index bfebd80..45e95e5 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-29T08:48:55.657331" + "timestamp": "2024-07-10T12:41:34.562023" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-29T08:49:49.071937" + "timestamp": "2024-07-10T11:27:11.933869532" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,87904cd321df21fac35d18f0fc01bb19" ] ], "meta": { "nf-test": "0.8.4", - "nextflow": "23.10.1" + "nextflow": "24.04.2" }, - "timestamp": "2024-02-29T08:49:25.457567" + "timestamp": "2024-07-10T11:26:56.709849369" } } \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index bb3ed4c..c7b308c 100644 --- a/nextflow.config +++ b/nextflow.config @@ -168,6 +168,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + prod { includeConfig 'conf/prod.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 0000000..0ebde13 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,13 @@ +config { + // location for all nf-tests + testsDir '.' + + // nf-test directory including temporary files for each test + workDir System.getenv('NXF_TEST_DIR') ?: '.nf-test' + + // location of an optional nextflow.config file specific for executing tests + configFile 'tests/config/nf-test.config' + + // run all test with the defined docker profile from the main nextflow.config + profile '' +} diff --git a/pyproject.toml b/pyproject.toml index 5611062..0d62beb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,15 +1,10 @@ -# Config file for Python. Mostly used to configure linting of bin/*.py with Ruff. +# Config file for Python. Mostly used to configure linting of bin/check_samplesheet.py with Black. # Should be kept the same as nf-core/tools to avoid fighting with template synchronisation. 
-[tool.ruff] +[tool.black] line-length = 120 -target-version = "py38" -cache-dir = "~/.cache/ruff" +target_version = ["py37", "py38", "py39", "py310"] -[tool.ruff.lint] -select = ["I", "E1", "E4", "E7", "E9", "F", "UP", "N"] - -[tool.ruff.lint.isort] -known-first-party = ["nf_core"] - -[tool.ruff.lint.per-file-ignores] -"__init__.py" = ["E402", "F401"] +[tool.isort] +profile = "black" +known_first_party = ["nf_core"] +multi_line_output = 3 diff --git a/subworkflows/local/utils_nfcore_neoantigenpipeline_pipeline/main.nf b/subworkflows/local/utils_nfcore_neoantigenpipeline_pipeline/main.nf index b2eeca8..86d4c91 100644 --- a/subworkflows/local/utils_nfcore_neoantigenpipeline_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_neoantigenpipeline_pipeline/main.nf @@ -83,23 +83,16 @@ workflow PIPELINE_INITIALISATION { Channel .fromSamplesheet("input") .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { - validateInputSamplesheet(it) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] + meta, maf, facets_gene, hla_file -> + [meta, maf, facets_gene, hla_file] + } .set { ch_samplesheet } + + + + emit: samplesheet = ch_samplesheet versions = ch_versions diff --git a/subworkflows/msk/neoantigen_editing/main.nf b/subworkflows/msk/neoantigen_editing/main.nf new file mode 100644 index 0000000..3374728 --- /dev/null +++ b/subworkflows/msk/neoantigen_editing/main.nf @@ -0,0 +1,28 @@ +include { NEOANTIGENEDITING_ALIGNTOIEDB } from '../../../modules/msk/neoantigenediting/aligntoiedb' +include { NEOANTIGENEDITING_COMPUTEFITNESS } from '../../../modules/msk/neoantigenediting/computefitness' + + +workflow NEOANTIGEN_EDITING { + + take: + neoantigenInput_ch + iedbfasta + + main: + + ch_versions = Channel.empty() + + NEOANTIGENEDITING_ALIGNTOIEDB (neoantigenInput_ch, iedbfasta) + ch_versions = ch_versions.mix(NEOANTIGENEDITING_ALIGNTOIEDB.out.versions.first()) + + + ch_computeFitnessIn = neoantigenInput_ch.combine(NEOANTIGENEDITING_ALIGNTOIEDB.out.iedb_alignment, by: [0]) + + NEOANTIGENEDITING_COMPUTEFITNESS ( ch_computeFitnessIn ) + + ch_versions = ch_versions.mix(NEOANTIGENEDITING_COMPUTEFITNESS.out.versions.first()) + + emit: + annotated_output = NEOANTIGENEDITING_COMPUTEFITNESS.out.annotated_output // channel: [ val(meta), [ annotated_json ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/msk/neoantigen_editing/meta.yml b/subworkflows/msk/neoantigen_editing/meta.yml new file mode 100644 index 0000000..9f1cd64 --- /dev/null +++ b/subworkflows/msk/neoantigen_editing/meta.yml @@ -0,0 +1,40 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "neoantigen_editing" +description: Compute fitness and quality of the neoantigens +keywords: + - neoantigenediting + - neoantigens + - fitness +components: + - neoantigenediting/computefitness + - neoantigenediting/aligntoiedb +input: + - neoantigenInput_ch: + type: file + description: | + The input channel containing the json formatted for NeoantigenEditing by the neoantigeninput module + Structure: [ val(meta), path(json) ] + pattern: "*.{json}" + - iedbfasta: + type: file + description: | + The input channel containing the IEDB fasta file + Structure: [ val(meta), path(fasta) ] + pattern: "*.{fasta}" +output: + - annotated_output: + type: file + description: | + 
Channel containing annpotated json output with neoantigen quality + Structure: [ val(meta), [ annotated_json ] ] + pattern: "*.json" + - versions: + type: file + description: | + File containing software versions + Structure: [ path(versions.yml) ] + pattern: "versions.yml" +authors: + - "@johnoooh" +maintainers: + - "@johnoooh" diff --git a/subworkflows/msk/neoantigen_editing/tests/main.nf.test b/subworkflows/msk/neoantigen_editing/tests/main.nf.test new file mode 100644 index 0000000..349be79 --- /dev/null +++ b/subworkflows/msk/neoantigen_editing/tests/main.nf.test @@ -0,0 +1,73 @@ +nextflow_workflow { + + name "Test Workflow NEOANTIGEN_EDITING" + script "../main.nf" + workflow "NEOANTIGEN_EDITING" + + tag "subworkflows" + tag "subworkflows_nfcore" + tag "subworkflows_msk" + tag "subworkflows/neoantigen_editing" + tag "neoantigeninput" + tag "neoantigenediting/aligntoiedb" + tag "neoantigenediting/computefitness" + + + test("neoantigen_editing - json") { + + when { + workflow { + """ + input[0] = Channel.value([ + [ id:'test', single_end:false ], // meta map, + file(params.test_data_mskcc['neoantigen']['patient_data'], checkIfExists: true) + ]) + input[1] = Channel.value(file(params.test_data_mskcc['neoantigen']['iedb']['iedb_fasta'], checkIfExists: true)) + + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + workflow.out.versions, + file(workflow.out.annotated_output[0][1]).name + ).match() + } + ) + + } + } + + test("neoantigen_editing - json - stub") { + + options "-stub" + + when { + workflow { + """ + input[0] = Channel.value([ + [ id:'test', single_end:false ], // meta map, + file('patient_data') + ]) + input[1] = Channel.value(file('iedb_fasta')) + + """ + } + } + + then { + assertAll( + { assert workflow.success}, + { assert snapshot( + workflow.out.versions, + file(workflow.out.annotated_output[0][1]).name + ).match() + } + + ) + } + } +} diff --git a/subworkflows/msk/neoantigen_editing/tests/main.nf.test.snap b/subworkflows/msk/neoantigen_editing/tests/main.nf.test.snap new file mode 100644 index 0000000..6ccf1d5 --- /dev/null +++ b/subworkflows/msk/neoantigen_editing/tests/main.nf.test.snap @@ -0,0 +1,22 @@ +{ + "neoantigen_editing - json - stub": { + "content": [ + [ + "versions.yml:md5,03ac4fd1eeae826979dfbed8a7b3186c", + "versions.yml:md5,3ce37ca54350a7f641e05f0de9f87f7f" + ], + "patient_data_annotated.json" + ], + "timestamp": "2024-06-19T14:59:58.874516" + }, + "neoantigen_editing - json": { + "content": [ + [ + "versions.yml:md5,03ac4fd1eeae826979dfbed8a7b3186c", + "versions.yml:md5,3ce37ca54350a7f641e05f0de9f87f7f" + ], + "3_OLTS_primary_tumor_annotated.json" + ], + "timestamp": "2024-06-19T14:59:52.198482" + } +} \ No newline at end of file diff --git a/subworkflows/msk/neoantigen_editing/tests/tags.yml b/subworkflows/msk/neoantigen_editing/tests/tags.yml new file mode 100644 index 0000000..e7dbab4 --- /dev/null +++ b/subworkflows/msk/neoantigen_editing/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/neoantigen_editing: + - subworkflows/msk/neoantigen_editing/** diff --git a/subworkflows/msk/netmhcstabandpan/main.nf b/subworkflows/msk/netmhcstabandpan/main.nf new file mode 100644 index 0000000..ea8247c --- /dev/null +++ b/subworkflows/msk/netmhcstabandpan/main.nf @@ -0,0 +1,95 @@ +include { NEOANTIGENUTILS_GENERATEHLASTRING } from '../../../modules/msk/neoantigenutils/generatehlastring/main' +include { NEOANTIGENUTILS_GENERATEMUTFASTA } from '../../../modules/msk/neoantigenutils/generatemutfasta/main' +include { NETMHCPAN } from 
'../../../modules/msk/netmhcpan/main' +include { NETMHCSTABPAN } from '../../../modules/msk/netmhcstabpan/main' +include { NEOANTIGENUTILS_FORMATNETMHCPAN } from '../../../modules/msk/neoantigenutils/formatnetmhcpan/main' + +workflow NETMHCSTABANDPAN { + + take: + + ch_maf_and_hla // channel: [ val(meta), maf, hla ] + ch_cds_and_cdna // channel: [ cfs, cdna] + + main: + + ch_versions = Channel.empty() + + ch_hla = ch_maf_and_hla + .map{ + new Tuple(it[0],it[2]) + } + + + ch_maf = ch_maf_and_hla + .map{ + new Tuple(it[0],it[1]) + } + + + NEOANTIGENUTILS_GENERATEHLASTRING( ch_hla ) + + + ch_versions = ch_versions.mix(NEOANTIGENUTILS_GENERATEHLASTRING.out.versions) + + NEOANTIGENUTILS_GENERATEMUTFASTA( ch_maf, ch_cds_and_cdna ) + + ch_versions = ch_versions.mix(NEOANTIGENUTILS_GENERATEMUTFASTA.out.versions) + + ch_netmhcinput = createNETMHCInput(NEOANTIGENUTILS_GENERATEMUTFASTA.out.wt_fasta, + NEOANTIGENUTILS_GENERATEMUTFASTA.out.mut_fasta, + NEOANTIGENUTILS_GENERATEHLASTRING.out.hlastring + ) + + + NETMHCPAN( ch_netmhcinput ) + + ch_versions = ch_versions.mix(NETMHCPAN.out.versions) + + NETMHCSTABPAN( ch_netmhcinput ) + + ch_versions = ch_versions.mix(NETMHCSTABPAN.out.versions) + + merged_pan_and_stab = NETMHCPAN.out.netmhcpanoutput.mix(NETMHCSTABPAN.out.netmhcstabpanoutput) + + NEOANTIGENUTILS_FORMATNETMHCPAN( merged_pan_and_stab ) + + ch_versions = ch_versions.mix( NEOANTIGENUTILS_FORMATNETMHCPAN.out.versions ) + + + + emit: + + tsv = NEOANTIGENUTILS_FORMATNETMHCPAN.out.netMHCpanreformatted // channel: [ val(meta), [ tsv ] ] + xls = NETMHCPAN.out.xls // channel: [ val(meta), [ xls ] ] + mut_fasta = NEOANTIGENUTILS_GENERATEMUTFASTA.out.mut_fasta // channel: [ val(meta), [ *.MUT_sequences.fa ] ] + wt_fasta = NEOANTIGENUTILS_GENERATEMUTFASTA.out.wt_fasta // channel: [ val(meta), [ *.WT_sequences.fa ] ] + versions = ch_versions // channel: [ versions.yml ] +} + +def createNETMHCInput(wt_fasta, mut_fasta, hla) { + mut_fasta_channel = mut_fasta + .map{ + new Tuple(it[0].id,it) + } + wt_fasta_channel = wt_fasta + .map{ + new Tuple(it[0].id,it) + } + hla_channel = hla + .map{ + new Tuple(it[0].id,it) + } + merged_mut = mut_fasta_channel + .join(hla_channel) + .map{ + new Tuple(it[1][0], it[1][1],it[2][1],"MUT") + } + merged_wt = wt_fasta_channel + .join(hla_channel) + .map{ + new Tuple(it[1][0], it[1][1],it[2][1],"WT") + } + merged = merged_mut.mix(merged_wt) + return merged +} diff --git a/subworkflows/msk/netmhcstabandpan/meta.yml b/subworkflows/msk/netmhcstabandpan/meta.yml new file mode 100644 index 0000000..5de706c --- /dev/null +++ b/subworkflows/msk/netmhcstabandpan/meta.yml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "netmhcstabandpan" +description: Run netmhcpan and netmhcstabpan in parallel. 
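+# Wiring note (inferred from main.nf above): createNETMHCInput joins the MUT and
+# WT fastas with the HLA string by meta.id and emits one tuple per fasta, tagged
+# "MUT" or "WT", so NETMHCPAN and NETMHCSTABPAN each run twice per sample.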
diff --git a/subworkflows/msk/netmhcstabandpan/meta.yml b/subworkflows/msk/netmhcstabandpan/meta.yml
new file mode 100644
index 0000000..5de706c
--- /dev/null
+++ b/subworkflows/msk/netmhcstabandpan/meta.yml
@@ -0,0 +1,62 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
+name: "netmhcstabandpan"
+description: Run netmhcpan and netmhcstabpan in parallel.
+keywords:
+  - peptides
+  - netmhc
+  - neoantigen
+  - tsv
+components:
+  - neoantigenutils/generatehlastring
+  - neoantigenutils/generatemutfasta
+  - netmhcpan
+  - netmhcstabpan
+  - neoantigenutils/formatnetmhcpan
+input:
+  - ch_maf_and_hla:
+      type: file
+      description: |
+        The input channel containing the maf and hla files
+        Structure: [ val(meta), path(maf), path(hla) ]
+      pattern: "*.{maf,txt}"
+  - ch_cds_and_cdna:
+      type: file
+      description: |
+        The resource channel containing the cds and cdna files
+        Structure: [ path(cds), path(cdna) ]
+      pattern: "*.{fa.gz}"
+output:
+  - tsv:
+      type: file
+      description: |
+        Channel containing TSV files
+        Structure: [ val(meta), path(tsv) ]
+      pattern: "*.tsv"
+  - xls:
+      type: file
+      description: |
+        Channel containing XLS files
+        Structure: [ val(meta), path(xls) ]
+      pattern: "*.xls"
+  - mut_fasta:
+      type: file
+      description: |
+        Channel containing the MUT fasta files
+        Structure: [ val(meta), path(mut_fasta) ]
+      pattern: "*.fa"
+  - wt_fasta:
+      type: file
+      description: |
+        Channel containing the WT fasta files
+        Structure: [ val(meta), path(wt_fasta) ]
+      pattern: "*.fa"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+authors:
+  - "@nikhil"
+maintainers:
+  - "@nikhil"
diff --git a/subworkflows/msk/netmhcstabandpan/tests/main.nf.test b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test
new file mode 100644
index 0000000..74c750e
--- /dev/null
+++ b/subworkflows/msk/netmhcstabandpan/tests/main.nf.test
@@ -0,0 +1,83 @@
+nextflow_workflow {
+
+    name "Test Subworkflow NETMHCSTABANDPAN"
+    script "../main.nf"
+    workflow "NETMHCSTABANDPAN"
+
+    tag "subworkflows"
+    tag "subworkflows_nfcore"
+    tag "subworkflows_msk"
+    tag "subworkflows/netmhcstabandpan"
+    tag "netmhcstabandpan"
+    tag "neoantigenutils/generatehlastring"
+    tag "neoantigenutils/generatemutfasta"
+    tag "netmhcpan"
+    tag "netmhcstabpan"
+    tag "neoantigenutils/formatnetmhcpan"
+
+    test("netmhcstabandpan - tsv,xls,fa") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.value([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.test_data_mskcc['neoantigen']['temp_test_short_maf'], checkIfExists: true),
+                    file(params.test_data_mskcc['neoantigen']['winners_hla_txt'], checkIfExists: true),
+                ])
+                input[1] = Channel.value([
+                    file(params.test_data_mskcc['neoantigen']['cds'], checkIfExists: true),
+                    file(params.test_data_mskcc['neoantigen']['cdna'], checkIfExists: true)
+                ])
+
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(workflow.out.tsv[0][1],
+                    file(workflow.out.xls[0][1]).name,
+                    workflow.out.mut_fasta[0][1],
+                    workflow.out.wt_fasta[0][1]
+                    ).match()
+                }
+            )
+        }
+    }
+
+    test("netmhcstabandpan - tsv,xls,fa - stub") {
+
+        options "-stub"
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.value([
+                    [ id:'test', single_end:false ], // meta map
+                    file('temp_test_maf'),
+                    file('winners_hla_txt'),
+                ])
+                input[1] = Channel.value([
+                    file('cds'),
+                    file('cdna')
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(workflow.out.tsv[0][1],
+                    file(workflow.out.xls[0][1]).name,
+                    workflow.out.mut_fasta[0][1],
+                    workflow.out.wt_fasta[0][1]
+                    ).match()
+                }
+
+            )
+        }
+    }
+}
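Reviewer note: the tests above drive the subworkflow through nf-test's input[n] mechanism; when wiring it into a pipeline, the two channel shapes documented in the meta.yml are built directly. A hedged sketch (params.maf and params.hla are hypothetical names; params.cds and params.cdna are the real pipeline parameters used later in this diff):

    include { NETMHCSTABANDPAN } from './subworkflows/msk/netmhcstabandpan/main'

    workflow {
        // [ val(meta), path(maf), path(hla) ] - one tuple per sample
        ch_maf_and_hla  = Channel.of( [ [ id:'tumor1' ], file(params.maf), file(params.hla) ] )
        // [ path(cds), path(cdna) ] - reference fastas shared across samples
        ch_cds_and_cdna = Channel.value( [ file(params.cds), file(params.cdna) ] )
        NETMHCSTABANDPAN( ch_maf_and_hla, ch_cds_and_cdna )
        NETMHCSTABANDPAN.out.tsv.view() // [ meta, reformatted netMHC TSV ]
    }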
@@ -0,0 +1,20 @@
+{
+    "netmhcstabandpan - tsv,xls,fa": {
+        "content": [
+            "test_netmHCpanoutput.WT.tsv:md5,a1d7db1b6f116e96457f2fa60660558e",
+            "test.WT.xls",
+            "test.MUT_sequences.fa:md5,7fdb7d3f0fe5a6f439ed294b612c2d70",
+            "test.WT_sequences.fa:md5,7595ed6cf0c98500b00c9ad027125b38"
+        ],
+        "timestamp": "2024-07-30T13:48:55.729458"
+    },
+    "netmhcstabandpan - tsv,xls,fa - stub": {
+        "content": [
+            "test.WT.PAN.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
+            "test.MUT.xls",
+            "test.MUT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e",
+            "test.WT_sequences.fa:md5,d41d8cd98f00b204e9800998ecf8427e"
+        ],
+        "timestamp": "2024-07-30T13:49:11.413783"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/msk/netmhcstabandpan/tests/tags.yml b/subworkflows/msk/netmhcstabandpan/tests/tags.yml
new file mode 100644
index 0000000..257cc06
--- /dev/null
+++ b/subworkflows/msk/netmhcstabandpan/tests/tags.yml
@@ -0,0 +1,2 @@
+subworkflows/netmhcstabandpan:
+  - subworkflows/msk/netmhcstabandpan/**
diff --git a/subworkflows/msk/phylowgs/main.nf b/subworkflows/msk/phylowgs/main.nf
new file mode 100644
index 0000000..f15f272
--- /dev/null
+++ b/subworkflows/msk/phylowgs/main.nf
@@ -0,0 +1,69 @@
+include { PHYLOWGS_CREATEINPUT } from '../../../modules/msk/phylowgs/createinput/main'
+include { PHYLOWGS_PARSECNVS } from '../../../modules/msk/phylowgs/parsecnvs/main'
+include { PHYLOWGS_MULTIEVOLVE } from '../../../modules/msk/phylowgs/multievolve/main'
+include { PHYLOWGS_WRITERESULTS } from '../../../modules/msk/phylowgs/writeresults/main'
+
+workflow PHYLOWGS {
+
+    take:
+    ch_input_maf_and_genelevel
+
+    main:
+
+    ch_versions = Channel.empty()
+
+    ch_genelevel = ch_input_maf_and_genelevel
+        .map{
+            new Tuple(it[0],it[2])
+        }
+
+    ch_maf = ch_input_maf_and_genelevel
+        .map{
+            new Tuple(it[0],it[1])
+        }
+
+    PHYLOWGS_PARSECNVS(ch_genelevel)
+
+    ch_versions = ch_versions.mix(PHYLOWGS_PARSECNVS.out.versions)
+
+    ch_maf_and_cnv = join_maf_with_cnv(ch_maf,PHYLOWGS_PARSECNVS.out.cnv)
+
+    PHYLOWGS_CREATEINPUT(ch_maf_and_cnv)
+
+    ch_versions = ch_versions.mix(PHYLOWGS_CREATEINPUT.out.versions)
+
+    PHYLOWGS_MULTIEVOLVE(PHYLOWGS_CREATEINPUT.out.phylowgsinput)
+
+    ch_versions = ch_versions.mix(PHYLOWGS_MULTIEVOLVE.out.versions)
+
+    PHYLOWGS_WRITERESULTS(PHYLOWGS_MULTIEVOLVE.out.trees)
+
+    ch_versions = ch_versions.mix(PHYLOWGS_WRITERESULTS.out.versions)
+
+
+    emit:
+
+    summ = PHYLOWGS_WRITERESULTS.out.summ // channel: [ val(meta), [ summ ] ]
+    muts = PHYLOWGS_WRITERESULTS.out.muts // channel: [ val(meta), [ muts ] ]
+    mutass = PHYLOWGS_WRITERESULTS.out.mutass // channel: [ val(meta), [ mutass ] ]
+    versions = ch_versions // channel: [ versions.yml ]
+}
+
+def join_maf_with_cnv(maf,cnv) {
+    maf_channel = maf
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    cnv_channel = cnv
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    mergedWithKey = maf_channel
+        .join(cnv_channel)
+    merged = mergedWithKey
+        .map{
+            new Tuple(it[1][0],it[1][1],it[2][1])
+        }
+    return merged
+
+}
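Reviewer note: the PHYLOWGS subworkflow chains parsecnvs -> createinput -> multievolve -> writeresults, joining the maf with the parsed CNVs per sample. A minimal invocation sketch (params.maf and params.facets_genelevel are hypothetical parameter names):

    include { PHYLOWGS } from './subworkflows/msk/phylowgs'

    workflow {
        // [ val(meta), path(maf), path(facets_genelevel) ]
        PHYLOWGS( Channel.of( [ [ id:'tumor1' ], file(params.maf), file(params.facets_genelevel) ] ) )
        PHYLOWGS.out.summ.view() // [ meta, <id>.summ.json.gz ]
    }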
+  - phylowgs/writeresults
input:
+  - ch_input_maf_and_genelevel:
+      type: file
+      description: |
+        The input channel containing the maf and FACETS genelevel files
+        Structure: [ val(meta), path(maf), path(genelevel) ]
+      pattern: "*.{maf,txt}"
+output:
+  - summ:
+      type: file
+      description: Output file for JSON-formatted tree summaries
+      pattern: "*.summ.json.gz"
+  - muts:
+      type: file
+      description: Output file for JSON-formatted list of mutations
+      pattern: "*.muts.json.gz"
+  - mutass:
+      type: file
+      description: Output file for JSON-formatted list of SSMs and CNVs
+      pattern: "*.mutass.zip"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+authors:
+  - "@nikhil"
+maintainers:
+  - "@nikhil"
diff --git a/subworkflows/msk/phylowgs/tests/main.nf.test b/subworkflows/msk/phylowgs/tests/main.nf.test
new file mode 100644
index 0000000..7fa9052
--- /dev/null
+++ b/subworkflows/msk/phylowgs/tests/main.nf.test
@@ -0,0 +1,74 @@
+nextflow_workflow {
+
+    name "Test Subworkflow PHYLOWGS"
+    script "../main.nf"
+    workflow "PHYLOWGS"
+
+    tag "subworkflows"
+    tag "subworkflows_nfcore"
+    tag "subworkflows_msk"
+    tag "subworkflows/phylowgs"
+    tag "phylowgs"
+    tag "phylowgs/parsecnvs"
+    tag "phylowgs/createinput"
+    tag "phylowgs/multievolve"
+    tag "phylowgs/writeresults"
+
+    config "./nextflow.config"
+
+
+    test("phylowgs - gz") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.value([
+                    [ id:'test', single_end:false ], // meta map
+                    file(params.test_data_mskcc['neoantigen']['unfilteredmaf'], checkIfExists: true),
+                    file(params.test_data_mskcc['neoantigen']['facets_hisens_cncf_txt'], checkIfExists: true)
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(file(workflow.out.summ[0][1]).name,
+                    file(workflow.out.muts[0][1]).name,
+                    file(workflow.out.mutass[0][1]).name).match()
+                }
+            )
+        }
+    }
+
+    test("phylowgs - gz - stub") {
+
+        options "-stub"
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.value([
+                    [ id:'test', single_end:false ], // meta map
+                    file('unfilteredmaf'),
+                    file('facets_gene_level_txt')
+                ])
+                """
+            }
+        }
+
+        then {
+            assertAll(
+                { assert workflow.success },
+                { assert snapshot(file(workflow.out.summ[0][1]).name,
+                    file(workflow.out.muts[0][1]).name,
+                    file(workflow.out.mutass[0][1]).name).match()
+                }
+
+            )
+        }
+    }
+
+
+}
diff --git a/subworkflows/msk/phylowgs/tests/main.nf.test.snap b/subworkflows/msk/phylowgs/tests/main.nf.test.snap
new file mode 100644
index 0000000..ff38651
--- /dev/null
+++ b/subworkflows/msk/phylowgs/tests/main.nf.test.snap
@@ -0,0 +1,18 @@
+{
+    "phylowgs - gz": {
+        "content": [
+            "test.summ.json.gz",
+            "test.muts.json.gz",
+            "test.mutass.zip"
+        ],
+        "timestamp": "2024-03-28T19:55:03.638169"
+    },
+    "phylowgs - gz - stub": {
+        "content": [
+            "test.summ.json.gz",
+            "test.muts.json.gz",
+            "test.mutass.zip"
+        ],
+        "timestamp": "2024-03-28T19:55:13.404789"
+    }
+}
\ No newline at end of file
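Reviewer note: these snapshots record file names only, not checksums, most plausibly because the outputs are gzip/zip archives whose headers embed timestamps, so an md5 over the bytes would differ on every run. The assertion pattern, restated from the test above for emphasis (no new behavior):

    then {
        assert snapshot(
            file(workflow.out.summ[0][1]).name // name only; an md5 over the .gz would not be reproducible
        ).match()
    }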
2'
+    }
+}
diff --git a/subworkflows/msk/phylowgs/tests/tags.yml b/subworkflows/msk/phylowgs/tests/tags.yml
new file mode 100644
index 0000000..3a3de52
--- /dev/null
+++ b/subworkflows/msk/phylowgs/tests/tags.yml
@@ -0,0 +1,2 @@
+subworkflows/phylowgs:
+  - subworkflows/msk/phylowgs/**
diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
index a8b55d6..14558c3 100644
--- a/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
+++ b/subworkflows/nf-core/utils_nfcore_pipeline/main.nf
@@ -65,9 +65,15 @@ def checkProfileProvided(nextflow_cli_args) {
 //
 // Citation string for pipeline
 //
 def workflowCitation() {
+    def temp_doi_ref = ""
+    String[] manifest_doi = workflow.manifest.doi.tokenize(",")
+    // Using a loop to handle multiple DOIs
+    // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers
+    // Removing ` ` since the manifest.doi is a string and not a proper list
+    for (String doi_ref: manifest_doi) temp_doi_ref += "    https://doi.org/${doi_ref.replace('https://doi.org/', '').replace(' ', '')}\n"
     return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" +
         "* The pipeline\n" +
-        "  ${workflow.manifest.doi}\n\n" +
+        temp_doi_ref + "\n" +
         "* The nf-core framework\n" +
         "  https://doi.org/10.1038/s41587-020-0439-x\n\n" +
         "* Software dependencies\n" +
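Reviewer note: the loop added to workflowCitation() tokenizes workflow.manifest.doi on commas and strips any https://doi.org/ prefix and stray spaces, so a manifest can mix bare DOIs and full resolver URLs. A sketch of what nextflow.config can now carry (the DOI values below are placeholders):

    manifest {
        name = 'mskcc/neoantigenpipeline'
        // comma-separated string, not a list; bare DOIs and resolver URLs both normalize correctly
        doi  = '10.5281/zenodo.1111111, https://doi.org/10.5281/zenodo.2222222' // placeholder DOIs
    }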
diff --git a/workflows/neoantigenpipeline.nf b/workflows/neoantigenpipeline.nf
index 0c60350..7ee2533 100644
--- a/workflows/neoantigenpipeline.nf
+++ b/workflows/neoantigenpipeline.nf
@@ -4,12 +4,19 @@
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
-include { FASTQC                 } from '../modules/nf-core/fastqc/main'
-include { MULTIQC                } from '../modules/nf-core/multiqc/main'
 include { paramsSummaryMap       } from 'plugin/nf-validation'
 include { paramsSummaryMultiqc   } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
 include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_neoantigenpipeline_pipeline'
+include { PHYLOWGS_CREATEINPUT } from '../modules/msk/phylowgs/createinput/main'
+include { PHYLOWGS_MULTIEVOLVE } from '../modules/msk/phylowgs/multievolve/main'
+include { PHYLOWGS_PARSECNVS } from '../modules/msk/phylowgs/parsecnvs/main'
+include { PHYLOWGS_WRITERESULTS } from '../modules/msk/phylowgs/writeresults/main'
+include { PHYLOWGS } from '../subworkflows/msk/phylowgs'
+include { NETMHCSTABANDPAN } from '../subworkflows/msk/netmhcstabandpan/main'
+include { NETMHCPAN } from '../modules/msk/netmhcpan/main'
+include { NEOANTIGENUTILS_NEOANTIGENINPUT } from '../modules/msk/neoantigenutils/neoantigeninput'
+include { NEOANTIGEN_EDITING } from '../subworkflows/msk/neoantigen_editing'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -20,21 +27,70 @@ workflow NEOANTIGENPIPELINE {
 
     take:
-    ch_samplesheet // channel: samplesheet read in from --input
+    ch_samplesheet // channel: samplesheet read in from --input; rows carry the maf, the polysolver HLA file, and the FACETS gene-level file
+
 
     main:
     ch_versions = Channel.empty()
-    ch_multiqc_files = Channel.empty()
 
-    //
-    // MODULE: Run FastQC
-    //
-    FASTQC (
-        ch_samplesheet
-    )
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]})
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
+    ch_cds_and_cdna = Channel.value([file(params.cds), file(params.cdna)])
+
+    ch_samplesheet.map {
+        meta, maf, facets_hisens_cncf, hla_file ->
+        [meta, maf, hla_file]
+
+    }
+    .set { netMHCpan_input_ch }
+
+
+    ch_samplesheet.map {
+        meta, maf, facets_hisens_cncf, hla_file ->
+        [meta, maf, facets_hisens_cncf]
+
+    }
+    .set { phylowgs_input_ch }
+
+    // phylowgs workflow
+    PHYLOWGS(phylowgs_input_ch)
+
+    ch_versions = ch_versions.mix(PHYLOWGS.out.versions)
+
+    NETMHCSTABANDPAN(netMHCpan_input_ch, ch_cds_and_cdna)
+
+    ch_versions = ch_versions.mix(NETMHCSTABANDPAN.out.versions)
+
+    netMHCpanMut = NETMHCSTABANDPAN.out.tsv
+        .filter{ it[0].typeMut == true && it[0].fromStab == false }
+    netMHCpanWT = NETMHCSTABANDPAN.out.tsv
+        .filter{ it[0].typeMut == false && it[0].fromStab == false }
+    stabNetMHCpanMut = NETMHCSTABANDPAN.out.tsv
+        .filter{ it[0].typeMut == true && it[0].fromStab == true }
+    stabNetMHCpanWT = NETMHCSTABANDPAN.out.tsv
+        .filter{ it[0].typeMut == false && it[0].fromStab == true }
+
+    merged = merge_for_input_generation(netMHCpan_input_ch, PHYLOWGS.out.summ, PHYLOWGS.out.muts, PHYLOWGS.out.mutass, netMHCpanMut, netMHCpanWT)
+
+    merged_netMHC_input = merged
+        .map{
+            new Tuple(it[0], it[1], it[2])
+        }
+    merged_phylo_output = merged
+        .map{
+            new Tuple(it[0], it[3], it[4], it[5])
+        }
+    merged_netmhc_tsv = merged
+        .map{
+            new Tuple(it[0], it[6], it[7])
+        }
+
+    NEOANTIGENUTILS_NEOANTIGENINPUT(merged_netMHC_input, merged_phylo_output, merged_netmhc_tsv)
+
+    ch_versions = ch_versions.mix(NEOANTIGENUTILS_NEOANTIGENINPUT.out.versions)
+
+    NEOANTIGEN_EDITING(NEOANTIGENUTILS_NEOANTIGENINPUT.out.json, file(params.iedbfasta))
+
+    ch_versions = ch_versions.mix(NEOANTIGEN_EDITING.out.versions)
 
     //
     // Collate and save software versions
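Reviewer note: the four .filter blocks above assume the upstream formatting step tags each meta map with typeMut and fromStab booleans (MUT vs WT results, netMHCstabpan vs netMHCpan origin). A toy illustration of the pattern, with hypothetical file names:

    workflow {
        Channel.of(
                [ [ id:'t1', typeMut:true,  fromStab:false ], file('t1.MUT.tsv') ],
                [ [ id:'t1', typeMut:false, fromStab:true  ], file('t1.WT.stab.tsv') ]
            )
            .filter { it[0].typeMut == true && it[0].fromStab == false }
            .view() // keeps only the mutant netMHCpan result
    }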
@@ -43,30 +99,48 @@ workflow NEOANTIGENPIPELINE {
         .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true)
         .set { ch_collated_versions }
 
-    //
-    // MODULE: MultiQC
-    //
-    ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true)
-    ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath(params.multiqc_config, checkIfExists: true) : Channel.empty()
-    ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath(params.multiqc_logo, checkIfExists: true) : Channel.empty()
-    summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json")
-    ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params))
-    ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
-    ch_methods_description = Channel.value(methodsDescriptionText(ch_multiqc_custom_methods_description))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions)
-    ch_multiqc_files = ch_multiqc_files.mix(ch_methods_description.collectFile(name: 'methods_description_mqc.yaml', sort: false))
-
-    MULTIQC (
-        ch_multiqc_files.collect(),
-        ch_multiqc_config.toList(),
-        ch_multiqc_custom_config.toList(),
-        ch_multiqc_logo.toList()
-    )
+
     emit:
-    multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
-    versions = ch_versions // channel: [ path(versions.yml) ]
+    versions = ch_versions // channel: [ path(versions.yml) ]
+    neo_out = NEOANTIGEN_EDITING.out.annotated_output
+}
+
+def merge_for_input_generation(netMHCpan_input_ch, summ_ch, muts_ch, mutass_ch, netmhcpan_mut_tsv_ch, netmhcpan_wt_tsv_ch) {
+    netMHCpan_input = netMHCpan_input_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    summ = summ_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    muts = muts_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    mutass = mutass_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    netmhcpan_mut_tsv = netmhcpan_mut_tsv_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    netmhcpan_wt_tsv = netmhcpan_wt_tsv_ch
+        .map{
+            new Tuple(it[0].id,it)
+        }
+    merged = netMHCpan_input
+        .join(summ)
+        .join(muts)
+        .join(mutass)
+        .join(netmhcpan_mut_tsv)
+        .join(netmhcpan_wt_tsv)
+        .map{
+            new Tuple(it[1][0], it[1][1], it[1][2], it[2][1], it[3][1], it[4][1], it[5][1], it[6][1])
+        }
+    return merged
 }
 
 /*
@@ -74,3 +148,4 @@ workflow NEOANTIGENPIPELINE {
     THE END
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
+
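Reviewer note: with FastQC/MultiQC removed, the rewired entry workflow consumes mutation, copy-number, and HLA inputs plus three reference files. A sketch of the corresponding params block; the parameter names come from the code above, while the paths are placeholders:

    params {
        input     = 'samplesheet.csv'      // rows provide: meta, maf, facets_hisens_cncf, hla_file
        cds       = '/refs/cds.all.fa.gz'  // placeholder path
        cdna      = '/refs/cdna.all.fa.gz' // placeholder path
        iedbfasta = '/refs/iedb.fasta'     // placeholder path
        outdir    = 'results'
    }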