From 04c41ff5079cea2caa11b3c6d8bddb8c9132b4f2 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Tue, 17 Mar 2020 20:00:38 +0100 Subject: [PATCH 01/12] adding msisensor tool (see nf-core#95) - now right branch --- environment.yml | 1 + main.nf | 61 ++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index dc427c9861..8b0615ce11 100644 --- a/environment.yml +++ b/environment.yml @@ -21,6 +21,7 @@ dependencies: - bioconda::genesplicer=1.0 - bioconda::htslib=1.9 - bioconda::manta=1.6.0 + - bioconda::msisensor=0.5 - bioconda::multiqc=1.8 - bioconda::qualimap=2.2.2d - bioconda::samtools=1.9 diff --git a/main.nf b/main.nf index 788301b250..93830bd740 100644 --- a/main.nf +++ b/main.nf @@ -618,6 +618,7 @@ process Get_software_versions { trim_galore -v &> v_trim_galore.txt 2>&1 || true vcftools --version &> v_vcftools.txt 2>&1 || true vep --help &> v_vep.txt 2>&1 || true + msisensor &> v_msisensor.txt 2>&1 || true scrape_software_versions.py &> software_versions_mqc.yaml """ @@ -2070,7 +2071,7 @@ pairBam = bamNormal.cross(bamTumor).map { pairBam = pairBam.dump(tag:'BAM Somatic Pair') // Manta, Strelka, Mutect2 -(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamTNscope, pairBam) = pairBam.into(7) +(pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamTNscope, pairBamMsisensor, pairBam) = pairBam.into(8) intervalPairBam = pairBam.spread(bedIntervals) @@ -2605,6 +2606,64 @@ process StrelkaBP { vcfStrelkaBP = vcfStrelkaBP.dump(tag:'Strelka BP') +// STEP MSISENSOR.1 - SCAN + +// Scan reference genome for microsatellites +process msisensorScan { + label 'cpus_1' + label 'memory_max' + // memory '20 GB' + + tag {fasta} + + input: + file(fasta) from ch_fasta + file(fastaFai) from ch_fastaFai + + output: + file "microsatellites.list" into msi_scan_ch + + when: 'msisensor' in
tools + + script: + """ + msisensor scan -d ${fasta} -o microsatellites.list + """ +} + +// STEP MSISENSOR.2 - SCORE + +// Score the normal vs somatic pair of bams + +process msisensor { + label 'cpus_4' + label 'memory_max' + // memory '10 GB' + + tag {idSampleTumor + "_vs_" + idSampleNormal} + + publishDir "${params.outdir}/MSI/${idSampleTumor}_vs_${idSampleNormal}/msisensor", mode: params.publishDirMode + + input: + set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor + file msiSites from msi_scan_ch + + output: + set sampleId, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch + + when: + when: 'msisensor' in tools + + script: + """ + msisensor msi -d ${msiSites} \ + -b 4 \ + -n ${bamNormal} \ + -t ${bamTumor} \ + -o ${idSampleTumor}_vs_${idSampleNormal}_msisensor + """ +} + // STEP ASCAT.1 - ALLELECOUNTER // Run commands and code from Malin Larsson From 3c84bce678bb44fce63b9d040d90873ff9a63b71 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Wed, 18 Mar 2020 19:57:08 +0100 Subject: [PATCH 02/12] debugging, now it runs --- main.nf | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/main.nf b/main.nf index 93830bd740..7ced59e784 100644 --- a/main.nf +++ b/main.nf @@ -2618,7 +2618,7 @@ process msisensorScan { input: file(fasta) from ch_fasta - file(fastaFai) from ch_fastaFai + file(fastaFai) from ch_fai output: file "microsatellites.list" into msi_scan_ch @@ -2649,7 +2649,7 @@ process msisensor { file msiSites from msi_scan_ch output: - set sampleId, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), 
file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch + set val("Msisensor"), idPatient, file("${idSampleTumor}_vs_${idSampleNormal}_msisensor"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_dis"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_germline"), file("${idSampleTumor}_vs_${idSampleNormal}_msisensor_somatic") into msisensor_out_ch when: when: 'msisensor' in tools @@ -3654,7 +3654,8 @@ def defineToolList() { 'strelka', 'tiddit', 'tnscope', - 'vep' + 'vep', + 'msisensor' ] } From b484cfe2c8cad0090f3dd814438f53546d0f6e67 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 13:54:44 +0100 Subject: [PATCH 03/12] Update main.nf Co-Authored-By: Maxime Garcia --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 7ced59e784..e8bf9120fa 100644 --- a/main.nf +++ b/main.nf @@ -2070,7 +2070,7 @@ pairBam = bamNormal.cross(bamTumor).map { pairBam = pairBam.dump(tag:'BAM Somatic Pair') -// Manta, Strelka, Mutect2 +// Manta, Strelka, Mutect2, MSIsensor (pairBamManta, pairBamStrelka, pairBamStrelkaBP, pairBamCalculateContamination, pairBamFilterMutect2, pairBamTNscope, pairBamMsisensor, pairBam) = pairBam.into(8) intervalPairBam = pairBam.spread(bedIntervals) From 9afaa94e345c6ed0c824919b9ebb7e61f050fe09 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 13:56:19 +0100 Subject: [PATCH 04/12] update tests as suggested in PR --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 26f6c8cf7f..71d634521f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -131,7 +131,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - tool: [Haplotypecaller, Freebayes, Manta, mpileup, Strelka, TIDDIT] + tool: [Haplotypecaller, Freebayes, Manta, mpileup, Strelka, TIDDIT, msisensor] steps: - uses: actions/checkout@v2 - name: Install Nextflow From 
41154f551aacaf2ddec2797b70839b0aad087dd4 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 14:06:57 +0100 Subject: [PATCH 05/12] adding container docs and changelog --- CHANGELOG.md | 1 + docs/containers.md | 1 + 2 files changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d678bf7abb..8c9e400562 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) a - [#117](https://github.com/nf-core/sarek/pull/117) - Add `Trim Galore` possibilities to Sarek - [#141](https://github.com/nf-core/sarek/pull/141) - Add containers for `WBcel235` - [#150](https://github.com/nf-core/sarek/pull/150), [#151](https://github.com/nf-core/sarek/pull/151), [#154](https://github.com/nf-core/sarek/pull/154) - Add AWS mega test GitHub Actions +- [#163](https://github.com/nf-core/sarek/pull/163) - Add [msisensor](https://github.com/ding-lab/msisensor) in tools and container ### `Changed` diff --git a/docs/containers.md b/docs/containers.md index 01ba9476f3..0b7d5f4761 100644 --- a/docs/containers.md +++ b/docs/containers.md @@ -25,6 +25,7 @@ For annotation, the main container can be used, but the cache has to be download - Contain **[GeneSplicer](https://ccb.jhu.edu/software/genesplicer/)** 1.0 - Contain **[HTSlib](https://github.com/samtools/htslib)** 1.9 - Contain **[Manta](https://github.com/Illumina/manta)** 1.6.0 +- Contain **[msisensor](https://github.com/ding-lab/msisensor)** 0.5 - Contain **[MultiQC](https://github.com/ewels/MultiQC/)** 1.8 - Contain **[Qualimap](http://qualimap.bioinfo.cipf.es)** 2.2.2d - Contain **[samtools](https://github.com/samtools/samtools)** 1.9 From e0af15e9215160e7331968c8aade74c12c0b5e3b Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 16:38:23 +0100 Subject: [PATCH 06/12] update scrape software versions --- bin/scrape_software_versions.py | 2 ++ 1 file changed, 2 insertions(+) diff --git 
a/bin/scrape_software_versions.py b/bin/scrape_software_versions.py index ba53181640..44580887f0 100755 --- a/bin/scrape_software_versions.py +++ b/bin/scrape_software_versions.py @@ -13,6 +13,7 @@ 'GATK': ['v_gatk.txt', r"Version:(\S+)"], 'htslib': ['v_samtools.txt', r"htslib (\S+)"], 'Manta': ['v_manta.txt', r"([0-9.]+)"], + 'msisensor': ["v_msisensor.txt", r"Version: v(\S+)"], 'MultiQC': ['v_multiqc.txt', r"multiqc, version (\S+)"], 'Nextflow': ['v_nextflow.txt', r"(\S+)"], 'nf-core/sarek': ['v_pipeline.txt', r"(\S+)"], @@ -38,6 +39,7 @@ results['GATK'] = 'N/A' results['htslib'] = 'N/A' results['Manta'] = 'N/A' +results['msisensor'] = 'N/A' results['MultiQC'] = 'N/A' results['Qualimap'] = 'N/A' results['R'] = 'N/A' From 46d55b99fae4b5a49efc9cc3ded6dcb644afd262 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 17:28:43 +0100 Subject: [PATCH 07/12] adding contributor --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 4fa84f9aea..5b561495d5 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,7 @@ Helpful contributors: * [gulfshores](https://github.com/gulfshores) * [pallolason](https://github.com/pallolason) * [silviamorins](https://github.com/silviamorins) +* [David Mas-Ponte](https://github.com/davidmasp) ## Contributions & Support From c1af38b97335644aaec2f2184b6463ac2d87f149 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 23:39:23 +0100 Subject: [PATCH 08/12] adding output info in docs for msisensor --- docs/output.md | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/docs/output.md b/docs/output.md index 493957cf9f..fcb8ec997c 100644 --- a/docs/output.md +++ b/docs/output.md @@ -34,6 +34,8 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [ConvertAlleleCounts](#convertallelecounts) - [ASCAT](#ascat) - [Control-FREEC](#control-freec) + - [MSI status](#msi-status) + - [msisensor](#msisensor) - [Variant 
annotation](#variant-annotation) - [snpEff](#snpeff) - [VEP](#vep) @@ -424,6 +426,37 @@ For a Tumor/Normal pair only: - `[TUMORSAMPLE].pileup.gz_BAF.txt` and `[NORMALSAMPLE].pileup.gz_BAF.txt` - file with beta allele frequencies for each possibly heterozygous SNP position + +### MSI status + +[Microsatellite instability](https://en.wikipedia.org/wiki/Microsatellite_instability) +is a genetic condition associated to deficiencies in the +mismatch repair (MMR) system which causes a tendency to accumulate a high +number of mutations (SNVs and indels). + +#### msisensor + +[msisensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI +status of a tumor scanning the length of the microsatellite regions. An altered +distribution of microsatellite length is associated to a missed replication +slippage which would be corrected under normal MMR conditions. It requires +a normal sample for each tumour to differentiate the somatic and germline +cases. + +For further reading see the [msisensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). + +For a Tumor/Normal pair only: +**Output directory: `results/MSI/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/msisensor`** + +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor + - MSI score output, contains information about the number of somatic sites. +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_dis + - The normal and tumor length distribution for each microsatellite position. +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_germline + - germline sites detected +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_somatic + - somatic sites detected + ## Variant annotation This directory contains results from the final annotation steps: two software are used for annotation, [snpEff](http://snpeff.sourceforge.net/) and [VEP](https://www.ensembl.org/info/docs/tools/vep/index.html).
@@ -472,6 +505,13 @@ For all samples: - `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi` - VCF with Tabix index + +## MSI status + + +All the results regarding the MSI status of the samples are + + ## QC and reporting ### QC From d8a5cf250cf16df7437a1086c06b7d0412b716f0 Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 23:46:14 +0100 Subject: [PATCH 09/12] ups sorry, now it should lint well --- docs/output.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/docs/output.md b/docs/output.md index fcb8ec997c..97fa16e1b9 100644 --- a/docs/output.md +++ b/docs/output.md @@ -426,11 +426,10 @@ For a Tumor/Normal pair only: - `[TUMORSAMPLE].pileup.gz_BAF.txt` and `[NORMALSAMPLE].pileup.gz_BAF.txt` - file with beta allele frequencies for each possibly heterozygous SNP position - ### MSI status [Microsatellite instability](https://en.wikipedia.org/wiki/Microsatellite_instability) -is a genetic condition associated to deficiencies in the +is a genetic condition associated to deficiencies in the mismatch repair (MMR) system which causes a tendency to accumulate a high number of mutations (SNVs and indels). @@ -439,7 +438,7 @@ number of mutations (SNVs and indels). [msisensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI status of a tumor scanning the length of the microsatellite regions. An altered distribution of microsatellite length is associated to a missed replication -slippage which would be corrected under normal MMR conditions. It requires +slippage which would be corrected under normal MMR conditions. It requires a normal sample for each tumour to differentiate the somatic and germline cases.
@@ -505,13 +504,6 @@ For all samples: - `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi` - VCF with Tabix index - -## MSI status - - -All the results regarding the MSI status of the samples are - - ## QC and reporting ### QC From 79a4d77673c8924001ce1d802bed5f05dc87ebbe Mon Sep 17 00:00:00 2001 From: David Mas-Ponte Date: Thu, 19 Mar 2020 23:46:14 +0100 Subject: [PATCH 10/12] update some markdown typos --- docs/output.md | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/output.md b/docs/output.md index fcb8ec997c..eabc4fd7e8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -426,11 +426,10 @@ For a Tumor/Normal pair only: - `[TUMORSAMPLE].pileup.gz_BAF.txt` and `[NORMALSAMPLE].pileup.gz_BAF.txt` - file with beta allele frequencies for each possibly heterozygous SNP position - ### MSI status [Microsatellite instability](https://en.wikipedia.org/wiki/Microsatellite_instability) -is a genetic condition associated to deficiencies in the +is a genetic condition associated to deficiencies in the mismatch repair (MMR) system which causes a tendency to accumulate a high number of mutations (SNVs and indels). @@ -439,7 +438,7 @@ number of mutations (SNVs and indels). [msisensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI status of a tumor scanning the length of the microsatellite regions. An altered distribution of microsatellite length is associated to a missed replication -slippage which would be corrected under normal MMR conditions. It requires +slippage which would be corrected under normal MMR conditions. It requires a normal sample for each tumour to differentiate the somatic and germline cases.
@@ -448,13 +447,13 @@ For further reading see the [msisensor paper](https://www.ncbi.nlm.nih.gov/pubme For a Tumor/Normal pair only: **Output directory: `results/MSI/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/msisensor`** -- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor - MSI score output, contains information about the number of somatic sites. -- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_dis +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_dis - The normal and tumor length distribution for each microsatellite position. -- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_germline +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_germline - germline sites detected -- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]_msisensor_somatic +- `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor_somatic - somatic sites detected ## Variant annotation @@ -505,13 +504,6 @@ For all samples: - `VariantCaller_Sample_VEP.ann.vcf.gz` and `VariantCaller_Sample_VEP.ann.vcf.gz.tbi` - VCF with Tabix index - -## MSI status - - -All the results regarding the MSI status of the samples are - - ## QC and reporting ### QC From bd5c309f4226de311e9f7a7a95148783cb02e2b2 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 20 Mar 2020 13:04:03 +0100 Subject: [PATCH 11/12] Apply suggestions from code review --- docs/output.md | 12 ++++++------ main.nf | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/output.md b/docs/output.md index eabc4fd7e8..e999693a68 100644 --- a/docs/output.md +++ b/docs/output.md @@ -35,7 +35,7 @@ The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes d - [ASCAT](#ascat) - [Control-FREEC](#control-freec) - [MSI status](#msi-status) - - [msisensor](#msisensor) + - [MSIsensor](#msisensor) - [Variant annotation](#variant-annotation) - [snpEff](#snpeff) - [VEP](#vep) @@ -433,19 +433,19 @@ is a genetic condition associated to deficiencies in the mismatch repair (MMR) system which causes a tendency to accumulate a high number of
mutations (SNVs and indels). -#### msisensor +#### MSIsensor -[msisensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI +[MSIsensor](https://github.com/ding-lab/msisensor) is a tool to detect the MSI status of a tumor scanning the length of the microsatellite regions. An altered distribution of microsatellite length is associated to a missed replication -slippage which would be corrected under normal MMR conditions. It requires +slippage which would be corrected under normal mismatch repair (MMR) conditions. It requires a normal sample for each tumour to differentiate the somatic and germline cases. -For further reading see the [msisensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). +For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). For a Tumor/Normal pair only: -**Output directory: `results/MSI/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/msisensor`** +**Output directory: `results/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/MSIsensor`** - `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor - MSI score output, contains information about the number of somatic sites.
diff --git a/main.nf b/main.nf index e8bf9120fa..ac51ceba00 100644 --- a/main.nf +++ b/main.nf @@ -2642,7 +2642,7 @@ process msisensor { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outdir}/MSI/${idSampleTumor}_vs_${idSampleNormal}/msisensor", mode: params.publishDirMode + publishDir "${params.outdir}/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor From e6c693f44daf1b2fdb51054a37d77b32d7440169 Mon Sep 17 00:00:00 2001 From: Maxime Garcia Date: Fri, 20 Mar 2020 13:21:56 +0100 Subject: [PATCH 12/12] Apply suggestions from code review --- docs/output.md | 2 +- main.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/output.md b/docs/output.md index e999693a68..bc6e6beee4 100644 --- a/docs/output.md +++ b/docs/output.md @@ -445,7 +445,7 @@ cases. For further reading see the [MSIsensor paper](https://www.ncbi.nlm.nih.gov/pubmed/24371154). For a Tumor/Normal pair only: -**Output directory: `results/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/MSIsensor`** +**Output directory: `results/VariantCalling/[TUMORSAMPLE]_vs_[NORMALSAMPLE]/MSIsensor`** - `[TUMORSAMPLE]_vs_[NORMALSAMPLE]`_msisensor - MSI score output, contains information about the number of somatic sites. diff --git a/main.nf b/main.nf index ac51ceba00..3e542da0b1 100644 --- a/main.nf +++ b/main.nf @@ -2642,7 +2642,7 @@ process msisensor { tag {idSampleTumor + "_vs_" + idSampleNormal} - publishDir "${params.outdir}/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publishDirMode + publishDir "${params.outdir}/VariantCalling/${idSampleTumor}_vs_${idSampleNormal}/MSIsensor", mode: params.publishDirMode input: set idPatient, idSampleNormal, file(bamNormal), file(baiNormal), idSampleTumor, file(bamTumor), file(baiTumor) from pairBamMsisensor