nf-core · jonasscheid · May 29, 2024 · May 14, 2024 · May 14, 2024 · May 14, 2024
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -4,6 +4,7 @@ on:
   push:
     branches:
       - dev
+      - master
   pull_request:
   release:
     types: [published]
@@ -74,3 +75,37 @@ jobs:
       - name: Run pipeline with profile ${{ matrix.tests }}
         run: |
           nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.tests }},docker --max_memory '6.GB' --max_cpus 2 --spectrum_batch_size 5000 --outdir ./results
+
+  # Second job, run only on pushes to master, with additional tests: test_timstof and test_full
+  profile_main:
+    name: Run profile tests and additional full tests
+    if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' && github.repository == 'nf-core/mhcquant' }}
+    runs-on: ubuntu-latest
+    env:
+      NXF_VER: ${{ matrix.NXF_VER }}
+      NXF_ANSI_LOG: false
+    strategy:
+      matrix:
+        include:
+          # Test pipeline minimum Nextflow version
+          - NXF_VER: "23.04.0"
+            NXF_EDGE: ""
+          # Test latest edge release of Nextflow
+          - NXF_VER: ""
+            NXF_EDGE: "1"
+        tests: ["test_timstof", "test_full"]
+    steps:
+      - name: Check out pipeline code
+        uses: actions/checkout@v4
+      - name: Install Nextflow
+        env:
+          NXF_VER: ${{ matrix.NXF_VER }}
+          # Uncomment only if the edge release is more recent than the latest stable release
+          # See https://github.com/nextflow-io/nextflow/issues/2467
+          # NXF_EDGE: ${{ matrix.NXF_EDGE }}
+        run: |
+          wget -qO- get.nextflow.io | bash
+          sudo mv nextflow /usr/local/bin/
+      - name: Run pipeline with profile ${{ matrix.tests }}
+        run: |
+          nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.tests }},docker --max_memory '6.GB' --max_cpus 2 --spectrum_batch_size 5000 --outdir ./results
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ### `Added`
 
 - Added MS²Rescore module with the underlying python CLI [#293](https://github.com/nf-core/mhcquant/pull/293)
+- Added support for additional input formats, including archives: `d|d.tar.gz|d.tar|d.zip|mzML.gz|raw|RAW|mzML` [#323](https://github.com/nf-core/mhcquant/pull/323)
+- Added test for timsTOF data [#323](https://github.com/nf-core/mhcquant/pull/323)
 
 ### `Fixed`
 

diff --git a/README.md b/README.md
@@ -23,7 +23,7 @@
 
 It was specifically designed to analyse immunopeptidomics data, which deals with the analysis of affinity purified, unspecifically cleaved peptides that have recently been discussed intensively in [the context of cancer vaccines](https://www.nature.com/articles/ncomms13404).
 
-The workflow is based on the OpenMS C++ framework for computational mass spectrometry. RAW files (mzML) serve as inputs and a database search (Comet) is performed based on a given input protein database. FDR rescoring is applied using Percolator based on a competitive target-decoy approach (reversed decoys). For label free quantification all input files undergo identification based retention time alignment (MapAlignerIdentification), and targeted feature extraction matching ids between runs (FeatureFinderIdentification). In addition, a variant calling file (vcf) can be specified to translate variants into proteins that will be included in the database search and binding predictions on specified alleles (alleles.tsv) using MHCFlurry (Class 1) or MHCNugget (Class 2) can be directly run on the output peptide lists. Moreover, if a vcf file was specified, neoepitopes will automatically be determined and binding predictions can also directly be predicted for them.
+The workflow is based on the OpenMS C++ framework for computational mass spectrometry. RAW files (mzML) serve as inputs and a database search (Comet) is performed based on a given input protein database. FDR rescoring is applied using Percolator based on a competitive target-decoy approach (reversed decoys). For label-free quantification, all input files undergo identification-based retention time alignment (MapAlignerIdentification) and targeted feature extraction matching IDs between runs (FeatureFinderIdentification).
 
 The pipeline is built using [Nextflow](https://www.nextflow.io), a workflow tool to run tasks across multiple compute infrastructures in a very portable manner. It uses Docker/Singularity containers making installation trivial and results highly reproducible. The [Nextflow DSL2](https://www.nextflow.io/docs/latest/dsl2.html) implementation of this pipeline uses one container per process which makes it much easier to maintain and update software dependencies. Where possible, these processes have been submitted to and installed from [nf-core/modules](https://github.com/nf-core/modules) in order to make them available to all nf-core pipelines, and to everyone within the Nextflow community!
 

diff --git a/conf/modules.config b/conf/modules.config
@@ -29,8 +29,32 @@ process {
 process {
 
     withName: 'THERMORAWFILEPARSER' {
-        ext.args = "-f 2"
+        ext.args   = "-f 2"
         ext.prefix = {"${raw.baseName}"}
+        publishDir = [
+            path: {"${params.outdir}"},
+            mode: params.publish_dir_mode,
+            enabled: false
+        ]
+    }
+
+    withName: 'UNTAR' {
+        publishDir  = [
+            path: {"${params.outdir}"},
+            mode: params.publish_dir_mode,
+            enabled: false // publishing is switched off for this process
+        ]
+    }
+
+    withName: 'UNZIP' {
+        publishDir  = [
+            path: {"${params.outdir}"},
+            mode: params.publish_dir_mode,
+            enabled: false // publishing is switched off for this process
+        ]
+    }
+
+    withName: 'GUNZIP' {
         publishDir  = [
             path: {"${params.outdir}"},
             mode: params.publish_dir_mode,

diff --git a/conf/test_ionannotator.config b/conf/test_ionannotator.config
@@ -20,8 +20,8 @@ params {
     max_time   = '6.h'
 
     // Input data
-    fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/UP000005640_9606.fasta'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/HepG2_sample_sheet.tsv'
+    input = params.pipelines_testdata_base_path + 'mhcquant/testdata/HepG2_sample_sheet.tsv'
+    fasta = params.pipelines_testdata_base_path + 'mhcquant/testdata/UP000005640_9606.fasta'
 
     // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full)
     skip_quantification = true

diff --git a/conf/test_mokapot.config b/conf/test_mokapot.config
@@ -20,8 +20,8 @@ params {
     max_time   = '6.h'
 
     // Input data
-    fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/UP000005640_9606.fasta'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/HepG2_sample_sheet.tsv'
+    input = params.pipelines_testdata_base_path + 'mhcquant/testdata/HepG2_sample_sheet.tsv'
+    fasta = params.pipelines_testdata_base_path + 'mhcquant/testdata/UP000005640_9606.fasta'
 
     // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full)
     skip_quantification = true

diff --git a/conf/test_percolator.config b/conf/test_percolator.config
@@ -20,8 +20,8 @@ params {
     max_time   = '6.h'
 
     // Input data
-    fasta = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/UP000005640_9606.fasta'
-    input = 'https://raw.githubusercontent.com/nf-core/test-datasets/mhcquant/testdata/HepG2_sample_sheet.tsv'
+    input = params.pipelines_testdata_base_path + 'mhcquant/testdata/HepG2_sample_sheet.tsv'
+    fasta = params.pipelines_testdata_base_path + 'mhcquant/testdata/UP000005640_9606.fasta'
 
     // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full)
     skip_quantification = true

diff --git a/conf/test_timstof.config b/conf/test_timstof.config
@@ -0,0 +1,39 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Nextflow config file for running minimal tests on timsTOF data with MS²Rescore and Percolator
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Defines input files and everything required to run a fast and simple pipeline test.
+
+    Use as follows:
+        nextflow run nf-core/mhcquant -profile test_timstof,<docker/singularity> --outdir <OUTDIR>
+
+----------------------------------------------------------------------------------------
+*/
+
+params {
+    config_profile_name        = 'Test timsTOF profile'
+    config_profile_description = 'Minimal test dataset to check pipeline function with timsTOF data'
+
+    // Limit resources so that this can run on GitHub Actions
+    max_cpus   = 2
+    max_memory = '6.GB'
+    max_time   = '6.h'
+
+    // Input data (sample sheet and FASTA), resolved relative to params.pipelines_testdata_base_path
+    input = params.pipelines_testdata_base_path + 'mhcquant/testdata/sample_sheet_timstof.tsv'
+    fasta = params.pipelines_testdata_base_path + 'mhcquant/testdata/UP000005640_9606.fasta'
+
+    // Don't do quantification since this step needs a larger test dataset (-> test quantification using test_full)
+    skip_quantification = true
+
+    // Search settings used for the timsTOF test data
+    activation_method        = 'CID'
+    prec_charge              = '1:4'
+    precursor_mass_tolerance = 20
+    fragment_mass_tolerance  = 0.01
+    spectrum_batch_size      = 1000
+
+    // MS²Rescore settings: ms2pip feature generator with the timsTOF model
+    feature_generators       = 'ms2pip'
+    ms2pip_model             = 'timsTOF'
+}
diff --git a/modules.json b/modules.json
@@ -5,6 +5,11 @@
         "https://github.com/nf-core/modules.git": {
             "modules": {
                 "nf-core": {
+                    "gunzip": {
+                        "branch": "master",
+                        "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208",
+                        "installed_by": ["modules"]
+                    },
                     "multiqc": {
                         "branch": "master",
                         "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a",

diff --git a/modules/local/untar/environment.yml b/modules/local/untar/environment.yml
@@ -0,0 +1,11 @@
+name: untar  # conda environment referenced by modules/local/untar/main.nf
+
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+
+dependencies:
+  - conda-forge::grep=3.11  # used to find the `.d/` entry in the archive listing
+  - conda-forge::sed=4.7  # used to parse the `tar --version` output for versions.yml
+  - conda-forge::tar=1.34  # lists and extracts the archive
diff --git a/modules/local/untar/main.nf b/modules/local/untar/main.nf
@@ -0,0 +1,54 @@
+// Unpack an (optionally compressed) tar archive that contains a `.d` directory.
+// The output glob is `*.d`, so the extraction directory name (`prefix`) has to end in `.d`;
+// the default prefix satisfies this for archives named `<name>.d.tar` or `<name>.d.tar.gz`.
+process UNTAR {
+    tag "$archive"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/ubuntu:20.04' :
+        'nf-core/ubuntu:20.04' }"
+
+    input:
+    tuple val(meta), path(archive)
+
+    output:
+    tuple val(meta), path("*.d"), emit: untar
+    path "versions.yml"         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args   = task.ext.args ?: ''
+    def args2  = task.ext.args2 ?: ''
+    // Default prefix: archive file name without its `.tar` / `.tar.gz` suffix
+    def prefix = task.ext.prefix ?: archive.baseName.replaceAll(/\.tar(\.gz)?$/, '')
+
+    // `depth` counts the '/' characters of the first archive entry ending in `.d/`; that many
+    // leading path components are stripped so the contents of the `.d` directory land directly in `prefix`.
+    """
+    mkdir "$prefix"
+    depth=\$(tar -tf "${archive}" | grep '\\.d/\$' | head -n 1 | tr -cd '/' | wc -c)
+
+    tar \\
+        -C "$prefix" \\
+        -xavf \\
+        $args \\
+        "$archive" \\
+        --strip-components=\$depth \\
+        $args2
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+
+    stub:
+    // Same prefix derivation as the script block, so the stub directory matches the `*.d` output glob
+    def prefix = task.ext.prefix ?: archive.baseName.replaceAll(/\.tar(\.gz)?$/, '')
+    """
+    mkdir "$prefix"
+    touch "${prefix}/file.txt"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/untar/meta.yml b/modules/local/untar/meta.yml
@@ -0,0 +1,46 @@
+name: untar
+description: Extract a `.d` directory from a tar archive.
+keywords:
+  - untar
+  - uncompress
+  - extract
+tools:
+  - untar:
+      description: |
+        Extract tar and tar.gz files.
+      documentation: https://www.gnu.org/software/tar/manual/
+      licence: ["GPL-3.0-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - archive:
+      type: file
+      description: Archive to be extracted
+      pattern: "*.{tar,tar.gz}"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - untar:
+      type: directory
+      description: Directory containing contents of archive
+      pattern: "*.d"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@matthdsm"
+  - "@jfy133"
+maintainers:
+  - "@joseespinosa"
+  - "@drpatelh"
+  - "@matthdsm"
+  - "@jfy133"
diff --git a/modules/local/unzip/environment.yml b/modules/local/unzip/environment.yml
@@ -0,0 +1,7 @@
+name: unzip  # conda environment referenced by modules/local/unzip/main.nf
+channels:
+  - conda-forge
+  - bioconda
+  - defaults
+dependencies:
+  - conda-forge::p7zip=16.02  # the module's script calls `7za`
diff --git a/modules/local/unzip/main.nf b/modules/local/unzip/main.nf
@@ -0,0 +1,37 @@
+// Extract a ZIP archive with 7za into the task directory and emit the `.d` directory found there.
+process UNZIP {
+    tag "$archive"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://depot.galaxyproject.org/singularity/p7zip:16.02' :
+        'biocontainers/p7zip:16.02' }"
+
+    input:
+    tuple val(meta), path(archive)
+
+    output:
+    tuple val(meta), path("*.d"), emit: unzipped_archive
+    path "versions.yml"         , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." }
+
+    // `-o"."` extracts into the current task directory, where the `*.d` output glob is evaluated
+    """
+    7za \\
+        x \\
+        -o"." \\
+        $args \\
+        "$archive"
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//')
+    END_VERSIONS
+    """
+}
diff --git a/modules/local/unzip/meta.yml b/modules/local/unzip/meta.yml
@@ -0,0 +1,42 @@
+name: unzip
+description: Unzip ZIP archive files
+keywords:
+  - unzip
+  - decompression
+  - zip
+  - archiving
+tools:
+  - unzip:
+      description: p7zip is a quick port of 7z.exe and 7za.exe (command line version of 7zip, see www.7-zip.org) for Unix.
+      homepage: https://sourceforge.net/projects/p7zip/
+      documentation: https://sourceforge.net/projects/p7zip/
+      tool_dev_url: https://sourceforge.net/projects/p7zip
+      licence: ["LGPL-2.1-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - archive:
+      type: file
+      description: ZIP file
+      pattern: "*.zip"
+output:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - unzipped_archive:
+      type: directory
+      description: Directory contents of the unzipped archive
+      pattern: "*.d"
+  - versions:
+      type: file
+      description: File containing software versions
+      pattern: "versions.yml"
+authors:
+  - "@jfy133"
+maintainers:
+  - "@jfy133"
diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml