diff --git a/bin/fasta2peptides.py b/bin/fasta2peptides.py new file mode 100755 index 0000000..8d05bd6 --- /dev/null +++ b/bin/fasta2peptides.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 + +import argparse +import logging +from collections import defaultdict +from time import time +from Bio import SeqIO + +# Configure logging +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(message)s", + level=logging.INFO +) + +def parse_fasta(fasta_file): + """Parses a FASTA file and returns a dictionary {header: sequence}""" + fasta_map = {} + with open(fasta_file) as f: + for record in SeqIO.parse(f, "fasta"): + fasta_map[record.id] = str(record.seq) + return fasta_map + +def generate_peptides(fasta_map, peptide_length): + """Generates peptides of a specific length from the input sequences.""" + start_time = time() + peptides_set = set() + + for header, seq in fasta_map.items(): + for i in range(len(seq) - peptide_length + 1): + peptides_set.add((seq[i:i + peptide_length], header)) + + logging.info(f"Generated {len(peptides_set):,.0f} peptides of length {peptide_length} in {time() - start_time:.2f} seconds") + return peptides_set + +def group_peptides(peptides_set, peptide_col_name): + """Collapses identical peptides from different proteins and aggregates headers.""" + start_time = time() + peptides_dict = defaultdict(lambda: {"protein_ids": set(), "counts": 0}) + + for sequence, protein_id in peptides_set: + peptides_dict[sequence]["protein_ids"].add(protein_id) + peptides_dict[sequence]["counts"] += 1 + + peptides = [ + {peptide_col_name: seq, "protein_ids": ";".join(data["protein_ids"]), "counts": data["counts"]} + for seq, data in peptides_dict.items() + ] + + logging.info(f"Grouped peptides in {time() - start_time:.2f} seconds") + return peptides + +def write_output(peptides, output_file, peptide_col_name): + """Writes the peptides data to a TSV file without using pandas.""" + start_time = time() + + with open(output_file, "w") as f: + # Write header + f.write(f"{peptide_col_name}\tprotein_ids\tcounts\n") + # Write data + for peptide in peptides: + f.write(f"{peptide[peptide_col_name]}\t{peptide['protein_ids']}\t{peptide['counts']}\n") + + logging.info(f"Wrote {len(peptides):,.0f} peptides to {output_file} in {time() - start_time:.2f} seconds") + +def main(): + parser = argparse.ArgumentParser(description="Generate peptides of different lengths from a FASTA file and save as separate TSV files.") + parser.add_argument("-i", "--input", required=True, help="Input FASTA file") + parser.add_argument("-o", "--output_prefix", required=True, help="Output file prefix (each length will have its own file)") + parser.add_argument("-minl","--min_length", type=int, required=True, help="Minimum peptide length") + parser.add_argument("-maxl","--max_length", type=int, required=True, help="Maximum peptide length") + parser.add_argument("-pepcol","--peptide_col_name", type=str, required=True, help="Peptide column name") + + args = parser.parse_args() + + fasta_map = parse_fasta(args.input) + + for k in range(args.min_length, args.max_length + 1): + peptides_set = generate_peptides(fasta_map, k) + peptides = group_peptides(peptides_set, args.peptide_col_name) + output_filename = f"{args.output_prefix}_length_{k}.tsv" + write_output(peptides, output_filename, args.peptide_col_name) + +if __name__ == "__main__": + main() diff --git a/bin/gen_peptides.py b/bin/gen_peptides.py deleted file mode 100755 index d962ed3..0000000 --- a/bin/gen_peptides.py +++ /dev/null @@ -1,68 +0,0 @@ -#!/usr/bin/env
python -# Written by Sabrina Krakau, Christopher Mohr and released under the MIT license (2022). - -import argparse - -import pandas as pd -from Bio.SeqIO.FastaIO import SimpleFastaParser -from epytope.Core import Protein, generate_peptides_from_proteins - -parser = argparse.ArgumentParser("Generating peptides from protein sequences.") -parser.add_argument( - "-i", "--input", metavar="FILE", type=argparse.FileType("r"), help="FASTA filename containing proteins." -) -parser.add_argument( - "-o", "--output", metavar="FILE", type=argparse.FileType("w"), help="Output file containing peptides." -) -parser.add_argument( - "-min", "--min_length", metavar="N", type=int, help="Minimal length of peptides that will be generated." -) -parser.add_argument( - "-max", "--max_length", metavar="N", type=int, help="Maximum length of peptides that will be generated." -) -args = parser.parse_args() - - -def read_protein_fasta(file): - # split at first whitespace and use short ID - - collect = set() - # iterate over all FASTA entries: - for _id, seq in SimpleFastaParser(file): - # generate element: - _id = _id.split(" ")[0] - - try: - collect.add(Protein(seq.strip().upper(), transcript_id=_id)) - except TypeError: - collect.add(Protein(seq.strip().upper())) - return list(collect) - - -proteins = read_protein_fasta(args.input) - -c = 0 -for k in range(args.min_length, args.max_length + 1): - peptides = generate_peptides_from_proteins(proteins, k) - # get proteins and corresponding counts - pd_peptides = pd.DataFrame( - [ - ( - str(pep), - ",".join([prot.transcript_id.split(" ")[0] for prot in pep.get_all_proteins()]), - ",".join([str(len(pep.proteinPos[prot.transcript_id])) for prot in pep.get_all_proteins()]), - ) - for pep in peptides - ], - columns=["sequence", "protein_ids", "counts"], - ) - # assign id - pd_peptides = pd_peptides.assign(id=[str(c + id) for id in pd_peptides.index]) - c += len(pd_peptides["sequence"]) - - if k == args.min_length: - pd_peptides[["sequence", "id", "protein_ids", "counts"]].to_csv(args.output, sep="\t", index=False) - else: - pd_peptides[["sequence", "id", "protein_ids", "counts"]].to_csv( - args.output, sep="\t", index=False, mode="a", header=False - ) diff --git a/bin/split_peptides.py b/bin/split_peptides.py index 4e31134..abf53cb 100755 --- a/bin/split_peptides.py +++ b/bin/split_peptides.py @@ -1,34 +1,49 @@ -#!/usr/bin/env python -# Written by Sabrina Krakau, Christopher Mohr and released under the MIT license (2022). - +#!/usr/bin/env python3 +# Written by Jonas Scheid and released under the MIT license (2025).
import argparse import math +from pathlib import Path + +def split_peptides(input_file, output_base, min_size, max_chunks): + """Splits the peptide input file into smaller chunks in a single pass.""" + input_path = Path(input_file) + output_base = Path(output_base) + + with input_path.open("r") as infile: + lines = infile.readlines() # Read all lines into memory + + header, data_lines = lines[0], lines[1:] # Separate header + total_size = len(data_lines) # Number of peptides (excluding header) + + if total_size == 0: + raise ValueError("Input file contains no peptides.") + + # Determine number of chunks & chunk size + num_chunks = min(math.ceil(total_size / min_size), max_chunks) + chunk_size = max(min_size, math.ceil(total_size / num_chunks)) -parser = argparse.ArgumentParser("Split peptides input file.") -parser.add_argument("-i", "--input", metavar="FILE", type=str, help="Input file containing peptides.") -parser.add_argument("-o", "--output_base", type=str, help="Base filename for output files.") -parser.add_argument( - "-s", "--min_size", metavar="N", type=int, help="Minimum number of peptides that should be written into one file." -) -parser.add_argument( - "-c", "--max_chunks", metavar="N", type=int, help="Maximum number of chunks that should be created." -) -args = parser.parse_args() - -with open(args.input) as infile: - tot_size = sum([1 for _ in infile]) - 1 - -n = int(min(math.ceil(float(tot_size) / args.min_size), args.max_chunks)) -h = int(max(args.min_size, math.ceil(float(tot_size) / n))) - -with open(args.input) as infile: - header = next(infile) - for chunk in range(n): - with open(args.output_base + ".chunk_" + str(chunk) + ".tsv", "w") as outfile: + for chunk_idx in range(num_chunks): + chunk_file = output_base.with_name(f"{output_base.stem}_chunk_{chunk_idx}.tsv") + start = chunk_idx * chunk_size + end = start + chunk_size + + with chunk_file.open("w") as outfile: outfile.write(header) - for _ in range(h): - try: - outfile.write(next(infile)) - except StopIteration: - break + outfile.writelines(data_lines[start:end]) + + if end >= total_size: + break # Stop if we've written all data + +def main(): + parser = argparse.ArgumentParser(description="Split a peptide file into smaller chunks.") + parser.add_argument("-i", "--input", required=True, help="Input file containing peptides.") + parser.add_argument("-o", "--output_base", required=True, help="Base filename for output files.") + parser.add_argument("--min_size", type=int, required=True, help="Minimum peptides per file.") + parser.add_argument("--max_chunks", type=int, required=True, help="Maximum number of chunks.") + + args = parser.parse_args() + split_peptides(args.input, args.output_base, args.min_size, args.max_chunks) + +if __name__ == "__main__": + main() diff --git a/conf/modules.config b/conf/modules.config index fb24018..334eebe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -20,129 +20,20 @@ process { saveAs: { filename -> filename.equals('versions.yml') ? null : filename } ] - withName: 'MULTIQC' { - ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } - publishDir = [ - path: { "${params.outdir}/multiqc" }, - mode: params.publish_dir_mode, - saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } - ] - } - - withName: EPYTOPE_CHECK_REQUESTED_MODELS { - publishDir = [ - path: { "${params.outdir}/reports" }, - mode: params.publish_dir_mode - ] - ext.args = "--tools ${params.tools} " - } - - withName: EPYTOPE_CHECK_REQUESTED_MODELS_PEP { - publishDir = [ - path: { "${params.outdir}/reports" }, - mode: params.publish_dir_mode - ] - ext.args = "--tools ${params.tools} --peptides " - } - - withName: PREPARE_PREDICTION_INPUT { - ext.prefix = {"${meta.sample}"} - publishDir = [ - enabled: false - ] - } - - withName: SYFPEITHI { - publishDir = [ - path: { "${params.outdir}/syfpeithi" }, - mode: params.publish_dir_mode - ] - } - - withName: MHCFLURRY { - publishDir = [ - path: { "${params.outdir}/mhcflurry" }, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - - withName: MHCNUGGETS { - ext.prefix = {"${meta.sample}"} - publishDir = [ - path: { "${params.outdir}/mhcnuggets" }, - mode: params.publish_dir_mode, - pattern: '*predicted_mhcnuggets.csv' - ] - } - - withName: NETMHCPAN { - ext.args = "-BA" - publishDir = [ - path: { "${params.outdir}/netmhcpan" }, - mode: params.publish_dir_mode, - pattern: '*.xls' - ] - } - - withName: NETMHCIIPAN { - ext.prefix = {"${meta.sample}"} - ext.args = "-BA" - publishDir = [ - path: { "${params.outdir}/netmhciipan" }, - mode: params.publish_dir_mode, - pattern: '*.xls' - ] - } - - withName: MERGE_PREDICTIONS { - ext.prefix = {"${meta.sample}"} - publishDir = [ - path: { "${params.outdir}/merged_predictions" }, - mode: params.publish_dir_mode - ] - } - - withName: EPYTOPE_GENERATE_PEPTIDES { - publishDir = [ - path: { "${params.outdir}/generated_peptides/${meta.sample}" }, - mode: params.publish_dir_mode - ] - ext.args = '' - } - - withName: SPLIT_PEPTIDES_PEPTIDES { - ext.args = "--min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks} " - } - - withName: SPLIT_PEPTIDES_PROTEIN { - ext.args = "--min_size ${params.peptides_split_minchunksize} --max_chunks ${params.peptides_split_maxchunks} " + withName: GUNZIP_VCF { + publishDir = [ enabled: false ] } - withName: EPYTOPE_PEPTIDE_PREDICTION_PROTEIN { - // Argument list needs to end with --peptides - ext.args = [ - genome_reference != 'grch37' & genome_reference != 'grch38' ? "--genome_reference '${genome_reference}'" : '', - genome_reference == 'grch37' ? "--genome_reference 'https://grch37.ensembl.org/'" : '', - genome_reference == 'grch38' ? "--genome_reference 'https://www.ensembl.org'" : '', - '--peptides' - ].join(' ').trim() + withName: VARIANT_SPLIT { publishDir = [ - path: { "${params.outdir}/split_predictions/${meta.sample}" }, + path: { "${params.outdir}/split_input/${meta.sample}" }, mode: params.publish_dir_mode ] } - withName: EPYTOPE_PEPTIDE_PREDICTION_PEP { - // Argument list needs to end with --peptides - ext.args = [ - genome_reference != 'grch37' & genome_reference != 'grch38' ? "--genome_reference '${genome_reference}'" : '', - genome_reference == 'grch37' ? "--genome_reference 'https://grch37.ensembl.org/'" : '', - genome_reference == 'grch38' ? 
"--genome_reference 'https://www.ensembl.org'" : '', - '--peptides' - ].join(' ').trim() + withName: SNPSIFT_SPLIT { publishDir = [ - path: { "${params.outdir}/split_predictions/${meta.sample}" }, + path: { "${params.outdir}/split_input/${meta.sample}" }, mode: params.publish_dir_mode ] } @@ -161,81 +52,78 @@ process { ] } - withName: MERGE_JSON_SINGLE { - ext.args = " --single_input " - } - - withName: MERGE_JSON_MULTI { - ext.args = " --input \$PWD " + withName: FASTA2PEPTIDES { + publishDir = [ enabled: false ] } - withName: CAT_TSV { - publishDir = [ - path: { "${params.outdir}/predictions/${meta.sample}" }, - mode: params.publish_dir_mode - ] + withName: SPLIT_PEPTIDES { + publishDir = [ enabled: false ] } - withName: CAT_FASTA { - publishDir = [ - path: { "${params.outdir}/predictions/${meta.sample}" }, - mode: params.publish_dir_mode - ] + withName: PREPARE_PREDICTION_INPUT { + ext.prefix = {"${meta.file_id}"} + publishDir = [ enabled: false ] } - withName: CSVTK_CONCAT { + withName: MHCFLURRY { + ext.prefix = {"${meta.file_id}"} publishDir = [ - path: { "${params.outdir}/predictions/${meta.sample}" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/mhcflurry" }, + mode: params.publish_dir_mode, + pattern: '*.csv' ] } - withName: GET_PREDICTION_VERSIONS { + withName: MHCNUGGETS { + ext.prefix = {"${meta.file_id}"} publishDir = [ - path: { "${params.outdir}/reports" }, - mode: params.publish_dir_mode - ] - } - - withName: GUNZIP_VCF { - publishDir = [ - enabled: false + path: { "${params.outdir}/mhcnuggets" }, + mode: params.publish_dir_mode, + pattern: '*predicted_mhcnuggets.csv' ] } - withName: MERGE_JSON { + withName: NETMHCPAN { + ext.prefix = {"${meta.file_id}"} + ext.args = "-BA" publishDir = [ - path: { "${params.outdir}/predictions/${meta.sample}" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/netmhcpan" }, + mode: params.publish_dir_mode, + pattern: '*.xls' ] } - withName: EPYTOPE_SHOW_SUPPORTED_MODELS { + withName: NETMHCIIPAN { + ext.prefix = {"${meta.file_id}"} + ext.args = "-BA" publishDir = [ - path: { "${params.outdir}/supported_models" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/netmhciipan" }, + mode: params.publish_dir_mode, + pattern: '*.xls' ] } - withName: VARIANT_SPLIT { - publishDir = [ - path: { "${params.outdir}/split_input/${meta.sample}" }, - mode: params.publish_dir_mode - ] + withName: MERGE_PREDICTIONS { + ext.prefix = {"${meta.file_id}"} + ext.args = params.wide_format_output ? "--wide_format_output" : "" + publishDir = [ enabled: false ] } - withName: SNPSIFT_SPLIT { + withName: CSVTK_CONCAT { + ext.prefix = {"${meta.sample}"} publishDir = [ - path: { "${params.outdir}/split_input/${meta.sample}" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/predictions/${meta.sample}" }, + mode: params.publish_dir_mode, + pattern: '*.tsv' ] } - withName: SPLIT_PEPTIDES { + withName: MULTIQC { + ext.args = { params.multiqc_title ? "--title \"$params.multiqc_title\"" : '' } publishDir = [ - path: { "${params.outdir}/split_input/${meta.sample}" }, - mode: params.publish_dir_mode + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } ] } - } diff --git a/modules.json b/modules.json index 91ef878..7f1d421 100644 --- a/modules.json +++ b/modules.json @@ -5,6 +5,11 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "csvtk/concat": { + "branch": "master", + "git_sha": "aa5c23023134cf2d8b75a95d53557890e40261b9", + "installed_by": ["modules"] + }, "gunzip": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", diff --git a/modules/local/csvtk_concat.nf b/modules/local/csvtk_concat.nf deleted file mode 100644 index f3d986c..0000000 --- a/modules/local/csvtk_concat.nf +++ /dev/null @@ -1,39 +0,0 @@ -process CSVTK_CONCAT { - label 'process_low' - - conda "bioconda::csvtk=0.23.0" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/csvtk:0.23.0--h9ee0642_0' : - 'biocontainers/csvtk:0.23.0--h9ee0642_0' }" - - input: - tuple val(meta), path(predicted) - - output: - tuple val(meta), path("*prediction_result.tsv"), emit: predicted - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - """ - csvtk concat -t $predicted > ${meta.sample}_prediction_result_unsorted.tmp - csvtk sort -k chr:n,length:n ${meta.sample}_prediction_result_unsorted.tmp -t --out-file ${meta.sample}_prediction_result.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) - END_VERSIONS - """ - - stub: - """ - touch ${meta.sample}_prediction_result.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) - END_VERSIONS - """ -} diff --git a/modules/local/epytope_generate_peptides.nf b/modules/local/epytope_generate_peptides.nf deleted file mode 100644 index c0d2bf3..0000000 --- a/modules/local/epytope_generate_peptides.nf +++ /dev/null @@ -1,50 +0,0 @@ -process EPYTOPE_GENERATE_PEPTIDES { - label 'process_low' - tag "${meta.sample}" - - conda "bioconda::epytope=3.3.1" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/epytope:3.3.1--pyh7cba7a3_0' : - 'biocontainers/epytope:3.3.1--pyh7cba7a3_0' }" - - input: - tuple val(meta), path(raw) - - output: - tuple val(meta), path("*.tsv"), emit: splitted - path "versions.yml", emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.suffix ? "${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" - def min_length = (meta.mhc_class == "I") ? params.min_peptide_length_classI : params.min_peptide_length_classII - def max_length = (meta.mhc_class == "I") ? params.max_peptide_length_classI : params.max_peptide_length_classII - - """ - gen_peptides.py --input ${raw} \\ - --max_length ${max_length} \\ - --min_length ${min_length} \\ - --output '${prefix}.tsv' \\ - $task.ext.args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - epytope: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)") - python: \$(python --version 2>&1 | sed 's/Python //g') - END_VERSIONS - """ - - stub: - def prefix = task.ext.suffix ? 
"${meta.sample}_${task.ext.suffix}" : "${meta.sample}_peptides" - """ - touch ${prefix}.tsv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - epytope: \$(python -c "import pkg_resources; print(pkg_resources.get_distribution('epytope').version)") - python: \$(python --version 2>&1 | sed 's/Python //g') - END_VERSIONS - """ -} diff --git a/modules/local/fasta2peptides/environment.yml b/modules/local/fasta2peptides/environment.yml new file mode 100644 index 0000000..6bd963e --- /dev/null +++ b/modules/local/fasta2peptides/environment.yml @@ -0,0 +1,5 @@ +channels: +- conda-forge +- bioconda +dependencies: +- bioconda::biopython=1.85 diff --git a/modules/local/fasta2peptides/main.nf b/modules/local/fasta2peptides/main.nf new file mode 100644 index 0000000..2240990 --- /dev/null +++ b/modules/local/fasta2peptides/main.nf @@ -0,0 +1,58 @@ +process FASTA2PEPTIDES { + label 'process_low' + tag "${meta.sample}" + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/83/8372f6241b480332d91bc00a88ec8c72c8f7fcc9994177a5dd67a07007cd6e32/data' : + 'community.wave.seqera.io/library/biopython:1.85--6f761292fa9881b4' }" + + input: + tuple val(meta), path(fasta) + + output: + tuple val(meta), path("*.tsv"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.sample}" + def min_length = meta.mhc_class == "I" ? params.min_peptide_length_classI : params.min_peptide_length_classII + def max_length = meta.mhc_class == "I" ? params.max_peptide_length_classI : params.max_peptide_length_classII + def pep_col = params.peptide_col_name ? "${params.peptide_col_name}" : "sequence" + + """ + fasta2peptides.py \\ + -i $fasta \\ + -o ${prefix} \\ + -minl ${min_length} \\ + -maxl ${max_length} \\ + -pepcol ${pep_col} \\ + + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>&1 | cut -d' ' -f2) + biopython: \$(python3 -c "import Bio; print(Bio.__version__)") + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.sample}" + def min_length = meta.mhc_class == "I" ? params.min_peptide_length_classI : params.min_peptide_length_classII + def max_length = meta.mhc_class == "I" ? 
params.max_peptide_length_classI : params.max_peptide_length_classII + + """ + touch ${prefix}_length_${min_length}.tsv + touch ${prefix}_length_${max_length}.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python3 --version 2>&1 | cut -d' ' -f2) + biopython: \$(python3 -c "import Bio; print(Bio.__version__)") + END_VERSIONS + """ + +} diff --git a/modules/local/merge_predictions/main.nf b/modules/local/merge_predictions/main.nf index 9a39f4e..5c81a69 100644 --- a/modules/local/merge_predictions/main.nf +++ b/modules/local/merge_predictions/main.nf @@ -8,12 +8,11 @@ process MERGE_PREDICTIONS { 'biocontainers/mhcgnomes:1.8.6--pyh7cba7a3_0' }" input: - tuple val(meta), path(prediction_files) + tuple val(meta), path(prediction_files), path(source_file) output: tuple val(meta), path("*.tsv"), emit: merged - path "versions.yml", emit: versions - + path "versions.yml" , emit: versions script: //TODO handle the thresholds (parse the --tools_thresholds and --use_affinity_thresholds) @@ -21,7 +20,7 @@ process MERGE_PREDICTIONS { stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def prefix = task.ext.prefix ?: "${meta.id}" """ touch merged_prediction.tsv touch versions.yml diff --git a/modules/local/merge_predictions/templates/merge_predictions.py b/modules/local/merge_predictions/templates/merge_predictions.py index 01e55fe..987b1f0 100644 --- a/modules/local/merge_predictions/templates/merge_predictions.py +++ b/modules/local/merge_predictions/templates/merge_predictions.py @@ -32,8 +32,10 @@ class Arguments: def __init__(self) -> None: self.input = "$prediction_files".split(" ") + self.source_file = "$source_file" self.prefix = "$task.ext.prefix" if "$task.ext.prefix" != "null" else "$meta.id" self.alleles = sorted("$meta.alleles".split(';')) + self.peptide_col_name = "sequence" self.parse_ext_args("$task.ext.args") def parse_ext_args(self, args_string: str) -> None: @@ -47,6 +49,8 @@ def parse_ext_args(self, args_string: str) -> None: # Parse the extended arguments args_list = shlex.split(args_string) # Split the string into a list of arguments parser = argparse.ArgumentParser() + # Define custom script argument + parser.add_argument("--wide_format_output", action="store_true") # input parameters args = parser.parse_args(args_list) @@ -101,11 +105,44 @@ def BAtoic50(BA: float) -> float: """Convert binding affinity (BA) to IC50.""" return 10 ** ((1 - BA) * math.log10(50000)) + @staticmethod + def longTowide(df: pd.DataFrame) -> pd.DataFrame: + """ + Transforms a long-format DataFrame into a wide-format DataFrame, + where 'predictor-allele-BA', 'predictor-allele-rank', and 'predictor-allele-binder' + become separate columns. + + Parameters: + df (pd.DataFrame): The original long-format DataFrame. + + Returns: + pd.DataFrame: Transformed wide-format DataFrame. 
+ """ + # Identify non-predictor columns to keep as index + meta_columns = [col for col in df.columns if col not in ['predictor', 'allele', 'BA', 'rank', 'binder']] + + # Pivot to wide format + df_wide = df.pivot_table( + index=meta_columns, + columns=['predictor', 'allele'], + values=['BA', 'rank', 'binder'], + aggfunc='first' # Assuming first occurrence is enough if duplicates exist + ) + + # Flatten the MultiIndex columns + df_wide.columns = [f"{pred}_{allele}_{val}" for val, pred, allele in df_wide.columns] + + # Reset index + df_wide.reset_index(inplace=True) + + return df_wide + + class PredictionResult: - def __init__(self, file_path, alleles, threshold=2): + def __init__(self, file_path, alleles, peptide_col_name): self.file_path = file_path self.alleles = alleles - self.threshold = threshold + self.peptide_col_name = peptide_col_name self.predictor = None self.prediction_df = self._format_prediction_result() @@ -150,10 +187,10 @@ def _format_mhcflurry_prediction(self) -> pd.DataFrame: # Convert IC50 to BA df['BA'] = df['mhcflurry_affinity'].apply(Utils.ic50toBA) # Harmonize df to desired output structure - df.rename(columns={"peptide": "sequence", "mhcflurry_presentation_percentile": "rank"}, inplace=True) - df = df[['sequence', 'allele', 'rank', 'BA']] - df["binder"] = df["rank"] <= PredictorBindingThreshold.MHCFLURRY.value - df["predictor"] = self.predictor + df.rename(columns={'peptide': self.peptide_col_name, 'mhcflurry_presentation_percentile': 'rank'}, inplace=True) + df = df[[self.peptide_col_name, 'allele', 'rank', 'BA']] + df['binder'] = df['rank'] <= PredictorBindingThreshold.MHCFLURRY.value + df['predictor'] = self.predictor return df @@ -163,11 +200,11 @@ def _format_mhcnuggets_prediction(self) -> pd.DataFrame: # Convert IC50 to BA df['BA'] = df['ic50'].apply(Utils.ic50toBA) # Harmonize df to desired output structure - df.rename(columns={"peptide": "sequence", "human_proteome_rank": "rank"}, inplace=True) - df = df[['sequence', 'allele', 'rank', 'BA']] + df.rename(columns={'peptide': self.peptide_col_name, 'human_proteome_rank': 'rank'}, inplace=True) + df = df[[self.peptide_col_name, 'allele', 'rank', 'BA']] # Use IC50 < 500 as threshold since mhcnuggets provides a different ranking compared to other predictors - df["binder"] = df["BA"] >= PredictorBindingThreshold.MHCNUGGETS.value - df["predictor"] = self.predictor + df['binder'] = df['BA'] >= PredictorBindingThreshold.MHCNUGGETS.value + df['predictor'] = self.predictor return df @@ -178,24 +215,24 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame: df = pd.read_csv(self.file_path, sep='\t', skiprows=1) # Extract Peptide, percentile rank, binding affinity df = df[df.columns[df.columns.str.contains('Peptide|EL_Rank|BA-score')]] - df = df.rename(columns={'Peptide':'sequence','EL_Rank':'EL_Rank.0','BA-score':'BA-score.0'}) + df = df.rename(columns={'Peptide':self.peptide_col_name,'EL_Rank':'EL_Rank.0','BA-score':'BA-score.0'}) # to longformat based on .0|1|2.. df_long = pd.melt( df, - id_vars=["sequence"], - value_vars=[col for col in df.columns if col != "sequence"], - var_name="metric", - value_name="value", + id_vars=[self.peptide_col_name], + value_vars=[col for col in df.columns if col != self.peptide_col_name], + var_name='metric', + value_name='value', ) # Extract the allele information (e.g., .0, .1, etc.) 
- df_long["allele"] = df_long["metric"].str.split('.').str[1] - df_long["metric"] = df_long["metric"].apply(lambda x: x.split('.')[0].replace('EL_Rank','rank').replace('BA-score','BA')) + df_long['allele'] = df_long['metric'].str.split('.').str[1] + df_long['metric'] = df_long['metric'].apply(lambda x: x.split('.')[0].replace('EL_Rank','rank').replace('BA-score','BA')) # Pivot table to organize columns properly - df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index() - df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']] - df_pivot['binder'] = df_pivot['rank'] <= self.threshold + df_pivot = df_long.pivot_table(index=[self.peptide_col_name, 'allele'], columns='metric', values='value').reset_index() + df_pivot['allele'] = [alleles_dict[int(index.strip('.'))] for index in df_pivot['allele']] + df_pivot['binder'] = df_pivot['rank'] <= PredictorBindingThreshold.NETMHCPAN.value df_pivot['predictor'] = self.predictor df_pivot.index.name = '' @@ -212,22 +249,22 @@ def _format_netmhciipan_prediction(self) -> pd.DataFrame: df = pd.read_csv(self.file_path, sep='\t', skiprows=1) # Extract Peptide, percentile rank, binding affinity df = df[df.columns[df.columns.str.contains('Peptide|Rank(?!_BA)|Score_BA')]] - df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Score_BA':'Score_BA.0'}) + df = df.rename(columns={'Peptide':self.peptide_col_name,'Rank':'Rank.0','Score_BA':'Score_BA.0'}) # to longformat based on .0|1|2.. df_long = pd.melt( df, - id_vars=["sequence"], - value_vars=[col for col in df.columns if col != "sequence"], - var_name="metric", - value_name="value", + id_vars=[self.peptide_col_name], + value_vars=[col for col in df.columns if col != self.peptide_col_name], + var_name='metric', + value_name='value', ) # Extract the allele information (e.g., .0, .1, etc.) 
- df_long["allele"] = df_long["metric"].str.split('.').str[1] - df_long["metric"] = df_long["metric"].apply(lambda x: x.split('.')[0].replace('Rank','rank').replace('Score_BA','BA')) + df_long['allele'] = df_long['metric'].str.split('.').str[1] + df_long['metric'] = df_long['metric'].apply(lambda x: x.split('.')[0].replace('Rank','rank').replace('Score_BA','BA')) # Pivot table to organize columns properly - df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index() - df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']] + df_pivot = df_long.pivot_table(index=[self.peptide_col_name, 'allele'], columns='metric', values='value').reset_index() + df_pivot['allele'] = [alleles_dict[int(index.strip('.'))] for index in df_pivot['allele']] df_pivot['binder'] = df_pivot['rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value df_pivot['predictor'] = self.predictor df_pivot.index.name = '' @@ -240,14 +277,25 @@ def main(): # Iterate over each file predicted by multiple predictors, harmonize and merge output output_df = [] for file in args.input: - result = PredictionResult(file, args.alleles) + result = PredictionResult(file, args.alleles, args.peptide_col_name) logging.info(f"Writing {len(result.prediction_df)} {result.predictor} predictions to file..") output_df.append(result.prediction_df) output_df = pd.concat(output_df) - # Normalize allele names and write to file + # Normalize allele names output_df['allele'] = output_df['allele'].apply(lambda x : mhcgnomes.parse(x).to_string()) + + # Read in source file to annotate source metadata + source_df = pd.read_csv(args.source_file, sep='\t') + # Merge the prediction results with the source file + output_df = pd.merge(output_df, source_df, on=args.peptide_col_name, how='left') + + # Transform output to wide format if specified + if args.wide_format_output: + output_df = Utils.longTowide(output_df) + + # Write output file output_df.to_csv(f'{args.prefix}_predictions.tsv', sep='\t', index=False) # Parse versions diff --git a/modules/local/merge_predictions/templates/versions.yml b/modules/local/merge_predictions/templates/versions.yml deleted file mode 100644 index a16678a..0000000 --- a/modules/local/merge_predictions/templates/versions.yml +++ /dev/null @@ -1 +0,0 @@ -${task.process}:\n argparse: 1.1\n pandas: 1.5.3\n \ No newline at end of file diff --git a/modules/local/mhcflurry/main.nf b/modules/local/mhcflurry/main.nf index 41e9030..3281b8b 100644 --- a/modules/local/mhcflurry/main.nf +++ b/modules/local/mhcflurry/main.nf @@ -11,7 +11,7 @@ process MHCFLURRY { containerOptions = (workflow.containerEngine == 'docker') ? 
'-u $(id -u) -e "HOME=${HOME}" -v /etc/passwd:/etc/passwd:ro -v /etc/shadow:/etc/shadow:ro -v /etc/group:/etc/group:ro -v $HOME:$HOME' : '' input: - tuple val(meta), path(peptide_file) + tuple val(meta), path(csv) output: tuple val(meta), path("*.csv"), emit: predicted @@ -22,12 +22,12 @@ process MHCFLURRY { error("MHCflurry prediction of ${meta.sample} is not possible with MHC class II!") } def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample}" + def prefix = task.ext.prefix ?: "${meta.id}" """ mhcflurry-downloads fetch models_class1_presentation mhcflurry-predict \\ - $peptide_file \\ + $csv \\ --out ${prefix}_predicted_mhcflurry.csv \\ $args @@ -38,8 +38,7 @@ process MHCFLURRY { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_predicted_mhcflurry.csv diff --git a/modules/local/mhcnuggets/main.nf b/modules/local/mhcnuggets/main.nf index f941758..ef9d926 100644 --- a/modules/local/mhcnuggets/main.nf +++ b/modules/local/mhcnuggets/main.nf @@ -19,8 +19,8 @@ process MHCNUGGETS { template "mhcnuggets.py" stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.sample}" + def prefix = task.ext.prefix ?: "${meta.id}" + """ touch ${prefix}_predicted_mhcnuggets.tsv diff --git a/modules/local/mhcnuggets/templates/mhcnuggets.py b/modules/local/mhcnuggets/templates/mhcnuggets.py index 1c6b1ce..eb261a8 100644 --- a/modules/local/mhcnuggets/templates/mhcnuggets.py +++ b/modules/local/mhcnuggets/templates/mhcnuggets.py @@ -3,6 +3,7 @@ import argparse import shlex import logging +from pathlib import Path import pandas as pd from mhcnuggets.src.predict import predict diff --git a/modules/local/netmhciipan/main.nf b/modules/local/netmhciipan/main.nf index f244e43..ae8c567 100644 --- a/modules/local/netmhciipan/main.nf +++ b/modules/local/netmhciipan/main.nf @@ -4,11 +4,11 @@ process NETMHCIIPAN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'oras://community.wave.seqera.io/library/bash_gawk_perl_tcsh:59b949a8a9f0816b' : + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/de/de9c5fbcc5583f3c096617ef2c8f84c5e69b479cc5a5944f10d0e1d226779662/data' : 'community.wave.seqera.io/library/bash_gawk_perl_tcsh:a941b4e9bd4b8805' }" input: - tuple val(meta), path(peptide_file), path(software) + tuple val(meta), path(tsv), path(software) output: tuple val(meta), path("*.xls"), emit: predicted @@ -18,8 +18,8 @@ process NETMHCIIPAN { if (meta.mhc_class != "II") { error "NETMHCIIPAN only supports MHC class II. Use NETMHCIIPAN for MHC class II." } - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" // Adjust for netMHCIIpan allele format (e.g. 
DRB1_0101, HLA-DPA10103-DPB10101) def alleles = meta.alleles_supported.tokenize(';') .collect { @@ -30,7 +30,7 @@ process NETMHCIIPAN { """ netmhciipan/netMHCIIpan \ - -f $peptide_file \ + -f $tsv \ -inptype 1 \ -a $alleles \ -xls \ @@ -45,7 +45,7 @@ process NETMHCIIPAN { stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_predicted_netmhciipan.xls diff --git a/modules/local/netmhcpan/main.nf b/modules/local/netmhcpan/main.nf index 64516b1..220772d 100644 --- a/modules/local/netmhcpan/main.nf +++ b/modules/local/netmhcpan/main.nf @@ -4,11 +4,11 @@ process NETMHCPAN { conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'oras://community.wave.seqera.io/library/bash_gawk_perl_tcsh:59b949a8a9f0816b' : + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/de/de9c5fbcc5583f3c096617ef2c8f84c5e69b479cc5a5944f10d0e1d226779662/data' : 'community.wave.seqera.io/library/bash_gawk_perl_tcsh:a941b4e9bd4b8805' }" input: - tuple val(meta), path(peptide_file), path(software) + tuple val(meta), path(tsv), path(software) output: tuple val(meta), path("*.xls"), emit: predicted @@ -18,13 +18,13 @@ process NETMHCPAN { if (meta.mhc_class != "I") { error "NETMHCPAN only supports MHC class I. Use NETMHCIIPAN for MHC class II." } - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample - def alleles = meta.alleles_supported.tokenize(';').collect { it.replace('*', '') }.join(',') + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def alleles = meta.alleles_supported.tokenize(';').collect { it.replace('*', '') }.join(',') """ netmhcpan/netMHCpan \ - -p $peptide_file \ + -p $tsv \ -a $alleles \ -xls \ -xlsfile ${prefix}_predicted_netmhcpan.xls \ @@ -37,8 +37,8 @@ process NETMHCPAN { """ stub: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_predicted_netmhcpan.xls diff --git a/modules/local/prepare_prediction_input/main.nf b/modules/local/prepare_prediction_input/main.nf index 49531d2..0da7b0c 100644 --- a/modules/local/prepare_prediction_input/main.nf +++ b/modules/local/prepare_prediction_input/main.nf @@ -20,7 +20,7 @@ process PREPARE_PREDICTION_INPUT { stub: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: meta.sample + def prefix = task.ext.prefix ?: "${meta.id}" """ touch ${prefix}_syfpeithi.csv touch ${prefix}_mhcflurry.csv diff --git a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py index 423fce0..5a806cc 100644 --- a/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py +++ b/modules/local/prepare_prediction_input/templates/prepare_prediction_input.py @@ -2,9 +2,10 @@ import argparse import shlex -from enum import Enum -import logging import json +import logging +from enum import Enum +from pathlib import Path import pandas as pd import mhcgnomes @@ -144,24 +145,17 @@ def keep_supported_alleles(allele_ls: list, tool: str, supported_alleles_tool: d return tool_allele_input + def has_valid_aas(peptide: str) -> bool: + """ + Check if a peptide contains only valid amino acids. 
+ """ + valid_aas = "ACDEFGHIKLMNPQRSTVWY" + return all(aa in valid_aas for aa in peptide) + def main(): args = Arguments() - ### DEBUGGING INPUT - #args.input = '/mnt/volume/data/epitopeprediction/peptides.tsv' - #args.supported_alleles_json = '/mnt/volume/dev/mhcgnomes/supported_alleles.json' - #args.mhc_class = 'I' - #args.alleles='/mnt/volume/dev/epitopeprediction-2/modules/local/check_supported_alleles/templates/test_alleles.txt' - #args.tools = ['mhcflurry', 'mhcnuggets', 'mhcnuggetsii', 'netmhcpan','netmhciipan'] - #args.prefix = "test" - #args.min_peptide_length_classI = 8 - #args.max_peptide_length_classI = 12 - #args.min_peptide_length_classII = 12 - #args.max_peptide_length_classII = 25 - #args.peptide_col_name = "sequence" - ### - # Parse alleles to uniform format alleles_normalized = Utils.parse_alleles(args.alleles) supported_alleles_dict = json.load(open(args.supported_alleles_json)) @@ -176,13 +170,13 @@ def main(): # Parse input file to desired format of tools df_input = pd.read_csv(args.input, sep="\t") logging.info(f"Read file with {len(df_input)} peptides.") - + # Filter peptides with invalid amino acids + df_input = df_input[df_input[args.peptide_col_name].apply(Utils.has_valid_aas)] # Filter peptides based on user-defined length if args.mhc_class == "I": df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)] else: df = df_input[df_input[args.peptide_col_name].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)] - if len(df) == 0: raise ValueError("No peptides left after applying length filters! Aborting..") diff --git a/modules/local/split_peptides/environment.yml b/modules/local/split_peptides/environment.yml new file mode 100644 index 0000000..f399d67 --- /dev/null +++ b/modules/local/split_peptides/environment.yml @@ -0,0 +1,5 @@ +channels: +- conda-forge +- bioconda +dependencies: +- conda-forge::python=3.11.0 diff --git a/modules/local/split_peptides.nf b/modules/local/split_peptides/main.nf similarity index 54% rename from modules/local/split_peptides.nf rename to modules/local/split_peptides/main.nf index 998b18a..e48c4f2 100644 --- a/modules/local/split_peptides.nf +++ b/modules/local/split_peptides/main.nf @@ -1,28 +1,31 @@ process SPLIT_PEPTIDES { label 'process_low' + tag "${meta.sample}" - conda "conda-forge::python=3.8.3" + conda "${moduleDir}/environment.yml" container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/python:3.8.3' : - 'biocontainers/python:3.8.3' }" + 'https://depot.galaxyproject.org/singularity/python:3.11' : + 'biocontainers/python:3.11' }" input: - tuple val(meta), path(peptide) + tuple val(meta), path(tsv) output: tuple val(meta), path("*.tsv"), emit: splitted - path "versions.yml", emit: versions + path "versions.yml" , emit: versions when: task.ext.when == null || task.ext.when script: - def prefix = task.ext.suffix ? 
"${peptide.baseName}_${task.ext.suffix}" : "${peptide.baseName}" + def prefix = task.ext.suffix ?: "${tsv.baseName}" """ - split_peptides.py --input ${peptide} \\ - --output_base "${prefix}" \\ - $task.ext.args + split_peptides.py \\ + --input $tsv \\ + --output_base "${prefix}" \\ + --min_size ${params.peptides_split_minchunksize} \\ + --max_chunks ${params.peptides_split_maxchunks} \\ cat <<-END_VERSIONS > versions.yml "${task.process}": @@ -31,7 +34,8 @@ process SPLIT_PEPTIDES { """ stub: - def prefix = task.ext.suffix ? "${peptide.baseName}_${task.ext.suffix}" : "${peptide.baseName}" + def prefix = task.ext.suffix ?: "${tsv.getExtension()}" + """ touch ${prefix}_1.tsv touch ${prefix}_2.tsv diff --git a/modules/nf-core/csvtk/concat/environment.yml b/modules/nf-core/csvtk/concat/environment.yml new file mode 100644 index 0000000..12087be --- /dev/null +++ b/modules/nf-core/csvtk/concat/environment.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda + +dependencies: + - bioconda::csvtk=0.31.0 diff --git a/modules/nf-core/csvtk/concat/main.nf b/modules/nf-core/csvtk/concat/main.nf new file mode 100644 index 0000000..9f17a9b --- /dev/null +++ b/modules/nf-core/csvtk/concat/main.nf @@ -0,0 +1,55 @@ +process CSVTK_CONCAT { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/csvtk:0.31.0--h9ee0642_0' : + 'biocontainers/csvtk:0.31.0--h9ee0642_0' }" + + input: + tuple val(meta), path(csv, name: 'inputs/csv*/*') + val in_format + val out_format + + output: + tuple val(meta), path("${prefix}.${out_extension}"), emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + def delimiter = in_format == "tsv" ? "\t" : (in_format == "csv" ? "," : in_format) + def out_delimiter = out_format == "tsv" ? "\t" : (out_format == "csv" ? "," : out_format) + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + csvtk \\ + concat \\ + $args \\ + --num-cpus $task.cpus \\ + --delimiter "${delimiter}" \\ + --out-delimiter "${out_delimiter}" \\ + --out-file ${prefix}.${out_extension} \\ + $csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + out_extension = out_format == "tsv" ? 'tsv' : 'csv' + """ + touch ${prefix}.${out_extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + csvtk: \$(echo \$( csvtk version | sed -e "s/csvtk v//g" )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/csvtk/concat/meta.yml b/modules/nf-core/csvtk/concat/meta.yml new file mode 100644 index 0000000..27ffc1c --- /dev/null +++ b/modules/nf-core/csvtk/concat/meta.yml @@ -0,0 +1,52 @@ +name: csvtk_concat +description: Concatenate two or more CSV (or TSV) tables into a single table +keywords: + - concatenate + - tsv + - csv +tools: + - csvtk: + description: A cross-platform, efficient, practical CSV/TSV toolkit + homepage: http://bioinf.shenwei.me/csvtk + documentation: http://bioinf.shenwei.me/csvtk + tool_dev_url: https://github.com/shenwei356/csvtk + licence: ["MIT"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - csv: + type: file + description: CSV/TSV formatted files + pattern: "*.{csv,tsv}" + - - in_format: + type: string + description: Input format (csv, tab, or a delimiting character) + pattern: "*" + - - out_format: + type: string + description: Output format (csv, tab, or a delimiting character) + pattern: "*" +output: + - csv: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - ${prefix}.${out_extension}: + type: file + description: Concatenated CSV/TSV file + pattern: "*.{csv,tsv}" + - versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "version.yml" +authors: + - "@rpetit3" +maintainers: + - "@rpetit3" diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test b/modules/nf-core/csvtk/concat/tests/main.nf.test new file mode 100644 index 0000000..b6c1a58 --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test @@ -0,0 +1,72 @@ +// nf-core modules test csvtk/concat +nextflow_process { + + name "Test Process CSVTK_CONCAT" + script "../main.nf" + process "CSVTK_CONCAT" + + tag "modules" + tag "modules_nfcore" + tag "csvtk" + tag "csvtk/concat" + + test("tsv - concat - csv") { + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) + ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("tsv - concat - csv - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ + [ id:'test' ], // meta map + [ + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_hybrid.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_long.csv", checkIfExists: true), + file("https://github.com/nf-core/test-datasets/raw/bacass/bacass_short.csv", checkIfExists: true) + ] + ] + input[1] = "tsv" + input[2] = "csv" + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/tests/main.nf.test.snap b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap new file mode 100644 index 0000000..254d34a --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "tsv - concat - csv - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ], + "csv": [ + [ + { + "id": "test" + }, + "test.csv:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-08T04:46:46.133640633" + }, + "tsv - concat - csv": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.csv:md5,bb0ed52999b6b24297bcefb3c29f0a5c" + ] + ], + "1": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ], + "csv": [ + [ + { + 
"id": "test" + }, + "test.csv:md5,bb0ed52999b6b24297bcefb3c29f0a5c" + ] + ], + "versions": [ + "versions.yml:md5,c203a84cc5b289951b70302549dcf08d" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.3" + }, + "timestamp": "2025-01-08T04:46:31.419386462" + } +} \ No newline at end of file diff --git a/modules/nf-core/csvtk/concat/tests/tags.yml b/modules/nf-core/csvtk/concat/tests/tags.yml new file mode 100644 index 0000000..0d10e7c --- /dev/null +++ b/modules/nf-core/csvtk/concat/tests/tags.yml @@ -0,0 +1,2 @@ +csvtk/concat: + - "modules/nf-core/csvtk/concat/**" diff --git a/nextflow.config b/nextflow.config index 95864e5..d915baf 100644 --- a/nextflow.config +++ b/nextflow.config @@ -10,11 +10,13 @@ params { // Input options input = null + peptide_col_name = 'sequence' peptides_split_maxchunks = 100 - peptides_split_minchunksize = 5000 + peptides_split_minchunksize = 500 split_by_variants = false split_by_variants_size = 0 split_by_variants_distance = 110000 + wide_format_output = false // References genome_reference = 'grch37' @@ -26,7 +28,6 @@ params { // Options: Filtering filter_self = false - show_supported_models = false // External tools external_tools_meta = "${projectDir}/assets/external_tools_meta.json" diff --git a/nextflow_schema.json b/nextflow_schema.json index 5b9cba1..7765f00 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -66,6 +66,11 @@ "description": "Options for the peptide prediction step.", "fa_icon": "far fa-chart-bar", "properties": { + "peptide_col_name": { + "type": "string", + "default": "sequence", + "description": "Specifies the column name in the input file that contains the peptide sequences." + }, "filter_self": { "type": "boolean", "description": "Filter against human proteome.", @@ -119,10 +124,11 @@ "description": "Specifies that sequences of proteins, affected by provided variants, will be written to a FASTA file.", "help_text": "Specifies that sequences of proteins that are affected by the provided genomic variants are written to a `FASTA` file. The resulting `FASTA` file will contain the wild-type and mutated protein sequences." }, - "show_supported_models": { + "wide_format_output": { "type": "boolean", - "description": "Writes out supported prediction models.", - "help_text": "Writes out supported models. Does not run any predictions." + "default": false, + "description": "Specifies that the output file will be in wide format.", + "help_text": "Transforms output file such that for each predictor and allele a column `predictor_allele_predictionMetric` is created." 
} } }, diff --git a/subworkflows/local/mhc_binding_prediction/main.nf b/subworkflows/local/mhc_binding_prediction/main.nf index 246efe1..7213397 100755 --- a/subworkflows/local/mhc_binding_prediction/main.nf +++ b/subworkflows/local/mhc_binding_prediction/main.nf @@ -1,13 +1,13 @@ -include { PREPARE_PREDICTION_INPUT } from '../../../modules/local/prepare_prediction_input' -include { SYFPEITHI } from '../../../modules/local/syfpeithi' -include { MHCFLURRY } from '../../../modules/local/mhcflurry' +include { PREPARE_PREDICTION_INPUT } from '../../../modules/local/prepare_prediction_input' +include { SYFPEITHI } from '../../../modules/local/syfpeithi' +include { MHCFLURRY } from '../../../modules/local/mhcflurry' include { MHCNUGGETS; - MHCNUGGETS as MHCNUGGETSII } from '../../../modules/local/mhcnuggets' -include { NETMHCPAN } from '../../../modules/local/netmhcpan' -include { NETMHCIIPAN } from '../../../modules/local/netmhciipan' + MHCNUGGETS as MHCNUGGETSII } from '../../../modules/local/mhcnuggets' +include { NETMHCPAN } from '../../../modules/local/netmhcpan' +include { NETMHCIIPAN } from '../../../modules/local/netmhciipan' include { UNPACK_NETMHC_SOFTWARE as NETMHCPAN_IMPORT; - UNPACK_NETMHC_SOFTWARE as NETMHCIIPAN_IMPORT} from '../../../modules/local/unpack_netmhc_software' -include { MERGE_PREDICTIONS } from '../../../modules/local/merge_predictions' + UNPACK_NETMHC_SOFTWARE as NETMHCIIPAN_IMPORT } from '../../../modules/local/unpack_netmhc_software' +include { MERGE_PREDICTIONS } from '../../../modules/local/merge_predictions' workflow MHC_BINDING_PREDICTION { take: @@ -22,8 +22,13 @@ workflow MHC_BINDING_PREDICTION { validate_tools_param(tools) + // Add filename to meta to prevent overwriting identically named files + ch_peptides + .map { meta, file -> [meta + [file_id: meta.sample + '_' + file.baseName], file] } + .set { ch_peptides_to_predict } + //prepare the input file - PREPARE_PREDICTION_INPUT( ch_peptides, supported_alleles_json) + PREPARE_PREDICTION_INPUT( ch_peptides_to_predict, supported_alleles_json) .prepared .transpose() .branch { @@ -70,9 +75,13 @@ workflow MHC_BINDING_PREDICTION { ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCIIPAN.out.predicted) } - MERGE_PREDICTIONS( ch_binding_predictors_out - .map { meta, file -> [meta.subMap('sample','alleles','mhc_class','input_type'), file] } - .groupTuple()) + ch_binding_predictors_out + .map { meta, file -> [meta.subMap('sample','alleles','mhc_class','input_type','file_id'), file] } + .groupTuple() + .join(ch_peptides_to_predict) + .set { ch_binding_predictors_out_meta} + + MERGE_PREDICTIONS( ch_binding_predictors_out_meta ) ch_versions = ch_versions.mix(MERGE_PREDICTIONS.out.versions) emit: @@ -80,8 +89,6 @@ workflow MHC_BINDING_PREDICTION { versions = ch_versions } - - // // Auxiliar Functions // diff --git a/workflows/epitopeprediction.nf b/workflows/epitopeprediction.nf index 9f17651..09f3ec4 100644 --- a/workflows/epitopeprediction.nf +++ b/workflows/epitopeprediction.nf @@ -6,19 +6,14 @@ // // MODULE: Local to the pipeline // -include { VARIANT_SPLIT } from '../modules/local/variant_split' -include { SNPSIFT_SPLIT } from '../modules/local/snpsift_split' -include { EPYTOPE_GENERATE_PEPTIDES } from '../modules/local/epytope_generate_peptides' -include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PEPTIDES } from '../modules/local/split_peptides' -include { SPLIT_PEPTIDES as SPLIT_PEPTIDES_PROTEIN } from '../modules/local/split_peptides' -include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_PROTEIN } 
from '../modules/local/epytope_peptide_prediction' -include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_PEP } from '../modules/local/epytope_peptide_prediction' -include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_VAR } from '../modules/local/epytope_peptide_prediction' -include { CAT_FILES as CAT_TSV } from '../modules/local/cat_files' -include { CAT_FILES as CAT_FASTA } from '../modules/local/cat_files' -include { CSVTK_CONCAT } from '../modules/local/csvtk_concat' -include { MERGE_JSON as MERGE_JSON_SINGLE } from '../modules/local/merge_json' -include { MERGE_JSON as MERGE_JSON_MULTI } from '../modules/local/merge_json' +include { VARIANT_SPLIT } from '../modules/local/variant_split' +include { SNPSIFT_SPLIT } from '../modules/local/snpsift_split' +include { FASTA2PEPTIDES } from '../modules/local/fasta2peptides' +include { SPLIT_PEPTIDES } from '../modules/local/split_peptides' +include { EPYTOPE_PEPTIDE_PREDICTION as EPYTOPE_PEPTIDE_PREDICTION_VAR } from '../modules/local/epytope_peptide_prediction' +include { CAT_FILES as CAT_TSV } from '../modules/local/cat_files' +include { CAT_FILES as CAT_FASTA } from '../modules/local/cat_files' +include { CSVTK_CONCAT } from '../modules/nf-core/csvtk/concat' // // SUBWORKFLOW: Consisting of a mix of local and nf-core/modules @@ -132,18 +127,8 @@ workflow EPITOPEPREDICTION { // RUN EPITOPE PREDICTION //======================================================================================== //*/ - // TODO: Implement Fasta parsing - //// not sure if this is the best solution to also have a extra process for protein, but I think we need it for cases when we have both in one sheet? (CM) - //// run epitope prediction for proteins - //EPYTOPE_PEPTIDE_PREDICTION_PROTEIN( - // SPLIT_PEPTIDES_PROTEIN - // .out - // .splitted - // .combine( ch_prediction_tool_versions ) - // .transpose(), - // EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) - //) - //ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_PROTEIN.out.versions ) + FASTA2PEPTIDES( ch_samples_uncompressed.protein ) + ch_versions = ch_versions.mix( FASTA2PEPTIDES.out.versions ) //ch_prediction_input = SPLIT_PEPTIDES_PEPTIDES.out.splitted.transpose() @@ -157,11 +142,27 @@ workflow EPITOPEPREDICTION { // EXTERNAL_TOOLS_IMPORT.out.nonfree_tools.collect().ifEmpty([]) //) //ch_versions = ch_versions.mix( EPYTOPE_PEPTIDE_PREDICTION_VAR.out.versions ) + // TODO: Speed up prediction by splitting large files + ch_to_predict = ch_samples_uncompressed.peptide + .mix(FASTA2PEPTIDES.out.tsv.transpose()) + + // Split tsv if size exceeds params.peptides_split_minchunksize + SPLIT_PEPTIDES(ch_to_predict) + ch_versions = ch_versions.mix(SPLIT_PEPTIDES.out.versions) - MHC_BINDING_PREDICTION( ch_samples_uncompressed.peptide, params.tools, supported_alleles_json, netmhc_software_meta) + // + // SUBWORKFLOW: MHC Binding Prediction + // + MHC_BINDING_PREDICTION( SPLIT_PEPTIDES.out.splitted.transpose(), + params.tools, + supported_alleles_json, + netmhc_software_meta) ch_versions = ch_versions.mix(MHC_BINDING_PREDICTION.out.versions) - ch_predicted_peptides = MHC_BINDING_PREDICTION.out.predicted + // TODO: Fix meta.id / meta.sample + CSVTK_CONCAT(MHC_BINDING_PREDICTION.out.predicted + .map { meta, file -> [meta.subMap('sample','alleles','mhc_class'), file] } + .groupTuple(), "tsv", "tsv") // // Collate and save software versions