diff --git a/LICENSE b/LICENSE index fa6e5d35..26cbbd71 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ MIT License -Copyright (c) Leon Bichmann, Marissa Dubbelaar, Jonas Scheid, Steffen Lemke +Copyright (c) Jonas Scheid, Steffen Lemke, Leon Bichmann, Marissa Dubbelaar Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/bin/markdown_to_html.py b/bin/markdown_to_html.py deleted file mode 100755 index 168daee6..00000000 --- a/bin/markdown_to_html.py +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/env python - -from __future__ import print_function -import argparse -import markdown -import os -import sys -import io - - -def convert_markdown(in_fn): - input_md = io.open(in_fn, mode="r", encoding="utf-8").read() - html = markdown.markdown( - "[TOC]\n" + input_md, - extensions=[ - "pymdownx.extra", - "pymdownx.b64", - "pymdownx.highlight", - "pymdownx.emoji", - "pymdownx.tilde", - "toc", - ], - extension_configs={ - "pymdownx.b64": {"base_path": os.path.dirname(in_fn)}, - "pymdownx.highlight": {"noclasses": True}, - "toc": {"title": "Table of Contents"}, - }, - ) - return html - - -def wrap_html(contents): - header = """ - - - - - -
- """ - footer = """ -
- - - """ - return header + contents + footer - - -def parse_args(args=None): - parser = argparse.ArgumentParser() - parser.add_argument( - "mdfile", - type=argparse.FileType("r"), - nargs="?", - help="File to convert. Defaults to stdin.", - ) - parser.add_argument( - "-o", - "--out", - type=argparse.FileType("w"), - default=sys.stdout, - help="Output file name. Defaults to stdout.", - ) - return parser.parse_args(args) - - -def main(args=None): - args = parse_args(args) - converted_md = convert_markdown(args.mdfile.name) - html = wrap_html(converted_md) - args.out.write(html) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/mhcflurry_neoepitope_binding_prediction.py b/bin/mhcflurry_neoepitope_binding_prediction.py deleted file mode 100755 index 31dd403a..00000000 --- a/bin/mhcflurry_neoepitope_binding_prediction.py +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. - -import pandas as pd -import numpy as np -import logging -import sys - -from mhcflurry import Class1AffinityPredictor - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Neoepitope Binding Prediction") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - -# List of alleles supported by mhcflurry -supported_alleles = ( - "A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19," - "A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01," - "A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02," - "B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03," - "B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01," - "B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03," - "C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",") -) - -# read provided allotypes -alleles = sys.argv[-3].split(";") - -# extract and verify alleles -unsupported_alleles = [a for a in alleles if a not in supported_alleles] -alleles = [a for a in alleles if a in supported_alleles] - -if unsupported_alleles: - for allele in unsupported_alleles: - LOG.warning("Allele: " + allele + " is not supported by MHCFlurry!") -if not alleles: - LOG.warning("Submitted alleles are not supported or formatting of input.tsv is not correct!") - -flatten = lambda l: [item for sublist in l for item in sublist] -# read identified peptides -neoepitopes = [line.rstrip("\n").strip().split(",") for line in open(sys.argv[-2])][1:] -neoepitopes = flatten(neoepitopes) -seqs_to_geneID = dict(zip(neoepitopes[::2], neoepitopes[1::2])) - -if len(seqs_to_geneID) > 0: - # call mhcflurry - for allele in alleles: - predictor = Class1AffinityPredictor.load() - df_pred = predictor.predict_to_dataframe(allele=allele, peptides=seqs_to_geneID.keys()) - df_pred.insert(1, "geneID", pd.Series(np.array(seqs_to_geneID.values()))) - df_pred.to_csv(allele + "_" + sys.argv[-1]) -else: - op = open(sys.argv[-1], "w") - op.close() diff --git a/bin/mhcflurry_predict_mztab.py b/bin/mhcflurry_predict_mztab.py deleted file mode 100755 index a2b165b8..00000000 --- a/bin/mhcflurry_predict_mztab.py +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. 
- -import argparse -import logging -import pandas as pd -import numpy as np -import sys - -from collections import defaultdict -from mhcflurry import Class1AffinityPredictor - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("MHCFlurry Predict mztab") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_mztab(identified_peptides_file): - """ - parses an mztab file and returns all identified proteins - :param identified_peptides_file: path to the mztab file - :return: identified proteins - """ - mztab = open(identified_peptides_file) - mztab_read = mztab.readlines() - mztab.close() - - seq_geneIDs = defaultdict(str) - for line in mztab_read: - if line.startswith("PEP"): - content = line.split("\t") - seq = content[1] - geneID = content[2] - if not "U" in seq and not "X" in seq and not "Z" in seq and not "J" in seq and not "B" in seq: - seq_geneIDs[seq] = geneID - - return seq_geneIDs - - -# List of alleles supported by mhcflurry -supported_alleles = ( - "A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19," - "A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01," - "A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02," - "B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03," - "B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01," - "B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03," - "C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",") -) - -# read provided allotypes -alleles = sys.argv[-3].split(";") - -# extract and verify alleles -unsupported_alleles = [a for a in alleles if a not in supported_alleles] -alleles = [a for a in alleles if a in supported_alleles] - -if unsupported_alleles: - for allele in unsupported_alleles: - LOG.warning("Allele: " + allele + " is not supported by MHCFlurry!") -if not alleles: - LOG.warning("Submitted alleles are not supported or formatting of input.tsv is not correct!") - -# read identified peptides -seqs_to_geneID = parse_mztab(sys.argv[-2]) - -if len(seqs_to_geneID) > 0: - # call mhcflurry - for allele in alleles: - predictor = Class1AffinityPredictor.load() - df_pred = predictor.predict_to_dataframe(allele=allele, peptides=seqs_to_geneID.keys()) - df_pred.insert(1, "geneID", pd.Series(np.array(seqs_to_geneID.values()))) - df_pred.to_csv(allele + "_" + sys.argv[-1]) -else: - op = open(sys.argv[-1], "w") - op.close() diff --git a/bin/mhcflurry_predict_mztab_for_filtering.py b/bin/mhcflurry_predict_mztab_for_filtering.py deleted file mode 100755 index f5bd9e51..00000000 --- a/bin/mhcflurry_predict_mztab_for_filtering.py +++ /dev/null @@ -1,83 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. 
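The parse_mztab() helper used by mhcflurry_predict_mztab.py above (and by several scripts below) collects peptide-to-geneID pairs from mzTab PEP rows and drops sequences containing ambiguous residues. A minimal sketch over a fabricated three-column mzTab fragment (real files carry many more columns):

```python
from collections import defaultdict

# Fabricated fragment: field 1 of a PEP row is the sequence, field 2 the accession.
toy_mztab = (
    "MTD\tmzTab-version\t1.0.0\n"
    "PEP\tSIINFEKL\tENSG00000100001\n"
    "PEP\tSIIBFEKL\tENSG00000100002\n"  # ambiguous residue B -> skipped
)

seq_geneIDs = defaultdict(str)
for line in toy_mztab.splitlines():
    if line.startswith("PEP"):
        _, seq, geneID = line.split("\t")[:3]
        # condensed form of the per-residue checks in the original script
        if not any(aa in seq for aa in "BJUXZ"):
            seq_geneIDs[seq] = geneID

print(dict(seq_geneIDs))  # {'SIINFEKL': 'ENSG00000100001'}
```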
- -from mhcflurry import Class1AffinityPredictor -import logging -import sys - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("MHCFlurry Predict mztab for filtering") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - -# List of alleles supported by mhclurry -supported_alleles = ( - "A*01:01,A*02:01,A*02:02,A*02:03,A*02:05,A*02:06,A*02:07,A*02:11,A*02:12,A*02:16,A*02:17,A*02:19," - "A*02:50,A*03:01,A*11:01,A*23:01,A*24:02,A*24:03,A*25:01,A*26:01,A*26:02,A*26:03,A*29:02,A*30:01," - "A*30:02,A*31:01,A*32:01,A*33:01,A*66:01,A*68:01,A*68:02,A*68:23,A*69:01,A*80:01,B*07:01,B*07:02," - "B*08:01,B*08:02,B*08:03,B*14:02,B*15:01,B*15:02,B*15:03,B*15:09,B*15:17,B*18:01,B*27:02,B*27:03," - "B*27:04,B*27:05,B*27:06,B*35:01,B*35:03,B*37:01,B*38:01,B*39:01,B*39:06,B*40:01,B*40:02,B*42:01," - "B*44:02,B*44:03,B*45:01,B*46:01,B*48:01,B*51:01,B*53:01,B*54:01,B*57:01,B*58:01,B*83:01,C*03:03," - "C*04:01,C*05:01,C*06:02,C*07:02,C*08:02,C*12:03,C*14:02,C*15:02".split(",") -) - -# read provided allotypes -print(sys.argv[-4]) -alleles = sys.argv[-4].split(";") -print(alleles) - -# extract and verify alleles -unsupported_alleles = [a for a in alleles if a not in supported_alleles] -alleles = [a for a in alleles if a in supported_alleles] - -if unsupported_alleles: - for allele in unsupported_alleles: - LOG.warning("Allele: " + allele + " is not supported by MHCFlurry!") -if not alleles: - LOG.warning("Submitted alleles are not supported or formatting of input.tsv is not correct!") - - -# read identified peptides with q-value < threshold -mztab = open(sys.argv[-3]) -mztab_read = mztab.readlines() -mztab.close() -seqs = [l.split()[1] for l in mztab_read if l.startswith("PSM")] -seqs_new_smaller_qval = list(set(seqs)) - -# read all remaining peptides with q-value > threshold -mztab = open(sys.argv[-2]) -mztab_read = mztab.readlines() -mztab.close() -seqs = [l.split()[1] for l in mztab_read if l.startswith("PSM") if l.split()[1] not in seqs_new_smaller_qval] -seqs_new_greater_qval = list(set(seqs)) -seqs_new_greater_qval = [ - s - for s in seqs_new_greater_qval - if 7 < len(s) < 13 and not "U" in s and not "X" in s and not "Z" in s and not "J" in s and not "B" in s -] - -# call mhcflurry -# subprocess.call("mhcflurry-predict --peptides {p} --alleles {a} --out {o}".format(p=" ".join(seqs_new), a=" ".join(alleles), o=sys.argv[-1])) -seqs_filtered = [] -for allele in alleles: - print(allele) - predictor = Class1AffinityPredictor.load() - df_pred = predictor.predict_to_dataframe(allele=allele, peptides=seqs_new_greater_qval) - seqs_filtered += df_pred[df_pred["prediction"] <= float(sys.argv[-5])]["peptide"].values.tolist() - -# merge sequence lists and append decoys -seqs_new_all = list(set(seqs_new_smaller_qval + seqs_filtered)) -seqs_new_all = seqs_new_all + [s[::-1] for s in seqs_new_all] - -# write idXML for filtering -op = open(sys.argv[-1], "w") -op.write('' + "\n") -for pep in seqs_new_all: - op.write("\t\t\t" + '' + "\n") - op.write("\t\t\t" + "" + "\n") -op.write("" + "\n") -op.close() diff --git a/bin/mhcnuggets_predict_peptides.py b/bin/mhcnuggets_predict_peptides.py deleted file mode 100755 index 724e89b3..00000000 --- a/bin/mhcnuggets_predict_peptides.py +++ /dev/null @@ -1,192 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. 
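The filtering script above applies a two-tier rescue rule: peptides already below the q-value cutoff are kept outright, peptides above it are rescued only when the predicted affinity is at or below the threshold passed as sys.argv[-5], and reversed sequences are appended as decoys for the idXML whitelist. A toy sketch of that rule with made-up peptides and affinities:

```python
kept = {"SIINFEKL"}  # q-value already below the cutoff
candidates = {"GILGFVFTL": 12.3, "AAAAAAAAA": 4500.0}  # peptide -> predicted nM
threshold = 500.0  # the pipeline passes the real cutoff via sys.argv[-5]

rescued = [p for p, ic50 in candidates.items() if ic50 <= threshold]
whitelist = sorted(kept | set(rescued))
whitelist += [p[::-1] for p in whitelist]  # reversed sequences serve as decoys
print(whitelist)  # ['GILGFVFTL', 'SIINFEKL', 'LTFVFGLIG', 'LKEFNIIS']
```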
- -from mhcnuggets.src.predict import predict -import argparse -import logging -import sys - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("MHCNuggets Predict Peptides") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - -supported_alleles_class_2 = [ - "HLA-DPA1*01:03-DPB1*02:01", - "HLA-DPA1*01:03-DPB1*03:01", - "HLA-DPA1*01:03-DPB1*04:01", - "HLA-DPA1*01:03-DPB1*04:02", - "HLA-DPA1*02:01-DPB1*01:01", - "HLA-DPA1*02:01-DPB1*05:01", - "HLA-DPA1*02:02-DPB1*05:01", - "HLA-DPA1*03:01-DPB1*04:02", - "HLA-DPB1*01:01", - "HLA-DPB1*02:01", - "HLA-DPB1*03:01", - "HLA-DPB1*04:01", - "HLA-DPB1*04:02", - "HLA-DPB1*05:01", - "HLA-DPB1*09:01", - "HLA-DPB1*11:01", - "HLA-DPB1*14:01", - "HLA-DPB1*20:01", - "HLA-DQA1*01:01", - "HLA-DQA1*01:01-DQB1*05:01", - "HLA-DQA1*01:01-DQB1*05:03", - "HLA-DQA1*01:02", - "HLA-DQA1*01:02-DQB1*05:01", - "HLA-DQA1*01:02-DQB1*05:02", - "HLA-DQA1*01:02-DQB1*06:02", - "HLA-DQA1*01:02-DQB1*06:04", - "HLA-DQA1*01:03-DQB1*03:02", - "HLA-DQA1*01:03-DQB1*06:01", - "HLA-DQA1*01:03-DQB1*06:03", - "HLA-DQA1*01:04-DQB1*05:03", - "HLA-DQA1*02:01-DQB1*02:01", - "HLA-DQA1*02:01-DQB1*02:02", - "HLA-DQA1*02:01-DQB1*03:01", - "HLA-DQA1*02:01-DQB1*03:03", - "HLA-DQA1*02:01-DQB1*04:02", - "HLA-DQA1*03:01", - "HLA-DQA1*03:01-DQB1*02:01", - "HLA-DQA1*03:01-DQB1*03:01", - "HLA-DQA1*03:01-DQB1*03:02", - "HLA-DQA1*03:01-DQB1*04:01", - "HLA-DQA1*03:02-DQB1*03:01", - "HLA-DQA1*03:02-DQB1*03:03", - "HLA-DQA1*03:02-DQB1*04:01", - "HLA-DQA1*03:03-DQB1*04:02", - "HLA-DQA1*04:01-DQB1*04:02", - "HLA-DQA1*05:01", - "HLA-DQA1*05:01-DQB1*02:01", - "HLA-DQA1*05:01-DQB1*03:01", - "HLA-DQA1*05:01-DQB1*03:02", - "HLA-DQA1*05:01-DQB1*03:03", - "HLA-DQA1*05:01-DQB1*04:02", - "HLA-DQA1*05:05-DQB1*03:01", - "HLA-DQA1*06:01-DQB1*04:02", - "HLA-DQB1*02:01", - "HLA-DQB1*02:02", - "HLA-DQB1*03:01", - "HLA-DQB1*03:02", - "HLA-DQB1*03:19", - "HLA-DQB1*04:02", - "HLA-DQB1*05:01", - "HLA-DQB1*05:02", - "HLA-DQB1*05:03", - "HLA-DQB1*06:02", - "HLA-DQB1*06:03", - "HLA-DQB1*06:04", - "HLA-DRA0*10:1-DRB1*01:01", - "HLA-DRA0*10:1-DRB1*03:01", - "HLA-DRA0*10:1-DRB1*04:01", - "HLA-DRA0*10:1-DRB1*04:04", - "HLA-DRA0*10:1-DRB1*07:01", - "HLA-DRA0*10:1-DRB1*08:01", - "HLA-DRA0*10:1-DRB1*09:01", - "HLA-DRA0*10:1-DRB1*11:01", - "HLA-DRA0*10:1-DRB1*13:01", - "HLA-DRA0*10:1-DRB1*14:54", - "HLA-DRA0*10:1-DRB1*15:01", - "HLA-DRA0*10:1-DRB3*01:01", - "HLA-DRA0*10:1-DRB3*02:02", - "HLA-DRA0*10:1-DRB3*03:01", - "HLA-DRA0*10:1-DRB4*01:03", - "HLA-DRA0*10:1-DRB5*01:01", - "HLA-DRB1*01:01", - "HLA-DRB1*01:02", - "HLA-DRB1*01:03", - "HLA-DRB1*03:01", - "HLA-DRB1*03:02", - "HLA-DRB1*03:03", - "HLA-DRB1*03:04", - "HLA-DRB1*03:05", - "HLA-DRB1*04:01", - "HLA-DRB1*04:02", - "HLA-DRB1*04:03", - "HLA-DRB1*04:04", - "HLA-DRB1*04:05", - "HLA-DRB1*04:06", - "HLA-DRB1*04:07", - "HLA-DRB1*04:11", - "HLA-DRB1*07:01", - "HLA-DRB1*08:01", - "HLA-DRB1*08:02", - "HLA-DRB1*08:03", - "HLA-DRB1*08:04", - "HLA-DRB1*09:01", - "HLA-DRB1*10:01", - "HLA-DRB1*11:01", - "HLA-DRB1*11:02", - "HLA-DRB1*11:03", - "HLA-DRB1*11:04", - "HLA-DRB1*12:01", - "HLA-DRB1*12:02", - "HLA-DRB1*13:01", - "HLA-DRB1*13:02", - "HLA-DRB1*13:03", - "HLA-DRB1*13:04", - "HLA-DRB1*13:05", - "HLA-DRB1*14:01", - "HLA-DRB1*14:02", - "HLA-DRB1*15:01", - "HLA-DRB1*15:02", - "HLA-DRB1*15:03", - "HLA-DRB1*16:01", - "HLA-DRB1*16:02", - "HLA-DRB3*01:01", - "HLA-DRB3*02:02", - "HLA-DRB3*03:01", - "HLA-DRB4*01:01", - "HLA-DRB4*01:03", - 
"HLA-DRB5*01:01", - "HLA-DRB5*01:02", -] - -flatten = lambda l: [item for sublist in l for item in sublist] - - -def convert_alleles_mhcnuggets_format(alleles): - return [allele.replace("*", "") for allele in alleles] - - -def parse_alleles(allele_input): - alleles = allele_input.split(";") - supp_alleles = convert_alleles_mhcnuggets_format(list(set(alleles).intersection(supported_alleles_class_2))) - - return supp_alleles - - -def main(): - model = argparse.ArgumentParser(description="MHCNuggets binding prediction") - - model.add_argument("-p", "--peptides", type=str, help="mhcnuggets input") - - model.add_argument("-a", "--alleles", type=str, help="class 2 alleles") - - model.add_argument("-o", "--output", type=str, help="mhcnuggets output") - - args = model.parse_args() - - if open(args.peptides).readlines() != []: - supp_alleles = parse_alleles(args.alleles) - - for allele in supp_alleles: - predict( - class_="II", - peptides_path=args.peptides, - mhc=allele, - output=allele + args.output, - ) - - else: - op = open("predicted_neoepitopes_class_2", "w") - op.close() - - -if __name__ == "__main__": - main() diff --git a/bin/postprocess_neoepitopes_mhcnuggets.py b/bin/postprocess_neoepitopes_mhcnuggets.py deleted file mode 100755 index 5e4e0acc..00000000 --- a/bin/postprocess_neoepitopes_mhcnuggets.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. - -import argparse -import sys -import logging -from collections import defaultdict - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Postprocess Neoepitopes MHCNuggets") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def main(): - model = argparse.ArgumentParser(description="Postprocess Neoepitopes predicted by MHCNuggets") - - model.add_argument("-i", "--input", type=str, nargs="*", help="predicted class 2 neoepitopes") - - model.add_argument("-n", "--neoepitopes", type=str, help="neoepitopes file") - - args = model.parse_args() - - neoepitope_to_geneID = defaultdict() - with open(args.neoepitopes, "r") as f: - # skip header - f.readline() - for line in f: - split = line.split(",") - neoepitope_to_geneID[split[0]] = split[1] - - for prediction in args.input: - neoepitope_to_geneID_ic50 = defaultdict() - - with open(prediction, "r") as f: - # skip header - f.readline() - for line in f: - split = line.split(",") - neoepitope = split[0].strip() - geneID = neoepitope_to_geneID[neoepitope].strip() - ic50 = split[1].strip() - neoepitope_to_geneID_ic50[neoepitope] = (geneID, ic50) - - with open(prediction + ".csv", "w") as f: - f.write("peptide,geneID,ic50\n") - for neoepitope, pair in neoepitope_to_geneID_ic50.items(): - f.write(neoepitope + "," + pair[0] + "," + pair[1] + "\n") - - -if __name__ == "__main__": - main() diff --git a/bin/postprocess_peptides_mhcnuggets.py b/bin/postprocess_peptides_mhcnuggets.py deleted file mode 100755 index 68154f9c..00000000 --- a/bin/postprocess_peptides_mhcnuggets.py +++ /dev/null @@ -1,56 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. 
- -import argparse -import sys -import logging -from collections import defaultdict - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Postprocess Peptides MHCNuggets") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def main(): - model = argparse.ArgumentParser(description="Postprocess Neoepitopes predicted by MHCNuggets") - - model.add_argument("-i", "--input", type=str, nargs="*", help="predicted class 2 peptides") - - model.add_argument("-p", "--peptides_seq_ID", type=str, help="peptides to seq_ID csv file") - - model.add_argument("-o", "--output", type=str, help="output file name") - - args = model.parse_args() - - peptide_to_geneID = defaultdict() - with open(args.peptides_seq_ID, "r") as f: - for line in f: - split = line.split(",") - peptide_to_geneID[split[0]] = split[1] - - for prediction in args.input: - peptide_to_geneID_ic50 = defaultdict() - - with open(prediction, "r") as f: - # skip header - f.readline() - for line in f: - split = line.split(",") - peptide = split[0].strip() - geneID = peptide_to_geneID[peptide].strip() - ic50 = split[1].strip() - peptide_to_geneID_ic50[peptide] = (geneID, ic50) - - with open(args.output, "w") as f: - f.write("peptide,geneID,ic50\n") - for peptide, pair in peptide_to_geneID_ic50.items(): - f.write(peptide + "," + pair[0] + "," + pair[1] + "\n") - - -if __name__ == "__main__": - main() diff --git a/bin/preprocess_neoepitopes_mhcnuggets.py b/bin/preprocess_neoepitopes_mhcnuggets.py deleted file mode 100755 index d454d1c2..00000000 --- a/bin/preprocess_neoepitopes_mhcnuggets.py +++ /dev/null @@ -1,57 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. - -import argparse -import logging -import sys - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Preprocess Neoepitopes MHCNuggets") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_neoepitopes(filepath): - with open(filepath, "r") as f: - # parse and remove the header - lines = f.read().split("\n")[1:] - # filter any leftover empty lines - lines = filter(lambda x: len(x) > 0, lines) - lines = [elem.split(",")[0] for elem in lines] - - return lines - - -def write_neoepitopes(neoepitopes, filepath): - with open(filepath, "w") as f: - for neoepitope in neoepitopes: - if neoepitope == neoepitopes[-1]: - f.write(neoepitope) - else: - f.write(neoepitope + "\n") - - -def main(): - model = argparse.ArgumentParser(description="Neoepitope preprocessing for mhcnuggets") - - model.add_argument("-n", "--neoepitopes", type=str, help="neoepitopes input file") - - model.add_argument( - "-o", - "--output", - type=str, - help="preprocess neoepitope file for subsequent mhcnuggets prediction", - ) - - args = model.parse_args() - - neoepitopes = parse_neoepitopes(args.neoepitopes) - write_neoepitopes(neoepitopes, args.output) - - -if __name__ == "__main__": - main() diff --git a/bin/preprocess_peptides_mhcnuggets.py b/bin/preprocess_peptides_mhcnuggets.py deleted file mode 100755 index f67d1b34..00000000 --- a/bin/preprocess_peptides_mhcnuggets.py +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. 
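The two postprocessing scripts (postprocess_neoepitopes_mhcnuggets.py above and postprocess_peptides_mhcnuggets.py here) both join the per-allele MHCnuggets output back to the peptide-to-geneID map. A toy illustration of that join, with fabricated IDs and IC50 strings:

```python
peptide_to_geneID = {"SIINFEKL": "ENSG0001", "GILGFVFTL": "ENSG0002"}
predictions = {"SIINFEKL": "25.0", "GILGFVFTL": "310.5"}  # peptide -> ic50

rows = ["peptide,geneID,ic50"]
for peptide, ic50 in predictions.items():
    rows.append(",".join([peptide, peptide_to_geneID[peptide], ic50]))
print("\n".join(rows))
```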
- -import argparse -import logging -import sys -from collections import defaultdict - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Preprocess Peptides MHCNuggets") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_mztab(identified_peptides_file): - """ - parses an mztab file and returns all identified proteins - :param identified_peptides_file: path to the mztab file - :return: identified proteins - """ - mztab = open(identified_peptides_file) - mztab_read = mztab.readlines() - mztab.close() - - seq_geneIDs = defaultdict(str) - for line in mztab_read: - if line.startswith("PEP"): - content = line.split("\t") - seq = content[1] - geneID = content[2] - if not "U" in seq and not "X" in seq and not "Z" in seq and not "J" in seq and not "B" in seq: - seq_geneIDs[seq] = geneID - - return seq_geneIDs - - -def main(): - model = argparse.ArgumentParser(description="MHCNuggets binding prediction") - - model.add_argument("-m", "--mztab", type=str, help="mztab file") - - model.add_argument("-o", "--output", type=str, help="preprocessed ") - - args = model.parse_args() - - seq_to_geneIDs = parse_mztab(args.mztab) - - # write just peptides new line formatted - with open(args.output, "w") as f: - for protein in seq_to_geneIDs.keys(): - f.write(protein + "\n") - - # write seq to geneID in a suitable format (csv?) - with open("peptide_to_geneID", "w") as f: - for protein, geneID in seq_to_geneIDs.items(): - f.write(protein + "," + geneID + "\n") - - -if __name__ == "__main__": - main() diff --git a/bin/resolve_neoepitopes.py b/bin/resolve_neoepitopes.py deleted file mode 100755 index 378798c1..00000000 --- a/bin/resolve_neoepitopes.py +++ /dev/null @@ -1,177 +0,0 @@ -#!/usr/bin/env python - -# Written by Lukas Heumos and released under MIT license. - -""" -Commandline tool for extracting unique neoepitopes from mztab files and the -and the FRED2 vcf_neoepitope_predictor.py script. -usage: found_neoepitopes.py [-h] - -m MZTAB - -n NEOEPITOPES - [-f FILEFORMAT {raw, csv, json}] - -o OUTPUT -Neoepitope prediction for TargetInsepctor. 
-optional arguments: - -h, --help show this help message and exit - -m, --mztab MZTAB - Path to the mzab file - -n, --neoepitopes NEOEPITOPES - Path to the neoepitopes input file - -f, --file_format {raw, csv, json} - File format to report result in - -o, --output OUTPUT - Path to the output file -""" -import argparse -import csv -import json -import logging -import re -import sys -from itertools import tee -from collections import defaultdict - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("Resolve Neoepitopes") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - - -def parse_mztab(identified_peptides_file): - """ - parses an mztab file and returns all identified proteins - :param identified_peptides_file: path to the mztab file - :return: identified proteins - """ - mztab = open(identified_peptides_file) - mztab_read = mztab.readlines() - mztab.close() - - seq_geneIDs = defaultdict(str) - for line in mztab_read: - if line.startswith("PEP"): - content = line.split("\t") - seq = content[1] - geneID = content[2] - if not "U" in seq and not "X" in seq and not "Z" in seq and not "J" in seq and not "B" in seq: - seq_geneIDs[seq] = geneID - - return seq_geneIDs - - -def parse_vcf_neoepitopes(neoepitope_file, alleles): - """ - parses the output of the VCF neoepitope predictor script - :param neoepitope_file: output file of VCF neoepitope script - :param alleles: all alleles which were used for the VCF neoepitope script - :return: dictionary of alleles to peptide sequences - """ - HLA_allele_to_peptides = defaultdict(list) - - if not alleles: - neoepitopes = [] - with open(neoepitope_file) as tsvfile: - reader = csv.DictReader(tsvfile, dialect="excel-tab") - for row in reader: - neoepitopes.append(row["Sequence"]) - - return neoepitopes - - with open(neoepitope_file) as tsvfile: - reader = csv.DictReader(tsvfile, dialect="excel-tab") - reader, reader_iter_cp = tee(reader) - - # determine whether any alleles were used for the vcf_neoepitope_predictor script - # which are not present in the alleles specified for the MHCFlurry prediction - content = reader_iter_cp.next() - for allele in alleles: - try: - content[allele] - except KeyError: - LOG.warning( - "Allele " - + str(allele) - + " was specified, but not found in the possible neoepitopes predicted from the vcf file." - ) - alleles.remove(allele) - - # allele with highest affinity score wins - for row in reader: - max_val = 0 - max_allele = "" - for allele in alleles: - if row[allele] > max_val: - max_allele = allele - max_val = row[allele] - HLA_allele_to_peptides[max_allele].append(row["Sequence"]) - - return HLA_allele_to_peptides - - -def write_found_neoepitopes(filepath, found_neoepitopes, file_format="csv"): - """ - writes all unique neoepitopes to a specified file - :param filepath: path to write the file to - :param found_neoepitopes: dictionary of alleles to peptide sequences - :param file_format: json or csv or raw - """ - # if only list (no bindings) - if file_format == "pep": - with open(filepath + "." + file_format, "w") as f: - f.write("Peptide sequence" + "\t" + "geneID") - f.write("\n".join(str(seq) + "\t" + str(geneID) for seq, geneID in found_neoepitopes.items())) - elif file_format == "json": - json.dump(found_neoepitopes, open(filepath + "." + file_format, "w")) - elif file_format == "csv": - with open(filepath + "." 
+ file_format, "w") as csv_file: - writer = csv.writer(csv_file) - header = ["Peptide sequence", "geneID"] - writer.writerow(header) - for seq, geneID in found_neoepitopes.items(): - writer.writerow([seq, geneID]) - elif file_format == "raw": - f = open(filepath + "." + file_format, "w") - f.write(str(found_neoepitopes)) - f.close() - else: - LOG.error("Could not write found neoepitopes. Please specify one of the file formats json, csv or raw.") - - -def main(): - model = argparse.ArgumentParser( - description="Neoepitope resolvement from mztab and possible vcf determined neoepitopes." - ) - - model.add_argument("-n", "--neoepitopes", type=str, help="All possible predicted neoepitopes") - - model.add_argument("-m", "--mztab", type=str, help="Path to mztab file") - - model.add_argument( - "-f", - "--file_format", - type=str, - default="csv", - help="File format for output file", - ) - - model.add_argument("-o", "--output", type=str, required=True, help="Output file path") - - args = model.parse_args() - - # parse all identified peptides and possible neoepitopes - predicted_vcf_neoepitopes = parse_vcf_neoepitopes(args.neoepitopes, []) - identified_peptides_to_geneIDs = parse_mztab(args.mztab) - - # build the intersection of all found epitopes and possible neoepitopes - found_neoepitopes = list(set(predicted_vcf_neoepitopes) & set(identified_peptides_to_geneIDs.keys())) - LOG.info(str(len(found_neoepitopes)) + ' Neoepitopes were found. Examine "found_neoepitopes.csv" for details.') - found_neoepitopes_to_geneIDs = {k: v for k, v in identified_peptides_to_geneIDs.items() if k in found_neoepitopes} - - write_found_neoepitopes(args.output, found_neoepitopes_to_geneIDs, args.file_format) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/variants2fasta.py b/bin/variants2fasta.py deleted file mode 100755 index 308795a3..00000000 --- a/bin/variants2fasta.py +++ /dev/null @@ -1,276 +0,0 @@ -#!/usr/bin/env python - -# Written by Leon Bichmann and released under MIT license. - -import time -import sys -import argparse - -from Fred2.Core import Protein, Peptide, Allele, MutationSyntax, Variant -from Fred2.Core.Variant import VariationType -from Fred2.IO import read_lines, MartsAdapter, read_annovar_exonic -from Fred2.EpitopePrediction import EpitopePredictorFactory -from Fred2.Core import ( - generate_peptides_from_proteins, - generate_peptides_from_variants, - generate_transcripts_from_variants, - generate_proteins_from_transcripts, -) -from Fred2.IO.ADBAdapter import EIdentifierTypes, EAdapterFields -from vcf_reader import read_vcf - - -MARTDBURL = { - "GRCH37": "http://grch37.ensembl.org/biomart/martservice?query=", - "GRCH38": "http://www.ensembl.org/biomart/martservice?query=", -} # is corrently set to GRCh38 - - -def read_variant_effect_predictor(file, gene_filter=None): - """ - Reads a VCF (v4.1) file generatede by variant effect predictor and generates variant objects - :param str file: Path to vcf file - :param list gene_filter: List of proteins (in HGNC) of inerrest. 
Variants are filter according to this list - :return: list(Variant) - a list of Fred2.Core.Variant objects - """ - vars = [] - - def get_type(ref, alt): - """ - returns the variant type - """ - if len(ref) == 1 and len(alt) == 1: - return VariationType.SNP - if len(ref) > 0 and len(alt) == 0: - if len(ref) % 3 == 0: - return VariationType.DEL - else: - return VariationType.FSDEL - if len(ref) == 0 and len(alt) > 0: - if len(alt) % 3 == 0: - return VariationType.INS - else: - return VariationType.FSINS - return VariationType.UNKNOWN - - coding_types = set( - [ - "3_prime_UTR_variant", - "5_prime_UTR_variant", - "start_lost", - "stop_gained", - "frameshift_variant", - "start_lost", - "inframe_insertion", - "inframe_deletion", - "missense_variant", - "protein_altering_variant", - "splice_region_variant", - "incomplete_terminal_codon_variant", - "stop_retained_variant", - "synonymous_variant", - "coding_sequence_variant", - ] - ) - - with open(file, "r") as f: - for i, l in enumerate(f): - # skip comments - if l.startswith("#") or l.strip() == "": - continue - - chrom, gene_pos, var_id, ref, alt, _, filter_flag, info = l.strip().split("\t")[:8] - coding = {} - isSynonymous = False - - for co in info.split(","): - # Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE"> - ( - _, - var_type, - _, - gene, - _, - transcript_type, - transcript_id, - _, - _, - _, - _, - _, - transcript_pos, - _, - prot_pos, - aa_mutation, - ) = co.strip().split("|")[:16] - HGNC_ID = co.strip().split("|")[22] - - # pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.) - # pass genes that are uninterresting for us - if transcript_type != "Transcript" or (HGNC_ID not in gene_filter and gene_filter): - continue - - # pass all intronic and other mutations that do not directly influence the protein sequence - if any(t in coding_types for t in var_type.split("&")): - # generate mutation syntax - - # positioning in Fred2 is 0-based!!! - if transcript_pos != "": - coding[transcript_id] = MutationSyntax( - transcript_id, - int(transcript_pos.split("/")[0]) - 1, - -1 if prot_pos == "" else int(prot_pos) - 1, - co, - "", - geneID=HGNC_ID, - ) - - # is variant synonymous? 
- isSynonymous = any(t == "synonymous_variant" for t in var_type.split("&")) - if coding: - vars.append( - Variant( - var_id, - get_type(ref, alt), - chrom, - int(gene_pos), - ref.upper(), - alt.upper(), - coding, - False, - isSynonymous, - ) - ) - return vars - - -def main(): - model = argparse.ArgumentParser(description="Neoepitope protein fasta generation from variant vcf") - - model.add_argument("-v", "--vcf", type=str, default=None, help="Path to the vcf input file") - - model.add_argument( - "-t", - "--type", - type=str, - choices=["VEP", "ANNOVAR", "SNPEFF"], - default="VEP", - help="Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)", - ) - - model.add_argument("-f", "--fasta_ref", type=str, default=None, help="Path to the fasta input file") - - model.add_argument( - "-p", - "--proteins", - type=str, - default=None, - help="Path to the protein ID input file (in HGNC-ID)", - ) - - model.add_argument( - "-r", - "--reference", - type=str, - default="GRCh38", - help="The reference genome used for varinat annotation and calling.", - ) - - model.add_argument( - "-fINDEL", - "--filterINDEL", - action="store_true", - help="Filter insertions and deletions (including frameshifts)", - ) - - model.add_argument("-fFS", "--filterFSINDEL", action="store_true", help="Filter frameshift INDELs") - - model.add_argument("-fSNP", "--filterSNP", action="store_true", help="Filter SNPs") - - model.add_argument("-o", "--output", type=str, required=True, help="Path to the output file") - - args = model.parse_args() - - martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()]) - - if args.vcf is None: - sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n") - return -1 - - # if vcf file is given: generate variants and filter them if HGNC IDs ar given - if args.vcf is not None: - protein_ids = [] - if args.proteins is not None: - with open(args.proteins, "r") as f: - for l in f: - l = l.strip() - if l != "": - protein_ids.append(l) - - if args.type == "VEP": - variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids) - - elif args.type == "SNPEFF": - variants = read_vcf(args.vcf)[0] - - else: - variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids) - - if args.filterSNP: - variants = filter(lambda x: x.type != VariationType.SNP, variants) - - if args.filterINDEL: - variants = filter( - lambda x: x.type - not in [ - VariationType.INS, - VariationType.DEL, - VariationType.FSDEL, - VariationType.FSINS, - ], - variants, - ) - - if args.filterFSINDEL: - variants = filter( - lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], - variants, - ) - - if not variants: - sys.stderr.write("No variants left after filtering. 
Please refine your filtering criteria.\n") - return -1 - - # generate transcripts - transcripts = generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL) - - # generate proteins - proteins = generate_proteins_from_transcripts(transcripts) - - # write fasta file - with open(args.output, "w") as f: - for p in proteins: - f.write(">" + str(p.transcript_id) + "|" + str(p.vars) + "_var_" + "\n") - f.write(str(p) + "\n") - - # concatenate fasta file with fasta reference - with open(args.output) as op: - opr1 = op.read() - - with open(args.fasta_ref) as op: - opr2 = op.read() - - concat = opr1 + opr2 - - with open(args.output, "w") as op: - op.write(concat) - - else: - sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n") - return -1 - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/vcf_neoepitope_predictor.py b/bin/vcf_neoepitope_predictor.py deleted file mode 100755 index 95c08733..00000000 --- a/bin/vcf_neoepitope_predictor.py +++ /dev/null @@ -1,472 +0,0 @@ -#!/usr/bin/env python - -# Written by Leon Bichmann and released under MIT license. - -""" -Commandline tool for (neo)epitope prediction -usage: neoepitopeprediction.py [-h] - [-m {netmhc,smmpmbec,syfpeithi,netmhcpan,netctlpan,smm,tepitopepan,arb,pickpocket,epidemix,netmhcii,netmhciipan,comblibsidney,unitope,hammer,svmhc,bimas}] - [-v VCF] [-t {VEP,ANNOVAR,SNPEFF}] [-p PROTEINS] - [-minl, -maxl {8,9,10,11,12,13,14,15,16,17}] - -a ALLELES - [-r REFERENCE] [-fINDEL] [-fFS] [-fSNP] - -o OUTPUT -Neoepitope prediction for TargetInsepctor. -optional arguments: - -h, --help show this help message and exit - -m, --method {netmhc,smmpmbec,syfpeithi,netmhcpan,netctlpan,smm,tepitopepan,arb,pickpocket,epidemix,netmhcii,netmhciipan,comblibsidney,unitope,hammer,svmhc,bimas}, - The name of the prediction method - -v VCF, --vcf VCF Path to the vcf input file - -t, --type {VEP,ANNOVAR, SNPEFF} - Type of annotation tool used (Variant Effect - Predictor, ANNOVAR exonic gene annotation, SnpEff) - -p, --proteins PROTEINS - Path to the protein ID input file (in HGNC-ID) - -minl, --peptide_min_length {8,9,10,11,12,13,14,15,16,17} - The minimal length of peptides - -maxl, --peptide_max_length {8,9,10,11,12,13,14,15,16,17} - The maximal length of peptides - -a, --alleles ALLELES - Alleles string separated by semicolon - -r, --reference REFERENCE - The reference genome used for varinat annotation and - calling. - -fINDEL, --filterINDEL - Filter insertions and deletions (including - frameshifts) - -fFS, --filterFSINDEL - Filter frameshift INDELs - -fSNP, --filterSNP Filter SNPs - -bind, --predict_bindings - Whether to predict bindings or not - -o OUTPUT, --output OUTPUT - Path to the output file -Neoepitope prediction node Consumes a VCF file containing the identified somatic genomic variants, besides a text -file containing HLA alleles, and generates all possible neo-epitopes based on the annotated variants contained in the -VCF file by extracting the annotated transcript sequences from Ensemble [18] and integrating the variants. -Optionally, it consumes a text file, containing gene IDs of the reference system used for annotation, which are used -as filter during the neoepitope generation. The user can specify whether frameshift mutations, deletions, -and insertions should be considered in addition to single nucleotide variations (default). NeoEpitopePrediction -currently supports ANNOVAR [19] and Variant Effect Predictor [20] annotations for GRCh37 and GRCh38 only. 
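Example invocation (hypothetical file names and alleles, for illustration only;
the flags match the argparse options defined in main() below):

    vcf_neoepitope_predictor.py -m syfpeithi -t VEP -v variants.vep.vcf \
        -a "A*01:01;B*07:02" -minl 8 -maxl 11 -bind -o neoepitopes.csv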
-""" -import sys -import argparse -import logging - -from Fred2.Core import Protein, Allele, MutationSyntax, Variant -from Fred2.Core.Variant import VariationType -from Fred2.IO import read_lines, MartsAdapter, read_annovar_exonic -from Fred2.EpitopePrediction import EpitopePredictorFactory -from Fred2.Core import ( - generate_transcripts_from_variants, - generate_proteins_from_transcripts, - generate_peptides_from_proteins, - generate_peptides_from_variants, -) -from Fred2.IO.ADBAdapter import EIdentifierTypes, EAdapterFields -from vcf_reader import read_vcf - -# logging setup -console = logging.StreamHandler(sys.stdout) -formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s") -console.setFormatter(formatter) -LOG = logging.getLogger("VCF Neoepitope Predictor") -LOG.addHandler(console) -LOG.setLevel(logging.INFO) - -MARTDBURL = { - "GRCH37": "http://grch37.ensembl.org/biomart/martservice?query=", - "GRCH38": "http://www.ensembl.org/biomart/martservice?query=", -} # is correctly set to GRCh38 - - -def read_variant_effect_predictor(file, gene_filter=None): - """ - Reads a VCF (v4.1) file generated by variant effect predictor and generates variant objects - :param str file: Path to vcf file - :param list gene_filter: List of proteins (in HGNC) of inerrest. Variants are filter according to this list - :return: list(Variant) - a list of Fred2.Core.Variant objects - """ - vars = [] - - def get_type(ref, alt): - """ - returns the variant type - """ - if len(ref) == 1 and len(alt) == 1: - return VariationType.SNP - if len(ref) > 0 and len(alt) == 0: - if len(ref) % 3 == 0: - return VariationType.DEL - else: - return VariationType.FSDEL - if len(ref) == 0 and len(alt) > 0: - if len(alt) % 3 == 0: - return VariationType.INS - else: - return VariationType.FSINS - return VariationType.UNKNOWN - - coding_types = { - "3_prime_UTR_variant", - "5_prime_UTR_variant", - "start_lost", - "stop_gained", - "frameshift_variant", - "start_lost", - "inframe_insertion", - "inframe_deletion", - "missense_variant", - "protein_altering_variant", - "splice_region_variant", - "incomplete_terminal_codon_variant", - "stop_retained_variant", - "synonymous_variant", - "coding_sequence_variant", - } - - with open(file, "r") as f: - for i, l in enumerate(f): - # skip comments - if l.startswith("#") or l.strip() == "": - continue - - chrom, gene_pos, var_id, ref, alt, _, filter_flag, info = l.strip().split("\t")[:8] - coding = {} - is_synonymous = False - - for co in info.split(","): - # skip additional info fields without annotation - try: - # Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE"> - ( - _, - var_type, - _, - gene, - _, - transcript_type, - transcript_id, - _, - _, - _, - _, - _, - _, - transcript_pos, - prot_pos, - aa_mutation, - ) = co.strip().split("|")[:16] - except ValueError: - LOG.warning("INFO field in different format in line: {}, skipping...".format(str(i))) - continue - - # pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.) 
- # pass genes that are uninteresting for us - if transcript_type != "Transcript" or (gene not in gene_filter and gene_filter): - continue - - # pass all intronic and other mutations that do not directly influence the protein sequence - if any(t in coding_types for t in var_type.split("&")): - # generate mutation syntax - - # positioning in Fred2 is 0-based!!! - if transcript_pos != "" and "?" not in transcript_pos: - coding[transcript_id] = MutationSyntax( - transcript_id, - int(transcript_pos.split("-")[0]) - 1, - -1 if prot_pos == "" else int(prot_pos.split("-")[0]) - 1, - co, - "", - geneID=gene, - ) - # is variant synonymous? - is_synonymous = any(t == "synonymous_variant" for t in var_type.split("&")) - - if coding: - vars.append( - Variant( - var_id, - get_type(ref, alt), - chrom, - int(gene_pos), - ref.upper(), - alt.upper(), - coding, - False, - is_synonymous, - ) - ) - return vars - - -def main(): - model = argparse.ArgumentParser(description="Neoepitope prediction for TargetInspector.") - - model.add_argument( - "-m", - "--method", - type=str, - choices=EpitopePredictorFactory.available_methods().keys(), - default="bimas", - help="The name of the prediction method", - ) - - model.add_argument("-v", "--vcf", type=str, default=None, help="Path to the vcf input file") - - model.add_argument( - "-t", - "--type", - type=str, - choices=["VEP", "ANNOVAR", "SNPEFF"], - default="VEP", - help="Type of annotation tool used (Variant Effect Predictor, ANNOVAR exonic gene annotation, SnpEff)", - ) - - model.add_argument( - "-p", - "--proteins", - type=str, - default=None, - help="Path to the protein ID input file (in HGNC-ID)", - ) - - model.add_argument( - "-minl", - "--peptide_min_length", - type=int, - default=8, - help="Minimum peptide length for epitope prediction", - ) - - model.add_argument( - "-maxl", - "--peptide_max_length", - type=int, - default=12, - help="Maximum peptide length for epitope prediction", - ) - - model.add_argument( - "-a", - "--alleles", - type=str, - required=True, - help="Path to the allele file (one per line in new nomenclature)", - ) - - model.add_argument( - "-r", - "--reference", - type=str, - default="GRCh38", - help="The reference genome used for variant annotation and calling.", - ) - - model.add_argument( - "-fINDEL", - "--filterINDEL", - action="store_true", - help="Filter insertions and deletions (including frameshifts)", - ) - - model.add_argument("-fFS", "--filterFSINDEL", action="store_true", help="Filter frameshift INDELs") - - model.add_argument("-fSNP", "--filterSNP", action="store_true", help="Filter SNPs") - - model.add_argument("-etk", "--etk", action="store_true", help=argparse.SUPPRESS) - - model.add_argument("-bind", "--predict_bindings", action="store_true", help="Predict bindings") - - model.add_argument("-o", "--output", type=str, required=True, help="Path to the output file") - - args = model.parse_args() - - martDB = MartsAdapter(biomart=MARTDBURL[args.reference.upper()]) - transcript_to_genes = {} - - if args.vcf is None and args.proteins is None: - sys.stderr.write("At least a vcf file or a protein id file has to be provided.\n") - return -1 - - # if vcf file is given: generate variants and filter them if HGNC IDs ar given - if args.vcf is not None: - protein_ids = [] - if args.proteins is not None: - with open(args.proteins, "r") as f: - for l in f: - l = l.strip() - if l != "": - protein_ids.append(l) - if args.type == "VEP": - variants = read_variant_effect_predictor(args.vcf, gene_filter=protein_ids) - elif args.type == "SNPEFF": 
- variants = read_vcf(args.vcf)[0] - else: - variants = read_annovar_exonic(args.vcf, gene_filter=protein_ids) - - variants = filter(lambda x: x.type != VariationType.UNKNOWN, variants) - - if args.filterSNP: - variants = filter(lambda x: x.type != VariationType.SNP, variants) - - if args.filterINDEL: - variants = filter( - lambda x: x.type - not in [ - VariationType.INS, - VariationType.DEL, - VariationType.FSDEL, - VariationType.FSINS, - ], - variants, - ) - - if args.filterFSINDEL: - variants = filter( - lambda x: x.type not in [VariationType.FSDEL, VariationType.FSINS], - variants, - ) - - if not variants: - sys.stderr.write("No variants left after filtering. Please refine your filtering criteria.\n") - return -1 - - epitopes = [] - minlength = args.peptide_min_length - maxlength = args.peptide_max_length - prots = [ - p - for p in generate_proteins_from_transcripts( - generate_transcripts_from_variants(variants, martDB, EIdentifierTypes.ENSEMBL) - ) - ] - for peplen in range(minlength, maxlength + 1): - peptide_gen = generate_peptides_from_proteins(prots, peplen) - - peptides_var = [x for x in peptide_gen] - - # remove peptides which are not 'variant relevant' - peptides = [x for x in peptides_var if any(x.get_variants_by_protein(y) for y in x.proteins.keys())] - epitopes.extend(peptides) - - for v in variants: - for trans_id, coding in v.coding.iteritems(): - if coding.geneID is not None: - transcript_to_genes[trans_id] = coding.geneID - else: - transcript_to_genes[trans_id] = "None" - - # else: generate protein sequences from given HGNC IDs and then epitopes - else: - proteins = [] - with open(args.proteins, "r") as f: - for l in f: - ensembl_ids = martDB.get_ensembl_ids_from_id(l.strip(), type=EIdentifierTypes.HGNC)[0] - protein_seq = martDB.get_product_sequence(ensembl_ids[EAdapterFields.PROTID]) - if protein_seq is not None: - transcript_to_genes[ensembl_ids[EAdapterFields.TRANSID]] = l.strip() - proteins.append( - Protein( - protein_seq, - gene_id=l.strip(), - transcript_id=ensembl_ids[EAdapterFields.TRANSID], - ) - ) - epitopes = [] - for length in range(args.peptide_min_length, args.peptide_max_length): - epitopes.extend(generate_peptides_from_proteins(proteins, length)) - - # read in allele list - alleles = args.alleles - - # predict bindings for all found neoepitopes - if args.predict_bindings: - result = EpitopePredictorFactory(args.method).predict(epitopes, alleles=alleles.split(";")) - - with open(args.output, "w") as f: - alleles = result.columns - var_column = " Variants" if args.vcf is not None else "" - f.write("Sequence\tMethod\t" + "\t".join(a.name for a in alleles) + "\tAntigen ID\t" + var_column + "\n") - for index, row in result.iterrows(): - p = index[0] - method = index[1] - proteins = ",".join( - set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]) - ) - vars_str = "" - - if args.vcf is not None: - vars_str = "\t" + "|".join( - set( - prot_id.split(":FRED2")[0] - + ":" - + ",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id))) - for prot_id in p.proteins.iterkeys() - if p.get_variants_by_protein(prot_id) - ) - ) - - f.write( - str(p) - + "\t" - + method - + "\t" - + "\t".join("%.3f" % row[a] for a in alleles) - + "\t" - + proteins - + vars_str - + "\n" - ) - - if args.etk: - with open(args.output.rsplit(".", 1)[0] + "_etk.tsv", "w") as g: - alleles = result.columns - g.write("Alleles:\t" + "\t".join(a.name for a in alleles) + "\n") - for index, row in result.iterrows(): - p = index[0] - proteins = " 
".join( - set( - [ - transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] - for prot in p.get_all_proteins() - ] - ) - ) - g.write(str(p) + "\t" + "\t".join("%.3f" % row[a] for a in alleles) + "\t" + proteins + "\n") - # don't predict bindings! - # different output format! - else: - with open(args.output, "w") as f: - var_column = " Variants" if args.vcf is not None else "" - f.write("Sequence\tAntigen ID\t" + var_column + "\n") - - for epitope in epitopes: - p = epitope - proteins = ",".join( - set([transcript_to_genes[prot.transcript_id.split(":FRED2")[0]] for prot in p.get_all_proteins()]) - ) - vars_str = "" - - if args.vcf is not None: - vars_str = "\t" + "|".join( - set( - prot_id.split(":FRED2")[0] - + ":" - + ",".join(repr(v) for v in set(p.get_variants_by_protein(prot_id))) - for prot_id in p.proteins.iterkeys() - if p.get_variants_by_protein(prot_id) - ) - ) - - f.write(str(p) + "\t" + proteins + vars_str + "\n") - - with open(args.output.replace(".csv", ".txt"), "w") as f: - for epitope in epitopes: - f.write(str(epitope) + "\n") - - return 0 - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/bin/vcf_reader.py b/bin/vcf_reader.py deleted file mode 100755 index 9ad879fa..00000000 --- a/bin/vcf_reader.py +++ /dev/null @@ -1,281 +0,0 @@ -#!/usr/bin/env python - -# Written by Christopher Mohr / Mathias Walzer and released under MIT license. - -import os -import sys -import logging -import csv -import re -import vcf -import argparse -import urllib2 -import itertools -import pandas as pd -import numpy as np -import Fred2.Core.Generator as generator -import math - -from collections import defaultdict -from Fred2.IO.MartsAdapter import MartsAdapter -from Fred2.Core.Variant import Variant, VariationType, MutationSyntax -from Fred2.EpitopePrediction import EpitopePredictorFactory -from Fred2.IO.ADBAdapter import EIdentifierTypes -from Fred2.IO.UniProtAdapter import UniProtDB -from Fred2.Core.Allele import Allele -from Fred2.Core.Peptide import Peptide -from Fred2.IO import FileReader -from Bio import SeqUtils - -from datetime import datetime -from string import Template - -__author__ = "mohr, walzer" -VERSION = "1.1" - -ID_SYSTEM_USED = EIdentifierTypes.ENSEMBL -transcriptProteinMap = {} -transcriptSwissProtMap = {} - - -REPORT_TEMPLATE = """ -################################################################### - EPAA - EPITOPE PREDICTION REPORT -################################################################### -Persons in Charge: Christopher Mohr, Mathias Walzer -Date: $date -Pipeline Version: 1.1 -Workflow Version: 1.1 -Sample ID: $sample -Alleles -------------- -$alleles -Used Prediction Methods -------------- -$methods -Used Reference -------------- -$reference -Binding Assessment Criteria -------------- -Syfpeithi predictions: prediction score > half max score of corresponding allele -netMHC/netMHCpan predictions: affinity (as IC50 value in nM) <= 500 -Additional Steps -------------- -NO filtering for peptide input. -Filtering of self-peptides (Reviewed (Swiss-Prot) UP000005640 uniprot-all.fasta.gz - 29/02/16, ENSEMBL release 84 Homo_sapiens.GRCh38.pep.all.fa.gz - 27/04/2016) -When personalized protein sequences are provided, peptides will be filtered against those as well. 
-Stats -------------- -Number of Variants: $variants -Number of Peptides: $peptides -Number of Peptides after Filtering: $filter -Number of Predictions: $predictions -Number of Predicted Binders: $binders -Number of Predicted Non-Binders: $nonbinders -Number of Binding Peptides: $uniquebinders -Number of Non-Binding Peptides: $uniquenonbinders -Contacts -------------- -mohr@informatik.uni-tuebingen.de -walzer@informatik.uni-tuebingen.de -University of Tuebingen, Applied Bioinformatics, -Center for Bioinformatics, Quantitative Biology Center, -and Dept. of Computer Science, -Sand 14, 72076 Tuebingen, Germany -""" - - -def get_fred2_annotation(vt, p, r, alt): - if vt == VariationType.SNP: - return p, r, alt - elif vt == VariationType.DEL or vt == VariationType.FSDEL: - # more than one observed ? - if alt != "-": - alternative = "-" - reference = r[len(alt) :] - position = p + len(alt) - else: - return p, r, alt - elif vt == VariationType.INS or vt == VariationType.FSINS: - if r != "-": - position = p - reference = "-" - if alt != "-": - alt_new = alt[len(r) :] - alternative = alt_new - else: - alternative = str(alt) - else: - return p, r, alt - - return position, reference, alternative - - -def read_vcf(filename, pass_only=True): - """ - reads vcf files - returns a list of FRED2 variants - :param filename: /path/to/file - :return: list of FRED2 variants - """ - global ID_SYSTEM_USED - - vl = list() - with open(filename, "rb") as tsvfile: - vcf_reader = vcf.Reader(tsvfile) - vl = [r for r in vcf_reader] - - dict_vars = {} - list_vars = [] - transcript_ids = [] - genotye_dict = {"het": False, "hom": True, "ref": True} - - for num, record in enumerate(vl): - c = record.CHROM.strip("chr") - p = record.POS - 1 - variation_dbid = record.ID - r = str(record.REF) - v_list = record.ALT - f = record.FILTER - if pass_only and f: - continue - - """ - Enum for variation types: - type.SNP, type.DEL, type.INS, type.FSDEL, type.FSINS, type.UNKNOWN - """ - vt = VariationType.UNKNOWN - if record.is_snp: - vt = VariationType.SNP - elif record.is_indel: - if len(v_list) % 3 == 0: # no frameshift - if record.is_deletion: - vt = VariationType.DEL - else: - vt = VariationType.INS - else: # frameshift - if record.is_deletion: - vt = VariationType.FSDEL - else: - vt = VariationType.FSINS - gene = "" - - for alt in v_list: - isHomozygous = False - if "HOM" in record.INFO: - isHomozygous = record.INFO["HOM"] == 1 - elif "SGT" in record.INFO: - zygosity = record.INFO["SGT"].split("->")[1] - if zygosity in genotye_dict: - isHomozygous = genotye_dict[zygosity] - else: - if zygosity[0] == zygosity[1]: - isHomozygous = True - else: - isHomozygous = False - else: - for sample in record.samples: - if "GT" in sample.data: - isHomozygous = sample.data["GT"] == "1/1" - - if record.INFO["ANN"]: - isSynonymous = False - coding = dict() - types = [] - for annraw in record.INFO["ANN"]: # for each ANN only add a new coding! 
see GSvar - annots = annraw.split("|") - ( - obs, - a_mut_type, - impact, - a_gene, - a_gene_id, - feature_type, - transcript_id, - exon, - tot_exon, - trans_coding, - prot_coding, - cdna, - cds, - aa, - distance, - warnings, - ) = annots - types.append(a_mut_type) - - tpos = 0 - ppos = 0 - positions = "" - - # get cds/protein positions and convert mutation syntax to FRED2 format - if trans_coding != "": - positions = re.findall(r"\d+", trans_coding) - ppos = int(positions[0]) - 1 - - if prot_coding != "": - positions = re.findall(r"\d+", prot_coding) - tpos = int(positions[0]) - 1 - - isSynonymous = a_mut_type == "synonymous_variant" - - gene = a_gene_id - # there are no isoforms in biomart - transcript_id = transcript_id.split(".")[0] - - if "NM" in transcript_id: - ID_SYSTEM_USED = EIdentifierTypes.REFSEQ - - # take online coding variants into account, FRED2 cannot deal with stopgain variants right now - if not prot_coding or "stop_gained" in a_mut_type: - continue - - coding[transcript_id] = MutationSyntax(transcript_id, ppos, tpos, trans_coding, prot_coding) - transcript_ids.append(transcript_id) - - if coding: - pos, reference, alternative = get_fred2_annotation(vt, p, r, str(alt)) - var = Variant( - "line" + str(num), - vt, - c, - pos, - reference, - alternative, - coding, - isHomozygous, - isSynonymous, - ) - var.gene = gene - var.log_metadata("vardbid", variation_dbid) - dict_vars[var] = var - list_vars.append(var) - - transToVar = {} - - # fix because of memory/timing issues due to combinatoric explosion - for v in list_vars: - for trans_id in v.coding.iterkeys(): - transToVar.setdefault(trans_id, []).append(v) - - for tId, vs in transToVar.iteritems(): - if len(vs) > 10: - for v in vs: - vs_new = Variant( - v.id, - v.type, - v.chrom, - v.genomePos, - v.ref, - v.obs, - v.coding, - True, - v.isSynonymous, - ) - vs_new.gene = v.gene - for m in ["vardbid"]: - vs_new.log_metadata(m, v.get_metadata(m)) - dict_vars[v] = vs_new - - return dict_vars.values(), transcript_ids diff --git a/conf/modules.config b/conf/modules.config index 44b08b64..59a325fe 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -90,31 +90,6 @@ process { ] } - withName: 'OPENMS_IDFILTER_PSMS' { - ext.prefix = {"${meta.id}_pred_filtered"} - ext.args = "-whitelist:ignore_modifications" - ext.args2 = "-whitelist:peptides" - publishDir = [ - path: {"${params.outdir}/intermediate_results/refined_fdr"}, - mode: params.publish_dir_mode, - pattern: '*.idXML' - ] - } - - withName: 'OPENMS_IDFILTER_REFINED' { - ext.args = [ - "-remove_decoys", - "-precursor:length '${params.peptide_min_length}:${params.peptide_max_length}'", - "-delete_unreferenced_peptide_hits", - (params.fdr_threshold == '0.01') ? "-score:pep 0.05" : "-score:pep " + params.fdr_threshold - ].join(' ').trim() - publishDir = [ - path: {"${params.outdir}/intermediate_results/refined_fdr"}, - mode: params.publish_dir_mode, - pattern: '*.idXML' - ] - } - withName: 'OPENMS_IDRIPPER' { publishDir = [ enabled: false @@ -181,21 +156,6 @@ process { ] } - withName: 'GENERATE_PROTEINS_FROM_VCF' { - ext.args = [ - "-t ${params.variant_annotation_style}", - "-r ${params.variant_reference}", - params.variant_indel_filter ? "-fINDEL" : "", - params.variant_frameshift_filter ? "-fFS" : "", - params.variant_snp_filter ? 
"-fSNP" : "" - ].join(' ').trim() - publishDir = [ - path: {"${params.outdir}"}, - mode: params.publish_dir_mode, - pattern: '*.fasta' - ] - } - withName: 'OPENMS_FILEFILTER' { publishDir = [ enabled: false @@ -224,6 +184,7 @@ process { withName: 'OPENMS_COMETADAPTER' { ext.args = [ "-precursor_mass_tolerance ${params.precursor_mass_tolerance}", + "-precursor_error_units ${params.precursor_error_units}", "-fragment_mass_tolerance ${params.fragment_mass_tolerance}", "-fragment_bin_offset ${params.fragment_bin_offset}", "-instrument ${params.instrument}", @@ -274,7 +235,6 @@ process { "-testFDR 0.05", "-enzyme no_enzyme", "-subset_max_train ${params.subset_max_train}", - "-doc ${params.description_correct_features} ", "-post_processing_tdc", (params.fdr_level != 'psm_level_fdrs') ? "-" + params.fdr_level : "" ].join(' ').trim() @@ -341,177 +301,6 @@ process { } } - - -// Refine on predicted subset -process { - - if (params.refine_fdr_on_predicted_subset && params.predict_class_1) { - withName: 'OPENMS_MZTABEXPORTERPERC' { - ext.prefix = {"${meta.sample}_${meta.condition}_all_ids_merged_psm_perc_filtered"} - publishDir = [ - path: {"${params.outdir}/intermediate_results/refined_fdr"}, - mode: params.publish_dir_mode, - pattern: '*.mzTab' - ] - } - - withName: 'OPENMS_MZTABEXPORTERPSM' { - ext.prefix = {"${meta.sample}_${meta.condition}_all_ids_merged"} - publishDir = [ - path: {"${params.outdir}/intermediate_results/refined_fdr"}, - mode: params.publish_dir_mode, - pattern: '*.mzTab' - ] - } - - withName: 'MHCFLURRY_PREDICTPSMS' { - publishDir = [ - path: {"${params.outdir}/intermediate_results/MHCFLURRY_PREDICTPSMS"}, - mode: params.publish_dir_mode, - pattern: '*.idXML' - ] - } - - withName: 'REFINE_FDR:OPENMS_PERCOLATORADAPTER' { - ext.prefix = {"${meta.id}_perc_subset"} - ext.args = [ - "-seed 4711", - "-trainFDR 0.05", - "-testFDR 0.05", - "-enzyme no_enzyme", - "-subset_max_train ${params.subset_max_train}", - "-doc ${params.description_correct_features} ", - (params.fdr_level != 'psm_level_fdrs') ? 
"-" + params.fdr_level : "" - ].join(' ').trim() - publishDir = [ - path: {"${params.outdir}/intermediate_results/refined_fdr"}, - mode: params.publish_dir_mode, - pattern: '*.idXML' - ] - } - } -} - -// Class I prediction -process { - - if (params.predict_class_1 & !params.skip_quantification) { - withName: 'MHCFLURRY_PREDICTPEPTIDESCLASS1' { - publishDir = [ - path: {"${params.outdir}/class_1_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.fasta' - ] - } - } - - if (params.predict_class_1 & !params.skip_quantification & params.include_proteins_from_vcf) { - withName: 'PREDICT_POSSIBLE_CLASS1_NEOEPITOPES' { - ext.prefix = {"${meta}_vcf_neoepitopes"} - publishDir = [ - path: {"${params.outdir}/class_1_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - - withName: 'RESOLVE_FOUND_CLASS1_NEOEPITOPES' { - ext.prefix = {"${meta.sample}_found_neoepitopes"} - publishDir = [ - path: {"${params.outdir}/class_1_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - - withName: 'MHCFLURRY_PREDICTNEOEPITOPESCLASS1' { - publishDir = [ - path: {"${params.outdir}/class_1_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - } -} - -// Class II prediction -process { - - if (params.predict_class_2 & !params.skip_quantification) { - - withName: 'MHCNUGGETS_PEPTIDESCLASS2PRE' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*_peptides' - ] - } - - withName: 'MHCNUGGETS_PREDICTPEPTIDESCLASS2' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*_class_2' - ] - } - - withName: 'MHCNUGGETS_PEPTIDESCLASS2POST' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - } - - if (params.predict_class_2 & !params.skip_quantification & params.include_proteins_from_vcf) { - - withName: 'PREDICT_POSSIBLE_CLASS2_NEOEPITOPES' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - - withName: 'RESOLVE_FOUND_CLASS2_NEOEPITOPES' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - - withName: 'MHCNUGGETS_NEOEPITOPESCLASS2PRE' { - ext.prefix = {"${meta}_mhcnuggets_preprocessed"} - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*${ext.prefix}' - ] - } - - withName: 'MHCNUGGETS_PREDICTNEOEPITOPESCLASS2' { - ext.prefix = {"${meta}_predicted_neoepitopes_class_2"} - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*${ext.prefix}' - ] - } - - withName: 'MHCNUGGETS_NEOEPITOPESCLASS2POST' { - publishDir = [ - path: {"${params.outdir}/class_2_bindings"}, - mode: params.publish_dir_mode, - pattern: '*.csv' - ] - } - } -} - - process { if (params.annotate_ions) { diff --git a/docs/output.md b/docs/output.md index d2f460de..3018292a 100644 --- a/docs/output.md +++ b/docs/output.md @@ -88,104 +88,8 @@ This folder contains the intermediate results from various steps of the MHCquant - `{Sample}_{Condition}_matching_ions.tsv`: Contains ion annotations and additional metadata of peptides reported after peptide identification. -- `refined_fdr` (Only if `--refine_fdr_on_predicted_subset` is specified) - - - `*merged_psm_perc_filtered.mzTab` : This file export filtered percolator results (by q-value) as mzTab. 
-
-  - `*_all_ids_merged.mzTab` : Exports all of the PSM results as mzTab.
-
-  - `*perc_subset.idXML` : This file is the outcome of a second OpenMS `PercolatorAdapter` run.
-
-  - `*pred_filtered.idXML` : Contains the PSM prediction results, filtered by the shrunken search space (MHCFlurry outcome).
-
-  - `{ID}_-_{filename}_filtered` : An outcome file of `OPENMS_IDFILTER_REFINED`.
-
-
-## VCF
-
-### Reference fasta
-
-Output files
-
-- `*_vcf.fasta`: If `--include_proteins_from_vcf` is specified, then this fasta is created for the respective sample
-
-The fasta database, including mutated proteins, that was used for the database search
-
-### Neoepitopes
-
-These CSV files list all of the theoretically possible neoepitope sequences derived from the variants specified in the vcf, and the neoepitopes that are found during the mass spectrometry search, independent of binding predictions, respectively
-
-#### Found neoepitopes
-
-Output files
-
-- `class_1_bindings/`
-
-  - `*found_neoepitopes_class1.csv`: Generated when `--include_proteins_from_vcf` and `--predict_class_1` are specified
-
-- `class_2_bindings/`
-
-  - `*found_neoepitopes_class2.csv`: Generated when `--include_proteins_from_vcf` and `--predict_class_2` are specified
-
-This CSV lists all neoepitopes that are found during the mass spectrometry search, independent of binding predictions.
-The format is as follows:
-
-```bash
-peptide sequence geneID
-```
-
-#### vcf_neoepitopes
-
-Output files
-
-- `class_1_bindings/`
-
-  - `*vcf_neoepitopes_class1.csv`: Generated when `--include_proteins_from_vcf` and `--predict_class_1` are specified
-
-- `class_2_bindings/`
-
-  - `*vcf_neoepitopes_class2.csv`: Generated when `--include_proteins_from_vcf` and `--predict_class_2` are specified
-
-
-This CSV file contains all theoretically possible neoepitope sequences from the variants that were specified in the vcf.
-The format is shown below:
-
-```bash
-Sequence Antigen ID Variants
-```
-
-## Class prediction
-
-### Class (1|2) bindings
-
-Output files
-
-- `class_1_bindings/`
-
-  - `*predicted_peptides_class_1.csv`: If `--predict_class_1` is specified, then this CSV is generated
-
-- `class_2_bindings/`
-
-  - `*predicted_peptides_class_2.csv`: If `--predict_class_2` is specified, then this CSV is generated
-
-
-This folder contains the binding predictions of all detected class 1 or 2 peptides and of all theoretically possible neoepitope sequences.
-The prediction outputs are comma-separated tables (CSV), one per allele, listing each peptide sequence and its corresponding predicted affinity scores:
-
-```bash
-peptide allele prediction prediction_low prediction_high prediction_percentile
-```
-
 ### MultiQC
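The removed sections above document the neoepitope and per-allele prediction CSV layouts. A short downstream sketch joining the two outputs (file names are hypothetical; the column headers are assumptions based on the format blocks above):

```python
import pandas as pd

# Hypothetical file names; the column layouts follow the removed format
# blocks above ("peptide sequence geneID" and the per-allele prediction
# columns). Treat the exact headers as assumptions.
found = pd.read_csv("found_neoepitopes_class1.csv", header=0, names=["peptide", "geneID"])
predictions = pd.read_csv("A_02_01_predicted_neoepitopes_class_1.csv")

# MHCFlurry reports affinities in nM; lower values indicate stronger binding.
binders = predictions[predictions["prediction"] < 500]

# Annotate the neoepitopes observed by MS with their per-allele predictions.
annotated = found.merge(binders, on="peptide", how="inner")
annotated.to_csv("found_neoepitopes_class1_binders.csv", index=False)
```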
diff --git a/docs/supported_class_1_alleles.md b/docs/supported_class_1_alleles.md deleted file mode 100644 index 467522d0..00000000 --- a/docs/supported_class_1_alleles.md +++ /dev/null @@ -1,82 +0,0 @@ -# Class 1 Alleles supported by MHCFlurry (13.04.2019) - -A*01:01 -A*02:01 -A*02:02 -A*02:03 -A*02:05 -A*02:06 -A*02:07 -A*02:11 -A*02:12 -A*02:16 -A*02:17 -A*02:19 -A*02:50 -A*03:01 -A*11:01 -A*23:01 -A*24:02 -A*24:03 -A*25:01 -A*26:01 -A*26:02 -A*26:03 -A*29:02 -A*30:01 -A*30:02 -A*31:01 -A*32:01 -A*33:01 -A*66:01 -A*68:01 -A*68:02 -A*68:23 -A*69:01 -A*80:01 -B*07:01 -B*07:02 -B*08:01 -B*08:02 -B*08:03 -B*14:02 -B*15:01 -B*15:02 -B*15:03 -B*15:09 -B*15:17 -B*18:01 -B*27:02 -B*27:03 -B*27:04 -B*27:05 -B*27:06 -B*35:01 -B*35:03 -B*37:01 -B*38:01 -B*39:01 -B*39:06 -B*40:01 -B*40:02 -B*42:01 -B*44:02 -B*44:03 -B*45:01 -B*46:01 -B*48:01 -B*51:01 -B*53:01 -B*54:01 -B*57:01 -B*58:01 -B*83:01 -C*03:03 -C*04:01 -C*05:01 -C*06:02 -C*07:02 -C*08:02 -C*12:03 -C*14:02 -C*15:02 diff --git a/docs/supported_class_2_alleles.md b/docs/supported_class_2_alleles.md deleted file mode 100644 index 4972360c..00000000 --- a/docs/supported_class_2_alleles.md +++ /dev/null @@ -1,131 +0,0 @@ -# Class 2 Alleles supported by MHCNuggets (05.07.2019) - -HLA-DPB1*01:01 -HLA-DPB1*02:01 -HLA-DPB1*03:01 -HLA-DPB1*04:01 -HLA-DPB1*04:02 -HLA-DPB1*05:01 -HLA-DPB1*09:01 -HLA-DPB1*11:01 -HLA-DPB1*14:01 -HLA-DPB1*20:01 -HLA-DQA1*01:01 -HLA-DQA1*01:02 -HLA-DQA1*03:01 -HLA-DQA1*05:01 -HLA-DQB1*02:01 -HLA-DQB1*02:02 -HLA-DQB1*03:01 -HLA-DQB1*03:02 -HLA-DQB1*03:19 -HLA-DQB1*04:02 -HLA-DQB1*05:01 -HLA-DQB1*05:02 -HLA-DQB1*05:03 -HLA-DQB1*06:02 -HLA-DQB1*06:03 -HLA-DQB1*06:04 -HLA-DRB1*01:01 -HLA-DRB1*01:02 -HLA-DRB1*01:03 -HLA-DRB1*03:01 -HLA-DRB1*03:02 -HLA-DRB1*03:03 -HLA-DRB1*03:04 -HLA-DRB1*03:05 -HLA-DRB1*04:01 -HLA-DRB1*04:02 -HLA-DRB1*04:03 -HLA-DRB1*04:04 -HLA-DRB1*04:05 -HLA-DRB1*04:06 -HLA-DRB1*04:07 -HLA-DRB1*04:11 -HLA-DRB1*07:01 -HLA-DRB1*08:01 -HLA-DRB1*08:02 -HLA-DRB1*08:03 -HLA-DRB1*08:04 -HLA-DRB1*09:01 -HLA-DRB1*10:01 -HLA-DRB1*11:01 -HLA-DRB1*11:02 -HLA-DRB1*11:03 -HLA-DRB1*11:04 -HLA-DRB1*12:01 -HLA-DRB1*12:02 -HLA-DRB1*13:01 -HLA-DRB1*13:02 -HLA-DRB1*13:03 -HLA-DRB1*13:04 -HLA-DRB1*13:05 -HLA-DRB1*14:01 -HLA-DRB1*14:02 -HLA-DRB1*15:01 -HLA-DRB1*15:02 -HLA-DRB1*15:03 -HLA-DRB1*16:01 -HLA-DRB1*16:02 -HLA-DRB3*01:01 -HLA-DRB3*02:02 -HLA-DRB3*03:01 -HLA-DRB4*01:01 -HLA-DRB4*01:03 -HLA-DRB5*01:01 -HLA-DRB5*01:02 -HLA-DPA1*01:03-DPB1*02:01 -HLA-DPA1*01:03-DPB1*03:01 -HLA-DPA1*01:03-DPB1*04:01 -HLA-DPA1*01:03-DPB1*04:02 -HLA-DPA1*02:01-DPB1*01:01 -HLA-DPA1*02:01-DPB1*05:01 -HLA-DPA1*02:02-DPB1*05:01 -HLA-DPA1*03:01-DPB1*04:02 -HLA-DQA1*01:01-DQB1*05:01 -HLA-DQA1*01:01-DQB1*05:03 -HLA-DQA1*01:02-DQB1*05:01 -HLA-DQA1*01:02-DQB1*05:02 -HLA-DQA1*01:02-DQB1*06:02 -HLA-DQA1*01:02-DQB1*06:04 -HLA-DQA1*01:03-DQB1*03:02 -HLA-DQA1*01:03-DQB1*06:01 -HLA-DQA1*01:03-DQB1*06:03 -HLA-DQA1*01:04-DQB1*05:03 -HLA-DQA1*02:01-DQB1*02:01 -HLA-DQA1*02:01-DQB1*02:02 -HLA-DQA1*02:01-DQB1*03:01 -HLA-DQA1*02:01-DQB1*03:03 -HLA-DQA1*02:01-DQB1*04:02 -HLA-DQA1*03:01-DQB1*02:01 -HLA-DQA1*03:01-DQB1*03:01 -HLA-DQA1*03:01-DQB1*03:02 -HLA-DQA1*03:01-DQB1*04:01 -HLA-DQA1*03:02-DQB1*03:01 -HLA-DQA1*03:02-DQB1*03:03 -HLA-DQA1*03:02-DQB1*04:01 -HLA-DQA1*03:03-DQB1*04:02 -HLA-DQA1*04:01-DQB1*04:02 -HLA-DQA1*05:01-DQB1*02:01 -HLA-DQA1*05:01-DQB1*03:01 -HLA-DQA1*05:01-DQB1*03:02 -HLA-DQA1*05:01-DQB1*03:03 -HLA-DQA1*05:01-DQB1*04:02 -HLA-DQA1*05:05-DQB1*03:01 -HLA-DQA1*06:01-DQB1*04:02 -HLA-DRA0*10:1-DRB1*01:01 -HLA-DRA0*10:1-DRB1*03:01 
-HLA-DRA0*10:1-DRB1*04:01 -HLA-DRA0*10:1-DRB1*04:04 -HLA-DRA0*10:1-DRB1*07:01 -HLA-DRA0*10:1-DRB1*08:01 -HLA-DRA0*10:1-DRB1*09:01 -HLA-DRA0*10:1-DRB1*11:01 -HLA-DRA0*10:1-DRB1*13:01 -HLA-DRA0*10:1-DRB1*14:54 -HLA-DRA0*10:1-DRB1*15:01 -HLA-DRA0*10:1-DRB3*01:01 -HLA-DRA0*10:1-DRB3*02:02 -HLA-DRA0*10:1-DRB3*03:01 -HLA-DRA0*10:1-DRB4*01:03 -HLA-DRA0*10:1-DRB5*01:01 diff --git a/modules/local/generate_proteins_from_vcf.nf b/modules/local/generate_proteins_from_vcf.nf deleted file mode 100644 index d91e5384..00000000 --- a/modules/local/generate_proteins_from_vcf.nf +++ /dev/null @@ -1,37 +0,0 @@ -process GENERATE_PROTEINS_FROM_VCF { - tag "$meta" - label 'process_medium' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(fasta), path(vcf) - - output: - tuple val(meta), path("*.fasta"), emit: vcf_fasta - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${fasta.baseName}_added_vcf" - def args = task.ext.args ?: '' - - """ - variants2fasta.py -v $vcf \\ - -f $fasta \\ - -o ${meta.sample}_${prefix}.fasta \\ - $args - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - END_VERSIONS - """ -} diff --git a/modules/local/mhcflurry_predictneoepitopesclass1.nf b/modules/local/mhcflurry_predictneoepitopesclass1.nf deleted file mode 100644 index 9c84ea9f..00000000 --- a/modules/local/mhcflurry_predictneoepitopesclass1.nf +++ /dev/null @@ -1,32 +0,0 @@ -process MHCFLURRY_PREDICTNEOEPITOPESCLASS1 { - tag "$meta.id" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), val(allotypes), path(neoepitopes) - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.suffix ?: "${neoepitopes}_${meta.id}_predicted_neoepitopes_class_1" - - """ - mhcflurry-downloads --quiet fetch models_class1 - mhcflurry_neoepitope_binding_prediction.py '$allotypes' ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - END_VERSIONS - """ -} diff --git a/modules/local/mhcflurry_predictpeptidesclass1.nf b/modules/local/mhcflurry_predictpeptidesclass1.nf deleted file mode 100644 index d828542a..00000000 --- a/modules/local/mhcflurry_predictpeptidesclass1.nf +++ /dev/null @@ -1,34 +0,0 @@ -process MHCFLURRY_PREDICTPEPTIDESCLASS1 { - tag "$meta.id" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(mztab), val(alleles) - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.suffix ?: "${meta.id}_predicted_peptides_class_1" - - """ - mhcflurry-downloads --quiet fetch models_class1 - mhcflurry_predict_mztab.py '$alleles' $mztab ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - END_VERSIONS - """ -} diff --git a/modules/local/mhcflurry_predictpsms.nf b/modules/local/mhcflurry_predictpsms.nf deleted file mode 100644 index f5e92fc1..00000000 --- a/modules/local/mhcflurry_predictpsms.nf +++ /dev/null @@ -1,35 +0,0 @@ -process MHCFLURRY_PREDICTPSMS { - tag "$meta.id" - label 'process_medium' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(perc_mztab), path(psm_mztab), val(allotypes) - - output: - tuple val(meta), path("*.idXML"), emit: idxml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.suffix ?: "${meta.id}_peptide_filter" - - """ - mhcflurry-downloads --quiet fetch models_class1 - mhcflurry_predict_mztab_for_filtering.py ${params.subset_affinity_threshold} '$allotypes' $perc_mztab $psm_mztab ${prefix}.idXML - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - END_VERSIONS - """ - -} diff --git a/modules/local/mhcnuggets_neoepitopesclass2post.nf b/modules/local/mhcnuggets_neoepitopesclass2post.nf deleted file mode 100644 index 721b6cfe..00000000 --- a/modules/local/mhcnuggets_neoepitopesclass2post.nf +++ /dev/null @@ -1,30 +0,0 @@ -process MHCNUGGETS_NEOEPITOPESCLASS2POST { - tag "$meta.id" - label 'process_low' - - conda "bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mhcnuggets:2.3.2--py_0' : - 'biocontainers/mhcnuggets:2.3.2--py_0' }" - - input: - tuple val(meta), path(neoepitopes), path(predicted) - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - - """ - postprocess_neoepitopes_mhcnuggets.py --input $predicted --neoepitopes $neoepitopes - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/local/mhcnuggets_neoepitopesclass2pre.nf b/modules/local/mhcnuggets_neoepitopesclass2pre.nf deleted file mode 100644 index fd68fec4..00000000 --- a/modules/local/mhcnuggets_neoepitopesclass2pre.nf +++ /dev/null @@ -1,33 +0,0 @@ -process MHCNUGGETS_NEOEPITOPESCLASS2PRE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mhcnuggets:2.3.2--py_0' : - 'biocontainers/mhcnuggets:2.3.2--py_0' }" - - input: - tuple val(meta), path(neoepitopes) - - output: - tuple val(meta), path("*.csv") , emit: preprocessed - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}_mhcnuggets_preprocessed" - - """ - preprocess_neoepitopes_mhcnuggets.py \\ - --neoepitopes $neoepitopes \\ - --output ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/local/mhcnuggets_peptidesclass2post.nf b/modules/local/mhcnuggets_peptidesclass2post.nf deleted file mode 100644 index fd114f36..00000000 --- a/modules/local/mhcnuggets_peptidesclass2post.nf +++ /dev/null @@ -1,33 +0,0 @@ -process MHCNUGGETS_PEPTIDESCLASS2POST { - tag "$meta.id" - label 'process_low' - - conda "bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mhcnuggets:2.3.2--py_0' : - 'biocontainers/mhcnuggets:2.3.2--py_0' }" - - input: - tuple val(meta), path(peptides), path(peptide_to_geneID) - - output: - tuple val(meta), path('*.csv'), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}_postprocessed" - - """ - postprocess_peptides_mhcnuggets.py --input $peptides \\ - --peptides_seq_ID $peptide_to_geneID \\ - --output ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/local/mhcnuggets_peptidesclass2pre.nf b/modules/local/mhcnuggets_peptidesclass2pre.nf deleted file mode 100644 index a3b140aa..00000000 --- a/modules/local/mhcnuggets_peptidesclass2pre.nf +++ /dev/null @@ -1,33 +0,0 @@ -process MHCNUGGETS_PEPTIDESCLASS2PRE { - tag "$meta.id" - label 'process_low' - - conda "bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mhcnuggets:2.3.2--py_0' : - 'biocontainers/mhcnuggets:2.3.2--py_0' }" - - input: - tuple val(meta), path(mztab) - - output: - tuple val(meta), path("*_peptides") , emit: preprocessed - tuple val(meta), path('peptide_to_geneID'), emit: geneID - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}_preprocessed_mhcnuggets_peptides" - - """ - preprocess_peptides_mhcnuggets.py --mztab $mztab \\ - --output ${prefix} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/local/mhcnuggets_predictneoepitopesclass2.nf b/modules/local/mhcnuggets_predictneoepitopesclass2.nf deleted file mode 100644 index d2c35eef..00000000 --- a/modules/local/mhcnuggets_predictneoepitopesclass2.nf +++ /dev/null @@ -1,33 +0,0 @@ -process MHCNUGGETS_PREDICTNEOEPITOPESCLASS2 { - tag "$meta.id" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(neoepitopes), val(alleles) - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}_predicted_neoepitopes_class_2" - - """ - mhcnuggets_predict_peptides.py --peptides $neoepitopes \\ - --alleles '$alleles' \\ - --output ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - END_VERSIONS - """ -} diff --git a/modules/local/mhcnuggets_predictpeptidesclass2.nf b/modules/local/mhcnuggets_predictpeptidesclass2.nf deleted file mode 100644 index d275e7c2..00000000 --- a/modules/local/mhcnuggets_predictpeptidesclass2.nf +++ /dev/null @@ -1,35 +0,0 @@ -process MHCNUGGETS_PREDICTPEPTIDESCLASS2 { - tag "$meta.id" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(peptides), val(alleles) - - output: - tuple val(meta), path("*_class_2"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta.id}_predicted_peptides_class_2" - - """ - mhcnuggets_predict_peptides.py --peptides $peptides \\ - --alleles '$alleles' \\ - --output ${prefix} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - END_VERSIONS - """ -} diff --git a/modules/local/openms_percolatoradapter.nf b/modules/local/openms_percolatoradapter.nf index a155dccc..c19718df 100644 --- a/modules/local/openms_percolatoradapter.nf +++ b/modules/local/openms_percolatoradapter.nf @@ -20,13 +20,11 @@ process OPENMS_PERCOLATORADAPTER { script: def prefix = task.ext.prefix ?: "${meta.id}_pout" def args = task.ext.args ?: '' - def klammer = (params.description_correct_features > 0 && params.klammer) ? "-klammer" : "" """ OMP_NUM_THREADS=$task.cpus \\ PercolatorAdapter -in $merged_with_features \\ -out ${prefix}.idXML \\ - $klammer \\ $args cat <<-END_VERSIONS > versions.yml diff --git a/modules/local/predict_possible_neoepitopes.nf b/modules/local/predict_possible_neoepitopes.nf deleted file mode 100644 index c469b469..00000000 --- a/modules/local/predict_possible_neoepitopes.nf +++ /dev/null @@ -1,41 +0,0 @@ -process PREDICT_POSSIBLE_NEOEPITOPES { - tag "$meta" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), val(alleles), path(vcf) - - output: - tuple val(meta), path("*.csv"), emit: csv - tuple val(meta), path("*.txt"), emit: txt - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta}_vcf_neoepitopes" - - """ - vcf_neoepitope_predictor.py \\ - -t ${params.variant_annotation_style} \\ - -r ${params.variant_reference} \\ - -a '$alleles' \\ - -minl ${params.peptide_min_length} \\ - -maxl ${params.peptide_max_length} \\ - -v $vcf \\ - -o ${prefix}.csv - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//' )) - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - END_VERSIONS - """ -} diff --git a/modules/local/resolve_found_neoepitopes.nf b/modules/local/resolve_found_neoepitopes.nf deleted file mode 100644 index 4dce9107..00000000 --- a/modules/local/resolve_found_neoepitopes.nf +++ /dev/null @@ -1,36 +0,0 @@ -process RESOLVE_FOUND_NEOEPITOPES { - tag "$meta" - label 'process_low' - - conda "bioconda::fred2=2.0.7 bioconda::mhcflurry=1.4.3 bioconda::mhcnuggets=2.3.2" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' : - 'biocontainers/mulled-v2-c3f301504f7fa2e7bf81c3783de19a9990ea3001:12b1b9f040fd92a80629d58f8a558dde4820eb15-0' }" - - input: - tuple val(meta), path(mztab), path(neoepitopes) - - output: - tuple val(meta), path("*.csv"), emit: csv - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${meta}_found_neoepitopes" - - """ - resolve_neoepitopes.py -n $neoepitopes \\ - -m $mztab \\ - -f csv \\ - -o ${prefix} - - cat <<-END_VERSIONS > versions.yml - ${task.process}: - mhcflurry: \$(echo \$(mhcflurry-predict --version 2>&1 | sed 's/^mhcflurry //; s/ .*\$//') ) - mhcnuggets: \$(echo \$(python -c "import pkg_resources; print('mhcnuggets' + pkg_resources.get_distribution('mhcnuggets').version)" | sed 's/^mhcnuggets//; s/ .*\$//')) - fred2: \$(echo \$(python -c "import pkg_resources; print('fred2' + pkg_resources.get_distribution('Fred2').version)" | sed 's/^fred2//; s/ .*\$//')) - END_VERSIONS - """ -} diff --git a/modules/local/thermorawfileparser.nf b/modules/local/thermorawfileparser.nf deleted file mode 100644 index 05ae988a..00000000 --- a/modules/local/thermorawfileparser.nf +++ /dev/null @@ -1,36 +0,0 @@ -process THERMORAWFILEPARSER { - tag "$meta.id" - label 'process_low' - - conda "bioconda::thermorawfileparser=1.4.3" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
- 'https://depot.galaxyproject.org/singularity/thermorawfileparser:1.4.3--ha8f3691_0' : - 'biocontainers/thermorawfileparser:1.4.3--ha8f3691_0' }" - - input: - tuple val(meta), path(rawfile) - - output: - tuple val(meta), path("*.mzML"), emit: mzml - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def prefix = task.ext.prefix ?: "${rawfile.baseName}" - - // The ThermoRawFileParser expects a input file which is transcribed to an indexed mzML (defined by '-f 2') - """ - ThermoRawFileParser.sh \\ - -i $rawfile \\ - -f 2 \\ - -o . - - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - thermorawfileparser: \$(ThermoRawFileParser.sh --version) - END_VERSIONS - """ -} diff --git a/nextflow.config b/nextflow.config index 48a28039..12736cd5 100644 --- a/nextflow.config +++ b/nextflow.config @@ -11,63 +11,40 @@ params { // Input options input = null outdir = null - fasta = "data/*.fasta" + fasta = null // Workflow options - allele_sheet = null - include_proteins_from_vcf = false - predict_class_1 = false - predict_class_2 = false - refine_fdr_on_predicted_subset = false skip_decoy_generation = false - subset_affinity_threshold = 500 - variant_annotation_style = "SNPEFF" - variant_frameshift_filter = false - variant_indel_filter = false - variant_reference = "GRCH38" - variant_snp_filter = false - activation_method = 'ALL' - description_correct_features = 0 - digest_mass_range = "800:2500" + run_centroidisation = false + filter_mzml = false + skip_quantification = true + annotate_ions = false + + // Comet search parameters + default_params_file_comet = ' ' + instrument = 'high_res' enzyme = 'unspecific cleavage' - fdr_threshold = 0.01 - fdr_level = 'peptide_level_fdrs' - fixed_mods = ' ' + activation_method = 'ALL' + digest_mass_range = '800:2500' + prec_charge = '2:3' + precursor_mass_tolerance = 5 + precursor_error_units = 'ppm' fragment_bin_offset = 0.0 fragment_mass_tolerance = 0.01 - instrument = 'high_res' - default_params_file_comet = ' ' - klammer = false - max_rt_alignment_shift = 300 number_mods = 3 + fixed_mods = ' ' + variable_mods = 'Oxidation (M)' num_hits = 1 - peptide_min_length = 8 - peptide_max_length = 12 - pick_ms_levels = 2 - prec_charge = '2:3' - precursor_mass_tolerance = 5 - quantification_fdr = null - quantification_min_prob = 0 - quantification_mz_window = 5 - quantification_rt_window = 0 - quantification_peak_width = 60 - quantification_min_peak_width = 0.2 - quantification_mapping_tolerance= 0 - refine_fdr_on_predicted_subset = false - remove_precursor_peak = false - run_centroidisation = false - skip_quantification = false - spectrum_batch_size = 0 - subset_max_train = 0 use_x_ions = false use_z_ions = false use_a_ions = false use_c_ions = false use_NL_ions = false - variable_mods = 'Oxidation (M)' - vcf_sheet = null - annotate_ions = false - filter_mzml = false + remove_precursor_peak = false + spectrum_batch_size = 0 + + // Preprocessing settings + pick_ms_levels = 2 // MS2Rescore settings rescoring_engine = 'percolator' @@ -75,6 +52,25 @@ params { ms2pip_model = 'Immuno-HCD' deeplc_calibration_set_size = 0.15 + // Percolator settings + fdr_threshold = 0.01 + fdr_level = 'peptide_level_fdrs' + subset_max_train = 0 + + // IDfilter settings + peptide_min_length = 8 + peptide_max_length = 12 + + // Quantification and alignment settings + max_rt_alignment_shift = 300 + quantification_fdr = null + quantification_min_prob = 0 + quantification_mz_window = 5 + quantification_rt_window = 0 + quantification_peak_width 
= 60 + quantification_min_peak_width = 0.2 + quantification_mapping_tolerance = 0 + // MultiQC options multiqc_config = null multiqc_title = null @@ -289,7 +285,7 @@ dag { manifest { name = 'nf-core/mhcquant' - author = """Leon Bichmann, Marissa Dubbelaar, Jonas Scheid, Steffen Lemke""" + author = """Jonas Scheid, Steffen Lemke, Leon Bichmann, Marissa Dubbelaar""" homePage = 'https://github.com/nf-core/mhcquant' description = """Identify and quantify peptides from mass spectrometry raw data""" mainScript = 'main.nf' diff --git a/nextflow_schema.json b/nextflow_schema.json index c92162df..c81e02b3 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -20,7 +20,8 @@ "exists": true, "schema": "assets/schema_input.json", "mimetype": "text/csv", - "pattern": "^\\S+\\.tsv$" + "pattern": "^\\S+\\.tsv$", + "fa_icon": "fas fa-file" }, "outdir": { "type": "string", @@ -46,89 +47,135 @@ "title": "Database Options", "type": "object", "fa_icon": "fas fa-database", - "description": "", - "default": "", + "description": "Decoy Database generation settings", + "required": ["fasta"], "properties": { "fasta": { "type": "string", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-file", "pattern": ".fasta$", "description": "Input FASTA protein database", - "help_text": "If you have no genome reference available, the pipeline can build one using a FASTA file. This requires additional time and resources, so it's better to use a pre-build index if possible." - }, - "include_proteins_from_vcf": { - "type": "boolean", - "fa_icon": "fas fa-file-code", - "description": "Set depending on whether variants should be translated to proteins and included into your fasta for database search." + "help_text": "Path to the protein database file" }, "skip_decoy_generation": { "type": "boolean", "fa_icon": "fas fa-fast-forward", - "description": "Add this parameter when you want to skip the generation of the decoy database, the consequence is that it prevents the generation of variants and FDR refinement", - "help_text": "If you want to use your own decoys, you can specify a dataset that includes decoy sequences. However, each database entry should keep the prefix 'DECOY_'.\nOne should consider though that this option will then prevent appending variants to the database and if not using reversed decoys the subset refinement FDR option will not work." + "description": "Add this parameter when you want to skip the generation of the decoy database.", + "help_text": "If you want to use your own decoys, you can specify a dataset that includes decoy sequences. However, each database entry should keep the prefix 'DECOY_'." } } }, "preprocessing": { - "title": "Preprocessing", + "title": "Spectrum preprocessing", "type": "object", - "fa_icon": "fas fa-microchip", - "description": "", + "fa_icon": "fas fa-arrows-to-circle", + "description": "Define pre-search settings", "default": "", "properties": { + "run_centroidisation": { + "type": "boolean", + "fa_icon": "fas fa-border-center-v", + "default": false, + "description": "Include the flag when the specified ms level is not centroided (default=false). " + }, "pick_ms_levels": { "type": "integer", "fa_icon": "fas fa-layer-group", "default": 2, "description": "Specify the MS levels for which the peak picking is applied (unless you use `--run_centroidisation`)." 
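The two preprocessing options above cover profile-mode input: when `--run_centroidisation` is set, the configured MS levels are centroided before the search. A rough pyOpenMS sketch of that step (the `PeakPickerHiRes` usage and parameter names are taken from pyOpenMS as I understand it; treat the exact API details as assumptions):

```python
import pyopenms as oms

exp = oms.MSExperiment()
oms.MzMLFile().load("sample.mzML", exp)  # hypothetical profile-mode input

picker = oms.PeakPickerHiRes()
params = picker.getParameters()
params.setValue("ms_levels", [2])  # mirrors --pick_ms_levels 2
picker.setParameters(params)

picked = oms.MSExperiment()
picker.pickExperiment(exp, picked)  # centroid only the selected MS levels
oms.MzMLFile().store("sample_centroided.mzML", picked)
```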
}, - "run_centroidisation": { + "filter_mzml": { "type": "boolean", - "fa_icon": "fas fa-border-center-v", "default": false, - "description": "Include the flag when the specified ms level is not centroided (default=false). " + "fa_icon": "fas fa-filter", + "description": "Clean up spectrum files and remove artificial charge 0 peptides." } } }, - "mass_spectrometry_data_processing": { - "title": "Mass Spectrometry Data Processing", + "search_settings": { + "title": "Database Search Settings", "type": "object", - "fa_icon": "fas fa-waveform-lines", + "fa_icon": "fas fa-magnifying-glass", "description": "", "default": "", "properties": { - "peptide_min_length": { - "type": "integer", - "fa_icon": "fas fa-dash", - "default": 8, - "description": "Specify the minimum length of peptides to be considered after processing" + "instrument": { + "type": "string", + "default": "high_res", + "enum": ["high_res", "low_res"], + "hidden": true, + "fa_icon": "fas fa-wrench", + "description": "Comets theoretical_fragment_ions parameter: theoretical fragment ion peak representation, high_res: sum of intensities plus flanking bins, ion trap (low_res) ms/ms: sum of intensities of central M bin only" }, - "peptide_max_length": { - "type": "integer", - "fa_icon": "fas fa-plus", - "default": 12, - "description": "Specify the maximum length of peptides to be considered after processing" + "enzyme": { + "type": "string", + "fa_icon": "fas fa-scissors", + "default": "unspecific cleavage", + "enum": [ + "unspecific cleavage", + "no cleavage", + "Arg-C/P", + "Asp-N", + "Lys-C", + "Lys-N", + "Chymotrypsin", + "CNBr", + "Trypsin", + "Arg-C", + "PepsinA", + "Trypsin/P", + "glutamyl endopeptidase" + ], + "description": "Specify which enzymatic restriction should be applied", + "hidden": true, + "help_text": "For HLA peptides rarely other enzymes are used, however most enzymes such as for example 'Trypsin' are available." }, - "fragment_mass_tolerance": { - "type": "number", - "default": 0.01, - "fa_icon": "fas fa-indent", - "description": "Specify the fragment mass tolerance to be used for the comet database search.", - "help_text": "For High-Resolution instruments a fragment mass tolerance value of 0.02 is recommended. (See the Comet parameter documentation: eg. 0.02)" + "activation_method": { + "type": "string", + "fa_icon": "fas fa-star-shooting", + "default": "ALL", + "enum": ["ALL", "CID", "ECD", "ETD", "PQD", "HCD", "IRMPD"], + "description": "Specify which fragmentation method was used in the MS acquisition", + "help_text": "If not specified, `ALL` tries to infer the fragmentation method based on the spectrum file" + }, + "digest_mass_range": { + "type": "string", + "fa_icon": "fas fa-line-height", + "description": "Specify the mass range in Dalton that peptides should fulfill to be considered for peptide spectrum matching." + }, + "prec_charge": { + "type": "string", + "fa_icon": "fas fa-bolt", + "description": "Specify the precursor charge range that peptides should fulfill to be considered for peptide spectrum matching." }, "precursor_mass_tolerance": { "type": "integer", - "fa_icon": "fas fa-indent", + "fa_icon": "fas fa-think-peaks", "default": 5, - "description": "Specify the precursor mass tolerance to be used for the comet database search.", - "help_text": " For High-Resolution instruments a precursor mass tolerance value of 5ppm is recommended. (eg. 
5)" + "description": "Specify the precursor mass tolerance to be used for the Comet database search.", + "help_text": "For high-resolution instruments a precursor mass tolerance value of 5ppm is recommended. (eg. 5)" + }, + "precursor_error_units": { + "type": "string", + "fa_icon": "fas fa-think-peaks", + "default": "ppm", + "enum": ["ppm", "Da", "amu"], + "description": "Specify the unit of the precursor mass tolerance to be used for the Comet database search." }, "fragment_bin_offset": { "type": "number", - "fa_icon": "fas fa-indent", + "fa_icon": "fas fa-pipe", "default": 0.0, + "hidden": true, "description": "Specify the fragment bin offset to be used for the comet database search.", - "help_text": "For High-Resolution instruments a fragment bin offset of 0 is recommended. (See the Comet parameter documentation: eg. 0)" + "help_text": "For high-resolution instruments a fragment bin offset of 0 is recommended. (See the Comet parameter documentation: https://uwpr.github.io/Comet/parameters/parameters_202401/).\n This parameter needs to be combined with `fragment_bin_tol` parameter" + }, + "fragment_mass_tolerance": { + "type": "number", + "default": 0.01, + "fa_icon": "fas fa-pipe", + "description": "Specify the fragment mass tolerance to be used for the comet database search.", + "help_text": "For high-resolution instruments a fragment mass tolerance value of 0.02 is recommended\n(See the Comet parameter documentation: https://uwpr.github.io/Comet/parameters/parameters_202401/).\nThe OpenCometAdapter mulitplies this parameter with 2 to align with other search engines." }, "number_mods": { "type": "integer", @@ -136,199 +183,136 @@ "default": 3, "description": "Specify the maximum number of modifications that should be contained in a peptide sequence match." }, - "num_hits": { - "type": "integer", - "fa_icon": "fas fa-hashtag", - "default": 1, - "description": "Specify the number of hits that should be reported for each spectrum." - }, - "digest_mass_range": { - "type": "string", - "fa_icon": "fas fa-chard-line", - "default": "800:2500", - "description": "Specify the mass range that peptides should fulfill to be considered for peptide spectrum matching." - }, - "prec_charge": { - "type": "string", - "fa_icon": "fas fa-input-numeric", - "default": "2:3", - "description": "Specify the precursor charge range that peptides should fulfill to be considered for peptide spectrum matching." - }, - "activation_method": { - "type": "string", - "fa_icon": "fas fa-file-code", - "default": "ALL", - "description": "Specify which fragmentation method was used in the MS acquisition", - "enum": ["ALL", "CID", "ECD", "ETD", "PQD", "HCD", "IRMPD"] - }, - "enzyme": { - "type": "string", - "fa_icon": "fas fa-dna", - "default": "unspecific cleavage", - "description": "Specify which enzymatic restriction should be applied", - "hidden": true, - "help_text": " for HLA peptides rarely other enzymes are used, however most enzymes such as for example 'Trypsin' are available (see OpenMS enzymes)" - }, - "max_rt_alignment_shift": { - "type": "integer", - "fa_icon": "fas fa-slider", - "default": 300, - "description": "Set a maximum retention time shift for the linear rt alignment" - }, "fixed_mods": { "type": "string", "fa_icon": "fas fa-cubes-stacked", "description": "Specify which fixed modifications should be applied to the database search", - "help_text": "e.g. 
'Carbamidomethyl (C)' (see OpenMS modifications; for a list of options, see parameter description on https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/nightly/html/TOPP_CometAdapter.html)" + "help_text": "e.g. 'Carbamidomethyl (C)' (see OpenMS modifications; for a list of options, see parameter description on https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/nightly/html/TOPP_CometAdapter.html).\nMultiple modifications can be specified by separating commas:`Oxidation (M),Carbamidomethyl (C)`" }, "variable_mods": { "type": "string", "fa_icon": "fas fa-cubes-stacked", "default": "Oxidation (M)", "description": "Specify which variable modifications should be applied to the database search", - "help_text": "e.g. 'Oxidation (M)' (see OpenMS modifications; for a list of options, see parameter description on https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/nightly/html/TOPP_CometAdapter.html)" + "help_text": "e.g. 'Oxidation (M)' (see OpenMS modifications; for a list of options, see parameter description on https://abibuilder.informatik.uni-tuebingen.de/archive/openms/Documentation/nightly/html/TOPP_CometAdapter.html).\nMultiple modifications can be specified by separating commas:`Oxidation (M),Carbamidomethyl (C)`" + }, + "num_hits": { + "type": "integer", + "fa_icon": "fas fa-hashtag", + "default": 1, + "description": "Specify the number of hits that should be reported for each spectrum." }, "use_x_ions": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wave-sine", "description": "Include x ions into the peptide spectrum matching" }, "use_z_ions": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wave-sine", "description": "Include z ions into the peptide spectrum matching" }, "use_a_ions": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wave-sine", "description": "Include a ions into the peptide spectrum matching" }, "use_c_ions": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wave-sine", "description": "Include c ions into the peptide spectrum matching" }, "use_NL_ions": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wave-sine", "description": "Include NL ions into the peptide spectrum matching" }, "remove_precursor_peak": { "type": "boolean", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-pipe", "description": "Include if you want to remove all peaks around precursor m/z", "default": false }, "spectrum_batch_size": { "type": "integer", - "fa_icon": "fas fa-wave-sine", + "fa_icon": "fas fa-truck-fast", "default": 0, + "hidden": true, "description": "Size of Spectrum batch for Comet processing (Decrease/Increase depending on Memory Availability)" }, - "vcf_sheet": { - "type": "string", - "fa_icon": "fas fa-file-code", - "pattern": "^\\S+\\.tsv$", - "description": "Specify a .tsv file containing the information about genomic variants (vcf files < v.4.2) for each sample.", - "help_text": "| Sample | VCF_FileName |\n| -------------| :---------------------:|\n| MM15_Melanom | data/MM15_variants.vcf |\n| MM17_Melanom | data/MM17_variants.vcf |" - }, - "annotate_ions": { - "type": "boolean", - "default": "false", - "fa_icon": "fas fa-tags", - "description": "Set this option to create documents that are created to perform the ion annotation" - }, - "instrument": { - "type": "string", - "default": "high_res", - "fa_icon": "fas fa-wrench", - "description": "Comets 
theoretical_fragment_ions parameter: theoretical fragment ion peak representation, high-res: sum of intensities plus flanking bins, ion trap (low-res) ms/ms: sum of intensities of central M bin only" - }, "default_params_file_comet": { "type": "string", "fa_icon": "fas fa-file-code", - "description": "Default Comet params file. All parameters of this take precedence." - }, - "filter_mzml": { - "type": "boolean", - "fa_icon": "fas fa-file-code", - "description": "Clean up mzml files and remove artificial charge 0 peptides." + "hidden": true, + "description": "Specify custom Comet params file. All parameters of this take precedence." } } }, - "rescoring": { - "title": "Rescoring", + "rescoring_settings": { + "title": "Rescoring settings", "type": "object", "fa_icon": "fas fa-star-half-stroke", "description": "", "default": "", "properties": { - "fdr_level": { - "type": "string", - "fa_icon": "fas fa-rectangle-code", - "default": "peptide_level_fdrs", - "description": "Specify the level at which the false discovery rate should be computed.", - "enum": ["peptide_level_fdrs", "psm_level_fdrs", "protein_level_fdrs"] - }, - "fdr_threshold": { - "type": "number", - "fa_icon": "fas fa-less-than", - "default": 0.01, - "description": "Specify the false discovery rate threshold at which peptide hits should be selected." - }, "rescoring_engine": { "type": "string", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-wrench", "default": "percolator", "description": "Specify the rescoring engine that should be used for rescoring. Either percolator or mokapot", "enum": ["percolator", "mokapot"] }, "feature_generators": { "type": "string", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-head-side-gear", "default": "deeplc,ms2pip", "description": "Specify the feature generator that should be used for rescoring. One or multiple of basic,ms2pip,deeplc,ionmob" }, "ms2pip_model": { "type": "string", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-head-side-gear", "default": "Immuno-HCD", + "enum": [ + "Immuno-HCD", + "timsTOF", + "CID", + "CIDch2", + "CID-TMT", + "TMT", + "HCD", + "HCDch2", + "TTOF5600", + "iTRAQ", + "iTRAQphospho" + ], "description": "Specify the ms2pip model that should be used for rescoring. Checkout the ms2pip documentation for available models." }, "deeplc_calibration_set_size": { "type": "number", - "fa_icon": "fas fa-file-code", + "fa_icon": "fas fa-head-side-gear", + "hidden": true, "default": 0.15, "description": "Specify the number or percentage of PSMs that should be used for calibration of the deeplc model." }, - "refine_fdr_on_predicted_subset": { - "type": "boolean", - "fa_icon": "fas fa-arrows-repeat", - "help_text": "SubsetFDR makes use of binding predictions applying the tool mhcflurry to subset all PSMs not passing the q-value threshold. If specified the FDR will be refined using Percolator on the subset of predicted binders among all PSMs resulting in an increased identification rate. 
(Please be aware that this option is only available for MHC class I data of alleles that are supported by mhcflurry)", - "description": "Set if MHCquant should be run in SubsetFDR mode" - }, - "subset_affinity_threshold": { - "type": "integer", - "fa_icon": "fas fa-pen", - "default": 500, - "description": "Affinity threshold (nM) used to define binders for PSM subset selection in the FDR refinement procedure" - }, - "description_correct_features": { - "type": "integer", - "fa_icon": "fas fa-pen", - "description": "Specify percolator descriptor feature set", - "help_text": "See percolator description (https://github.com/percolator/percolator/wiki/Retention-time-and-calibration)" + "fdr_level": { + "type": "string", + "fa_icon": "fas fa-rectangle-code", + "default": "peptide_level_fdrs", + "description": "Specify the level at which the false discovery rate should be computed.", + "enum": ["peptide_level_fdrs", "psm_level_fdrs", "protein_level_fdrs"] }, - "klammer": { - "type": "boolean", - "fa_icon": "fas fa-microchip", - "description": "Use klammer retention time features for Percolator rescoring", - "help_text": "https://pubs.acs.org/doi/10.1021/ac070262k" + "fdr_threshold": { + "type": "number", + "fa_icon": "fas fa-less-than", + "default": 0.01, + "description": "Specify the false discovery rate threshold at which peptide hits should be selected." }, "subset_max_train": { "type": "integer", + "hidden": true, + "default": 0, "fa_icon": "fas fa-train-track", - "description": "Maximum subset for percolator training iterations" + "description": "Maximum subset for Percolator training iterations" } } }, @@ -341,104 +325,86 @@ "properties": { "skip_quantification": { "type": "boolean", + "default": true, "fa_icon": "fas fa-fast-forward", "description": "Skip quantification and only yield peptide identifications" }, + "max_rt_alignment_shift": { + "type": "integer", + "fa_icon": "fas fa-align-center", + "default": 300, + "description": "Set a maximum retention time shift for the linear RT alignment" + }, "quantification_fdr": { "type": "boolean", + "hidden": true, "fa_icon": "fas fa-less-than", "description": "Compute FDR for the targeted approach", "help_text": "(Weisser H. and Choudhary J.S. J Proteome Res. 
2017 Aug 4)" }, "quantification_min_prob": { "type": "number", + "hidden": true, + "default": 0, "description": "Specify a cut off probability value for quantification events as a filter" }, "quantification_mz_window": { "type": "number", + "hidden": true, + "default": 5, "description": "Specify a m/z window for matching between runs" }, "quantification_rt_window": { "type": "number", + "hidden": true, + "default": 0, "description": "Specify a rt window for matching between runs" }, "quantification_mapping_tolerance": { "type": "number", + "hidden": true, + "default": 0, "description": "Specify a rt mapping tolerance for mapping features between runs" }, "quantification_peak_width": { "type": "number", + "hidden": true, + "default": 60, "description": "Specify a peak width for feature extraction" }, "quantification_min_peak_width": { "type": "number", + "hidden": true, + "default": 0.2, "description": "Specify a minimum peak width for quantification" } } }, - "mhc_affinity_prediction": { - "title": "MHC affinity prediction", - "type": "object", - "fa_icon": "fas fa-magnifying-glass", - "description": "", - "default": "", - "properties": { - "allele_sheet": { - "type": "string", - "fa_icon": "fas fa-file-code", - "pattern": "^\\S+\\.tsv$", - "description": "Specify a .tsv file containing the MHC alleles of your probes as well as their metadata such as SampleID.", - "help_text": "| Sample | HLA_Alleles_Class_1 | HLA_Alleles_Class_2 |\n| -------------| :----------------------------------------------:| ------------------------------------------:|\n| MM15_Melanom | `A*03:01;A*68:01;B*27:05;B*35:03;C*02:02;C*04:01` | `HLA-DRB1*01:01;HLA-DQB1*03:19;HLA-DQA1*05:01` |\n| MM17_Melanom | `A*02:01;B*07:01;B*26:01;C*11:01;C*01:01` | `HLA-DRB1*01:02;HLA-DRB3*02:02;HLA-DRB4*01:03` |\n" - }, - "predict_class_1": { - "type": "boolean", - "fa_icon": "fas fa-circle-1", - "description": "Set flag depending on whether MHC class 1 binding predictions using the tool mhcflurry should be run." - }, - "predict_class_2": { - "type": "boolean", - "fa_icon": "fas fa-circle-2", - "description": "Set flag depending on whether MHC class 2 binding predictions using the tool mhcnuggets should be run." 
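The removed options above tie binding prediction into FDR refinement: PSMs are retained for a second Percolator pass only if MHCFlurry predicts binding to one of the sample's alleles below `subset_affinity_threshold` (500 nM by default). A minimal sketch of that selection step (peptides and allele are placeholders; the real scripts read them from mzTab and the allele sheet):

```python
from mhcflurry import Class1AffinityPredictor

# Placeholder peptides/allele; the pipeline read these from mzTab and the
# allele sheet. Requires `mhcflurry-downloads fetch models_class1` once.
peptides = ["SIINFEKLM", "LLDVTAAVK", "KVAELVHFL"]
allele = "A*02:01"

predictor = Class1AffinityPredictor.load()
affinities = predictor.predict(peptides=peptides, allele=allele)  # nM

subset_affinity_threshold = 500  # nM, the removed pipeline default
binders = [p for p, ic50 in zip(peptides, affinities) if ic50 < subset_affinity_threshold]
print(binders)  # only these PSMs would re-enter Percolator rescoring
```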
- } - } - }, - "variant_options": { - "title": "Variant Options", + "post_processing": { + "title": "Post Processing", "type": "object", - "fa_icon": "fas fa-dna", + "fa_icon": "fas fa-waveform-lines", "description": "", "default": "", "properties": { - "variant_reference": { - "type": "string", - "fa_icon": "fas fa-file", - "description": "Specify genomic reference used for variant annotation", - "enum": ["GRCH37", "GRCH38"], - "default": "GRCH38" - }, - "variant_annotation_style": { - "type": "string", - "fa_icon": "fas fa-file", - "description": "Specify style of tool used for variant annotation - currently supported", - "enum": ["SNPEFF", "VEP", "ANNOVAR"], - "default": "SNPEFF" - }, - "variant_indel_filter": { - "type": "boolean", + "peptide_min_length": { + "type": "integer", "fa_icon": "fas fa-filter", - "description": "Set this option to not consider insertions and deletions for variant translation", - "default": false + "default": 8, + "description": "Specify the minimum length of peptides to be considered after processing" }, - "variant_frameshift_filter": { - "type": "boolean", + "peptide_max_length": { + "type": "integer", "fa_icon": "fas fa-filter", - "description": "Set this option to not consider frameshifts for variant translation", - "default": false + "default": 12, + "description": "Specify the maximum length of peptides to be considered after processing" }, - "variant_snp_filter": { + "annotate_ions": { "type": "boolean", - "fa_icon": "fas fa-filter", - "description": "Set this option to not consider snps for variant translation" + "default": false, + "fa_icon": "fas fa-tags", + "description": "Create tsv files containing information about the MS2 ion annotations after processing.", + "help_text": "The resulting tsv files should aid in spectrum validation downstream analyses" } } }, @@ -655,19 +621,16 @@ "$ref": "#/definitions/preprocessing" }, { - "$ref": "#/definitions/mass_spectrometry_data_processing" + "$ref": "#/definitions/search_settings" }, { - "$ref": "#/definitions/rescoring" + "$ref": "#/definitions/rescoring_settings" }, { "$ref": "#/definitions/quantification_options" }, { - "$ref": "#/definitions/mhc_affinity_prediction" - }, - { - "$ref": "#/definitions/variant_options" + "$ref": "#/definitions/post_processing" }, { "$ref": "#/definitions/institutional_config_options" diff --git a/subworkflows/local/include_proteins.nf b/subworkflows/local/include_proteins.nf deleted file mode 100644 index b1c54396..00000000 --- a/subworkflows/local/include_proteins.nf +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Perform the necessary steps to make the data uniform for further processing - */ - -include { GENERATE_PROTEINS_FROM_VCF } from '../../modules/local/generate_proteins_from_vcf' - -workflow INCLUDE_PROTEINS { - take: - input_fasta - - main: - ch_versions = Channel.empty() - - // Variant - vcf_sheet = file(params.vcf_sheet, checkIfExists: true) - - Channel.from( vcf_sheet ) - .splitCsv(header: ['Sample', 'VCF_FileName'], sep:'\t', skip: 1) - .map { col -> tuple("${col.Sample}", file("${col.VCF_FileName}"),) } - .set { ch_vcf_from_sheet } - - // Combine the vcf information with the meta information - ch_vcf = input_fasta - .map{ it -> [it[0].sample, it[0], it[1]] } - .combine( ch_vcf_from_sheet, by: 0 ) - .map(it -> [it[1], it[2], it[3]]) - // If specified translate variants to proteins and include in reference fasta - GENERATE_PROTEINS_FROM_VCF( ch_vcf ) - ch_versions = ch_versions.mix(GENERATE_PROTEINS_FROM_VCF.out.versions) - - emit: - // Define the information that 
diff --git a/subworkflows/local/predict_class1.nf b/subworkflows/local/predict_class1.nf
deleted file mode 100644
index 16247fef..00000000
--- a/subworkflows/local/predict_class1.nf
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * Perform the class 1 prediction when the parameter --predict_class_1 is provided and --skip_quantification is not
- */
-
-include { MHCFLURRY_PREDICTPEPTIDESCLASS1 } from '../../modules/local/mhcflurry_predictpeptidesclass1'
-include { PREDICT_POSSIBLE_NEOEPITOPES as PREDICT_POSSIBLE_CLASS1_NEOEPITOPES } from '../../modules/local/predict_possible_neoepitopes'
-include { RESOLVE_FOUND_NEOEPITOPES as RESOLVE_FOUND_CLASS1_NEOEPITOPES } from '../../modules/local/resolve_found_neoepitopes'
-include { MHCFLURRY_PREDICTNEOEPITOPESCLASS1 } from '../../modules/local/mhcflurry_predictneoepitopesclass1'
-
-workflow PREDICT_CLASS1 {
-    take:
-        mztab
-        peptides_class_1_alleles
-        ch_vcf_from_sheet
-
-    main:
-        ch_versions = Channel.empty()
-        ch_predicted_possible_neoepitopes = Channel.empty()
-        alleles = peptides_class_1_alleles.map{ meta, alleles -> [[id:meta], alleles] }
-
-        // If specified predict peptides using MHCFlurry
-        MHCFLURRY_PREDICTPEPTIDESCLASS1(mztab.join(alleles))
-        ch_versions = ch_versions.mix(MHCFLURRY_PREDICTPEPTIDESCLASS1.out.versions)
-
-        if ( params.include_proteins_from_vcf ) {
-            // Predict all possible neoepitopes from vcf
-            PREDICT_POSSIBLE_CLASS1_NEOEPITOPES(alleles.combine(ch_vcf_from_sheet, by:0))
-            ch_versions = ch_versions.mix(PREDICT_POSSIBLE_CLASS1_NEOEPITOPES.out.versions)
-            ch_predicted_possible_neoepitopes = PREDICT_POSSIBLE_CLASS1_NEOEPITOPES.out.csv
-            // Resolve found neoepitopes
-            RESOLVE_FOUND_CLASS1_NEOEPITOPES(
-                mztab
-                    .map{ it -> [it[0].sample, it[0], it[1]] }
-                    .combine( ch_predicted_possible_neoepitopes, by:0)
-                    .map( it -> [it[1], it[2], it[3]])
-            )
-            ch_versions = ch_versions.mix(RESOLVE_FOUND_CLASS1_NEOEPITOPES.out.versions)
-            // Predict class 1 neoepitopes MHCFlurry
-            MHCFLURRY_PREDICTNEOEPITOPESCLASS1(alleles.join(RESOLVE_FOUND_CLASS1_NEOEPITOPES.out.csv, by:0))
-            ch_versions = ch_versions.mix(MHCFLURRY_PREDICTNEOEPITOPESCLASS1.out.versions)
-        }
-
-    emit:
-        // Define the information that is returned by this workflow
-        versions = ch_versions
-        ch_predicted_possible_neoepitopes = ch_predicted_possible_neoepitopes
-}
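
A recurring idiom in this removed code is re-keying a [meta, file] channel by
meta.sample, combining it with a per-sample channel, and then dropping the key
again. A self-contained sketch of that pattern, with all values hypothetical:

    ch_mztab   = Channel.of([[sample: 'S1', condition: 'tumor'], 'S1.mzTab'])
    ch_alleles = Channel.of(['S1', 'A*02:01;B*07:02'])

    ch_mztab
        .map { meta, f -> [meta.sample, meta, f] }           // expose the join key
        .combine(ch_alleles, by: 0)                          // pair tuples by sample id
        .map { key, meta, f, alleles -> [meta, f, alleles] } // drop the key again
        .view()                                              // [[sample:S1, condition:tumor], S1.mzTab, A*02:01;B*07:02]
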
diff --git a/subworkflows/local/predict_class2.nf b/subworkflows/local/predict_class2.nf
deleted file mode 100644
index e4af7c2c..00000000
--- a/subworkflows/local/predict_class2.nf
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Perform the class 2 prediction when the parameter --predict_class_2 is provided and --skip_quantification is not
- */
-
-include { MHCNUGGETS_PEPTIDESCLASS2PRE } from '../../modules/local/mhcnuggets_peptidesclass2pre'
-include { MHCNUGGETS_PREDICTPEPTIDESCLASS2 } from '../../modules/local/mhcnuggets_predictpeptidesclass2'
-include { MHCNUGGETS_PEPTIDESCLASS2POST } from '../../modules/local/mhcnuggets_peptidesclass2post'
-include { PREDICT_POSSIBLE_NEOEPITOPES as PREDICT_POSSIBLE_CLASS2_NEOEPITOPES } from '../../modules/local/predict_possible_neoepitopes'
-include { RESOLVE_FOUND_NEOEPITOPES as RESOLVE_FOUND_CLASS2_NEOEPITOPES } from '../../modules/local/resolve_found_neoepitopes'
-include { MHCNUGGETS_NEOEPITOPESCLASS2PRE } from '../../modules/local/mhcnuggets_neoepitopesclass2pre'
-include { MHCNUGGETS_PREDICTNEOEPITOPESCLASS2 } from '../../modules/local/mhcnuggets_predictneoepitopesclass2'
-include { MHCNUGGETS_NEOEPITOPESCLASS2POST } from '../../modules/local/mhcnuggets_neoepitopesclass2post'
-
-workflow PREDICT_CLASS2 {
-    take:
-        mztab
-        peptides_class_2_alleles
-        ch_vcf_from_sheet
-
-    main:
-        ch_versions = Channel.empty()
-        ch_predicted_possible_neoepitopes = Channel.empty()
-        alleles = peptides_class_2_alleles.map{meta, alleles -> [[id:meta], alleles]}
-
-        // Preprocess found peptides for MHCNuggets prediction class 2
-        MHCNUGGETS_PEPTIDESCLASS2PRE(mztab)
-        ch_versions = ch_versions.mix(MHCNUGGETS_PEPTIDESCLASS2PRE.out.versions)
-
-        // Predict found peptides using MHCNuggets class 2
-        MHCNUGGETS_PREDICTPEPTIDESCLASS2(
-            MHCNUGGETS_PEPTIDESCLASS2PRE.out.preprocessed
-                .join(alleles)
-        )
-        ch_versions = ch_versions.mix(MHCNUGGETS_PREDICTPEPTIDESCLASS2.out.versions)
-        // Postprocess predicted MHCNuggets peptides class 2
-        MHCNUGGETS_PEPTIDESCLASS2POST( MHCNUGGETS_PREDICTPEPTIDESCLASS2.out.csv.join(MHCNUGGETS_PEPTIDESCLASS2PRE.out.geneID, by:0) )
-        ch_versions = ch_versions.mix(MHCNUGGETS_PEPTIDESCLASS2POST.out.versions)
-        if ( params.include_proteins_from_vcf ) {
-            // Predict all possible class 2 neoepitopes from vcf
-            PREDICT_POSSIBLE_CLASS2_NEOEPITOPES(alleles.combine(ch_vcf_from_sheet, by:0))
-            ch_versions = ch_versions.mix(PREDICT_POSSIBLE_CLASS2_NEOEPITOPES.out.versions)
-            ch_predicted_possible_neoepitopes = PREDICT_POSSIBLE_CLASS2_NEOEPITOPES.out.csv
-            // Resolve found class 2 neoepitopes
-            RESOLVE_FOUND_CLASS2_NEOEPITOPES(
-                mztab
-                    .map{ it -> [it[0].sample, it[1]] }
-                    .combine( ch_predicted_possible_neoepitopes, by:0)
-            )
-            ch_versions = ch_versions.mix(RESOLVE_FOUND_CLASS2_NEOEPITOPES.out.versions)
-            // Preprocess resolved neoepitopes in a format that MHCNuggets understands
-            MHCNUGGETS_NEOEPITOPESCLASS2PRE(RESOLVE_FOUND_CLASS2_NEOEPITOPES.out.csv)
-            ch_versions = ch_versions.mix(MHCNUGGETS_NEOEPITOPESCLASS2PRE.out.versions)
-            // Predict class 2 MHCNuggets
-            MHCNUGGETS_PREDICTNEOEPITOPESCLASS2(MHCNUGGETS_NEOEPITOPESCLASS2PRE.out.preprocessed.join(alleles, by:0))
-            ch_versions = ch_versions.mix(MHCNUGGETS_PREDICTNEOEPITOPESCLASS2.out.versions)
-            // Class 2 MHCNuggets Postprocessing
-            MHCNUGGETS_NEOEPITOPESCLASS2POST(RESOLVE_FOUND_CLASS2_NEOEPITOPES.out.csv.join(MHCNUGGETS_PREDICTNEOEPITOPESCLASS2.out.csv, by:0))
-            ch_versions = ch_versions.mix(MHCNUGGETS_NEOEPITOPESCLASS2POST.out.versions)
-        }
-
-    emit:
-        // Define the information that is returned by this workflow
-        versions = ch_versions
-        ch_predicted_possible_neoepitopes = ch_predicted_possible_neoepitopes
-}
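
The class 2 chain above threads each step's output into the next by joining
channels on a shared meta key; a minimal sketch of that join, with hypothetical
names and a single sample:

    ch_pre    = Channel.of([[id: 'S1'], 'preprocessed.csv'])
    ch_geneID = Channel.of([[id: 'S1'], 'geneID.txt'])

    ch_pre
        .join(ch_geneID, by: 0) // match tuples whose first element (the meta map) is equal
        .view()                 // [[id:S1], preprocessed.csv, geneID.txt]
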
diff --git a/subworkflows/local/refine_fdr.nf b/subworkflows/local/refine_fdr.nf
deleted file mode 100644
index fdb1e9aa..00000000
--- a/subworkflows/local/refine_fdr.nf
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Perform an additional step where the process are collected
- * that are called when the parameter "refine_fdr" is provided
- */
-
-include { OPENMS_MZTABEXPORTER as OPENMS_MZTABEXPORTERPERC } from '../../modules/local/openms_mztabexporter'
-include { OPENMS_MZTABEXPORTER as OPENMS_MZTABEXPORTERPSM } from '../../modules/local/openms_mztabexporter'
-include { MHCFLURRY_PREDICTPSMS } from '../../modules/local/mhcflurry_predictpsms'
-include { OPENMS_PERCOLATORADAPTER } from '../../modules/local/openms_percolatoradapter'
-include { OPENMS_IDFILTER as OPENMS_IDFILTER_PSMS } from '../../modules/nf-core/openms/idfilter/main'
-include { OPENMS_IDFILTER as OPENMS_IDFILTER_REFINED } from '../../modules/nf-core/openms/idfilter/main'
-
-workflow REFINE_FDR {
-    // Define the input parameters
-    take:
-        filtered_perc_output
-        psm_features
-        classI_alleles
-
-    main:
-        ch_versions = Channel.empty()
-        // Export filtered percolator results as mztab
-        OPENMS_MZTABEXPORTERPERC( filtered_perc_output )
-        ch_versions = ch_versions.mix(OPENMS_MZTABEXPORTERPERC.out.versions)
-        // Export psm results as mztab
-        OPENMS_MZTABEXPORTERPSM( psm_features )
-        ch_versions = ch_versions.mix(OPENMS_MZTABEXPORTERPSM.out.versions)
-        // Predict psm results using mhcflurry to shrink search space
-        MHCFLURRY_PREDICTPSMS(
-            OPENMS_MZTABEXPORTERPERC.out.mztab
-                .join( OPENMS_MZTABEXPORTERPSM.out.mztab, by:[0] )
-                .map{ it -> [it[0].sample, it[0], it[1], it[2]] }
-                .combine( classI_alleles, by:0)
-                .map(it -> [it[1], it[2], it[3], it[4]])
-        )
-        ch_versions = ch_versions.mix(MHCFLURRY_PREDICTPSMS.out.versions)
-
-        // Filter psm results by shrinked search space
-        // TODO: Check if filtering works properly when reevaluating this subworkflow
-        OPENMS_IDFILTER_PSMS(psm_features.combine( MHCFLURRY_PREDICTPSMS.out.idxml, by: [0] ))
-        ch_versions = ch_versions.mix(OPENMS_IDFILTER_PSMS.out.versions)
-        // Recompute percolator fdr on shrinked search space
-        OPENMS_PERCOLATORADAPTER( OPENMS_IDFILTER_PSMS.out.filtered )
-        ch_versions = ch_versions.mix(OPENMS_PERCOLATORADAPTER.out.versions)
-        // Filter results by refined fdr
-        OPENMS_IDFILTER_REFINED(OPENMS_PERCOLATORADAPTER.out.idxml.flatMap { it -> [tuple(it[0], it[1], [])]})
-        ch_versions = ch_versions.mix(OPENMS_IDFILTER_REFINED.out.versions)
-    emit:
-        // Define the information that is returned by this workflow
-        filter_refined_q_value = OPENMS_IDFILTER_REFINED.out.filtered
-        versions = ch_versions
-}
diff --git a/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf b/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf
index 7bfd2d71..ba086c31 100644
--- a/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf
+++ b/subworkflows/local/utils_nfcore_mhcquant_pipeline/main.nf
@@ -161,32 +161,6 @@ def validateInputSamplesheet(input) {
     return [ metas[0], fastqs ]
 }
 
-// MHC affinity prediction
-if (params.predict_class_1 || params.predict_class_2) {
-    Channel.from(file(params.allele_sheet, checkIfExists: true))
-        .splitCsv(header: true, sep:'\t')
-        .multiMap { col ->
-            classI: ["${col.Sample}", "${col.HLA_Alleles_Class_1}"]
-            classII: ["${col.Sample}", "${col.HLA_Alleles_Class_2}"] }
-        .set { ch_alleles_from_sheet }
-
-    // Allele class 1
-    if (params.predict_class_1) {
-        ch_alleles_from_sheet.classI
-            .ifEmpty { exit 1, "params.allele_sheet was empty - no allele input file supplied" }
-            .flatMap { it -> [tuple(it[0].toString(), it[1])] }
-            .set { peptides_class_1_alleles }
-    }
-
-    // Allele class 2
-    if (params.predict_class_2) {
-        ch_alleles_from_sheet.classII
-            .ifEmpty { exit 1, "params.allele_sheet was empty - no allele input file supplied" }
-            .flatMap { it -> [tuple(it[0].toString(), it[1])] }
-            .set { peptides_class_2_alleles }
-    }
-}
-
 //
 // Generate methods description for MultiQC
 //
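
The allele-sheet parsing removed from the pipeline utils relied on multiMap to
fan a single TSV out into separate class I and class II channels. A standalone
sketch of that operator, with a hypothetical sheet path:

    Channel.fromPath('allele_sheet.tsv')       // hypothetical sheet with a header row
        .splitCsv(header: true, sep: '\t')
        .multiMap { col ->
            classI:  [col.Sample, col.HLA_Alleles_Class_1]  // one emission per named output
            classII: [col.Sample, col.HLA_Alleles_Class_2]
        }
        .set { ch_alleles }

    // ch_alleles.classI and ch_alleles.classII can now be consumed independently.
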
diff --git a/workflows/mhcquant.nf b/workflows/mhcquant.nf
index c14e7b69..fe16d1f4 100644
--- a/workflows/mhcquant.nf
+++ b/workflows/mhcquant.nf
@@ -21,11 +21,7 @@ include { OPENMS_MZTABEXPORTER } from '../modules/local/openms_mztabexport
 //
 // SUBWORKFLOW: Loaded from subworkflows/local/
 //
-include { INCLUDE_PROTEINS } from '../subworkflows/local/include_proteins'
-include { REFINE_FDR } from '../subworkflows/local/refine_fdr'
 include { QUANT } from '../subworkflows/local/quant'
-include { PREDICT_CLASS1 } from '../subworkflows/local/predict_class1'
-include { PREDICT_CLASS2 } from '../subworkflows/local/predict_class2'
 
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -76,20 +72,7 @@ workflow MHCQUANT {
             other : true
         }
         .set { branched_ms_files }
-    //
-    // SUBWORKFLOW: Include protein information
-    //
-    // TODO: Temporary disabled because of outdated vcf parsing
-    //if (params.include_proteins_from_vcf) {
-    //    // Include the proteins from the vcf file to the fasta file
-    //    INCLUDE_PROTEINS(ch_fasta)
-    //    ch_versions = ch_versions.mix(INCLUDE_PROTEINS.out.versions)
-    //    ch_fasta_file = INCLUDE_PROTEINS.out.ch_fasta_file
-    //    ch_vcf_from_sheet = INCLUDE_PROTEINS.out.ch_vcf_from_sheet
-    //} else {
-    //    ch_fasta_file = ch_fasta
-    //    ch_vcf_from_sheet = Channel.empty()
-    //}
+    // Decoy Database creation
     if (!params.skip_decoy_generation) {
         // Generate reversed decoy database
         OPENMS_DECOYDATABASE(ch_fasta)
@@ -132,12 +115,6 @@ workflow MHCQUANT {
     }
 
     // Run comet database search
-    // TODO: Fix accordingly with vcf parsing
-    //if (params.include_proteins_from_vcf) {
-    //    OPENMS_COMETADAPTER(ch_clean_mzml_file.join(ch_decoy_db, remainder:true))
-    //} else {
-    //    OPENMS_COMETADAPTER(ch_clean_mzml_file.combine(ch_fasta.map{ meta, fasta -> [fasta] }))
-    //}
     OPENMS_COMETADAPTER(ch_clean_mzml_file.combine(ch_decoy_db))
     ch_versions = ch_versions.mix(OPENMS_COMETADAPTER.out.versions)
 
@@ -191,24 +168,7 @@ workflow MHCQUANT {
     // Filter by percolator q-value
     OPENMS_IDFILTER_Q_VALUE(ch_rescored_runs.map {group_meta, idxml -> [group_meta, idxml, []]})
     ch_versions = ch_versions.mix(OPENMS_IDFILTER_Q_VALUE.out.versions)
-
-    //
-    // SUBWORKFLOW: Refine the FDR values on the predicted subset
-    //
-    if (params.refine_fdr_on_predicted_subset && params.predict_class_1) {
-        // Run the following subworkflow
-        REFINE_FDR (
-            OPENMS_IDFILTER_Q_VALUE.out.filtered,
-            OPENMS_PSMFEATUREEXTRACTOR.out.idxml,
-            peptides_class_1_alleles
-        )
-        ch_versions = ch_versions.mix(REFINE_FDR.out.versions)
-        // Define the outcome of the paramer to a fixed variable
-        ch_filter_q_value = REFINE_FDR.out.filter_refined_q_value
-    } else {
-        // Make sure that the columns that consists of the ID's, sample names and the idXML file names are returned
-        ch_filter_q_value = OPENMS_IDFILTER_Q_VALUE.out.filtered
-    }
+    ch_filter_q_value = OPENMS_IDFILTER_Q_VALUE.out.filtered
 
     //
     // SUBWORKFLOW: QUANT
@@ -221,6 +181,7 @@ workflow MHCQUANT {
         ch_output = ch_filter_q_value
     }
 
+    // Annotate Ions for follow-up spectrum validation
     if (params.annotate_ions) {
         // Join the ch_filtered_idxml and the ch_mzml_file
         ch_clean_mzml_file.map { meta, mzml -> [ groupKey([id: "${meta.sample}_${meta.condition}"], meta.group_count), mzml] }
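
The ion-annotation join above wraps its grouping key in groupKey so that the
downstream groupTuple can emit a group as soon as the expected number of files
has arrived, rather than waiting for all inputs to finish. A minimal sketch,
with hypothetical metadata and counts:

    Channel.of(
            [[sample: 'S1', condition: 'c1', group_count: 2], 'run1.mzML'],
            [[sample: 'S1', condition: 'c1', group_count: 2], 'run2.mzML'])
        .map { meta, f -> [groupKey([id: "${meta.sample}_${meta.condition}"], meta.group_count), f] }
        .groupTuple() // closes each group once group_count items have been seen
        .view()       // [[id:S1_c1], [run1.mzML, run2.mzML]]
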
@@ -246,37 +207,6 @@ workflow MHCQUANT {
     OPENMS_MZTABEXPORTER(ch_output)
     ch_versions = ch_versions.mix(OPENMS_MZTABEXPORTER.out.versions)
 
-    //
-    // SUBWORKFLOW: Predict class I (neoepitopes)
-    //
-    // TODO: Temporary disabled because of outdated vcf parsing
-    //if (params.predict_class_1 & !params.skip_quantification) {
-    //    PREDICT_CLASS1 (
-    //        OPENMS_MZTABEXPORTER.out.mztab,
-    //        peptides_class_1_alleles,
-    //        ch_vcf_from_sheet
-    //    )
-    //    ch_versions = ch_versions.mix(PREDICT_CLASS1.out.versions)
-    //    ch_predicted_possible_neoepitopes = PREDICT_CLASS1.out.ch_predicted_possible_neoepitopes
-    //} else {
-    //    ch_predicted_possible_neoepitopes = Channel.empty()
-    //}
-    //
-    ////
-    //// SUBWORKFLOW: Predict class II (neoepitopes)
-    ////
-    //if (params.predict_class_2 & !params.skip_quantification) {
-    //    PREDICT_CLASS2 (
-    //        OPENMS_MZTABEXPORTER.out.mztab,
-    //        peptides_class_2_alleles,
-    //        ch_vcf_from_sheet
-    //    )
-    //    ch_versions = ch_versions.mix(PREDICT_CLASS2.out.versions)
-    //    ch_predicted_possible_neoepitopes_II = PREDICT_CLASS2.out.ch_predicted_possible_neoepitopes
-    //} else {
-    //    ch_predicted_possible_neoepitopes_II = Channel.empty()
-    //}
-
     //
     // Collate and save software versions
     //