Add initial draft of netmhciipan module #252

Merged
1 change: 1 addition & 0 deletions conf/modules.config
@@ -83,6 +83,7 @@ process {
}

withName: NETMHCIIPAN {
ext.args = "-BA"
publishDir = [
path: { "${params.outdir}/netmhciipan" },
mode: params.publish_dir_mode
5 changes: 0 additions & 5 deletions modules/local/external_tools_import.nf
@@ -56,11 +56,6 @@ process EXTERNAL_TOOLS_IMPORT {
-e "s_/scratch_/tmp_" \
-e "s_setenv[[:space:]]NMHOME.*_setenv NMHOME \\`realpath -s \\\$0 | sed -r 's/[^/]+\$//'\\`_ " "${toolname}/${toolbinaryname}"

# MODIFY perl location in perl script for netmhcIIpan
if [ "$toolname" == "netmhciipan" ]; then
sed -i.bak \
-e 's_bin/perl.*\$_local/bin/perl_' "${toolname}/NetMHCIIpan-${toolversion}.pl"
fi
#
# VALIDATE THE CHECKSUM OF THE DOWNLOADED MODEL DATA
#
44 changes: 37 additions & 7 deletions modules/local/merge_predictions/templates/merge_predictions.py
@@ -9,7 +9,6 @@
import mhcgnomes
import pandas as pd

# Create logger object with date and time
import logging

logging.basicConfig(
@@ -18,6 +17,10 @@
datefmt="%Y-%m-%d %H:%M:%S",
)

class PredictorBindingThreshold(Enum):
NETMHCPAN = 2
NETMHCIIPAN = 5

class Arguments:
"""
Parses the arguments, including the ones coming from $task.ext.args.
@@ -83,10 +86,9 @@ def format_yaml_like(data: dict, indent: int = 0) -> str:
return yaml_str

class PredictionResult:
def __init__(self, file_path, alleles, threshold=2):
def __init__(self, file_path, alleles):
self.file_path = file_path
self.alleles = alleles
self.threshold = threshold
self.predictor = None
self.prediction_df = self._format_prediction_result()

@@ -120,7 +122,7 @@ def _format_mhcnuggets_prediction(self):
pass

def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Map with allele index to allele name
# Map with allele index to allele name. NetMHCpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
@@ -142,20 +144,48 @@ def _format_netmhcpan_prediction(self) -> pd.DataFrame:
# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['EL_Rank'] <= self.threshold
df_pivot['binder'] = df_pivot['EL_Rank'] <= PredictorBindingThreshold.NETMHCPAN.value
df_pivot['predictor'] = 'netmhcpan'
df_pivot.index.name = ''

return df_pivot

def _format_netmhciipan_prediction(self, threshold=None):
pass
def _format_netmhciipan_prediction(self) -> pd.DataFrame:
# Map with allele index to allele name. NetMHCIIpan sorts alleles alphabetically
alleles_dict = {i: allele for i, allele in enumerate(self.alleles)}
# Read the file into a DataFrame with no headers initially
df = pd.read_csv(self.file_path, sep='\t', skiprows=1)
df = df[df.columns[df.columns.str.contains('Peptide|Rank|Rank_BA')]]
# TODO: Naming needs to be harmonized down the line once all predictors are implemented
df = df.rename(columns={'Peptide':'sequence','Rank':'Rank.0','Rank_BA':'Rank_BA.0'})
# Convert to long format based on the .0/.1/.2 allele-index suffixes
df_long = pd.melt(
df,
id_vars=["sequence"],
value_vars=[col for col in df.columns if col != "sequence"],
var_name="metric",
value_name="value",
)
# Extract the allele information (e.g., .0, .1, etc.)
df_long["allele"] = df_long["metric"].str.split('.').str[1]
df_long["metric"] = df_long["metric"].str.split('.').str[0]

# Pivot table to organize columns properly
df_pivot = df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value").reset_index()
df_pivot['allele'] = [alleles_dict[int(index.strip("."))] for index in df_pivot['allele']]
df_pivot['binder'] = df_pivot['Rank'] <= PredictorBindingThreshold.NETMHCIIPAN.value
df_pivot['predictor'] = 'netmhciipan'
df_pivot.index.name = ''

return df_pivot

def main():
args = Arguments()

for file in args.input:
result = PredictionResult(file, args.alleles)

logging.info(f"Writing {len(result.prediction_df)} {result.predictor} predictions to file..")
result.prediction_df.to_csv(f"{args.prefix}_{result.predictor}.tsv", sep="\t", index=False)

# Parse versions
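For context, the wide-to-long reshaping that both `_format_netmhcpan_prediction` and the new `_format_netmhciipan_prediction` rely on can be sketched in isolation. The snippet below uses made-up peptides, ranks, and allele names; only the melt/pivot mechanics mirror the template code.

import pandas as pd

# Toy frame mimicking a NetMHCIIpan -xls table after selecting the
# Peptide/Rank/Rank_BA columns; peptides, ranks, and alleles are made up.
df = pd.DataFrame({
    "sequence":  ["PEPTIDEAAAAAAAA", "PEPTIDEBBBBBBBB"],
    "Rank.0":    [1.2, 7.8],
    "Rank_BA.0": [3.4, 9.1],
    "Rank.1":    [0.5, 12.0],
    "Rank_BA.1": [2.2, 15.3],
})
alleles_dict = dict(enumerate(["DRB1_0101", "DRB1_1501"]))  # alphabetical order assumed

# Wide -> long: one row per (sequence, metric.alleleIndex)
df_long = pd.melt(df, id_vars=["sequence"], var_name="metric", value_name="value")
df_long["allele"] = df_long["metric"].str.split(".").str[1]
df_long["metric"] = df_long["metric"].str.split(".").str[0]

# Long -> tidy: one row per (sequence, allele), one column per metric
df_pivot = (
    df_long.pivot_table(index=["sequence", "allele"], columns="metric", values="value")
    .reset_index()
)
df_pivot["allele"] = [alleles_dict[int(i)] for i in df_pivot["allele"]]
df_pivot["binder"] = df_pivot["Rank"] <= 5  # class II cutoff, as in PredictorBindingThreshold
print(df_pivot)

The key point is that the per-allele column suffix (.0, .1, ...) is recovered as its own allele column, which is then mapped back to the allele names supplied on the command line.
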
26 changes: 22 additions & 4 deletions modules/local/netmhciipan.nf
@@ -8,28 +8,46 @@ process NETMHCIIPAN {
tuple val(meta), path(peptide_file), path(software)

output:
tuple val(meta), path("*.tsv"), emit: predicted
tuple val(meta), path("*.xls"), emit: predicted
path "versions.yml", emit: versions

script:
if (meta.mhc_class != "II") {
error "NETMHCIIPAN only supports MHC class II. Use NETMHCPAN for MHC class I."
error "NETMHCIIPAN only supports MHC class II. Use NETMHCIIPAN for MHC class II."
}
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
// Adjust for netMHCIIpan allele format
def alleles = meta.alleles.tokenize(';').collect {
it.contains('DRB') ?
it.replace('*', '_').replace(':', '') :
('HLA-' + it.replace('*', '').replace(':', ''))
}.join(',')

"""
netmhciipan/netMHCIIpan \
-f $peptide_file \
-inptype 1 \
-a $alleles \
-xls \
-xlsfile ${prefix}_predicted_netmhciipan.xls \
$args

cat <<-END_VERSIONS > versions.yml
"${task.process}":
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""

stub:
def args = task.ext.args ?: ''
def prefix = task.ext.prefix ?: meta.sample
"""
touch ${prefix}_predicted_netmhciipan.tsv
touch ${prefix}_predicted_netmhciipan.xls

cat <<-END_VERSIONS > versions.yml
"${task.process}":
netmhciipan \$(cat data/version | sed -s 's/ version/:/g')
\$(cat netmhciipan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""
}
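For reference, the allele reformatting done by the Groovy closure above can be expressed as a small Python helper. The input string below is made up, and DP/DQ alpha/beta pairing is deliberately ignored; only the string substitutions mirror the module code.

def to_netmhciipan_alleles(allele_string: str) -> str:
    """Mirror of the Groovy closure: DRB alleles keep their name with
    '*' replaced by '_' and ':' dropped; other loci get an 'HLA-' prefix
    with '*' and ':' dropped."""
    converted = []
    for allele in allele_string.split(";"):
        if "DRB" in allele:
            converted.append(allele.replace("*", "_").replace(":", ""))
        else:
            converted.append("HLA-" + allele.replace("*", "").replace(":", ""))
    return ",".join(converted)

# "DRB1*01:01;DQA1*05:01" -> "DRB1_0101,HLA-DQA10501"
print(to_netmhciipan_alleles("DRB1*01:01;DQA1*05:01"))
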
2 changes: 1 addition & 1 deletion modules/local/netmhcpan.nf
@@ -42,7 +42,7 @@ process NETMHCPAN {

cat <<-END_VERSIONS > versions.yml
"${task.process}":
netmhcpan \$(cat data/version | sed -s 's/ version/:/g')
\$(cat netmhcpan/data/version | sed -s 's/ version/:/g')
END_VERSIONS
"""
}
@@ -3,19 +3,27 @@
import argparse
import shlex
from enum import Enum
import logging

import pandas as pd


logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
class MinLength(Enum):
NETMHCPAN = 8
NETMHCIIPAN = 8

class MaxLength(Enum):
NETMHCPAN = 14
NETMHCIIPAN = 25

# TODO: Implement
class MaxNumberOfAlleles(Enum):
NETMHCPAN = 50
NETMHCIIPAN = 50

class Arguments:
"""
@@ -90,16 +98,31 @@ def main():
args = Arguments()

df = pd.read_csv(args.input, sep="\t")
len_df = len(df)
logging.info(f"Reading in file with {len(df)} peptides..")

# Filter peptides based on user-defined length
if args.mhc_class == "I":
df = df[df["sequence"].str.len().between(args.min_peptide_length_classI, args.max_peptide_length_classI)]
else:
df = df[df["sequence"].str.len().between(args.min_peptide_length_classII, args.max_peptide_length_classII)]

# Filter peptides based on tool length boundaries and adjust input format
if "netmhcpan" in args.tools and args.mhc_class == "I":
logging.info("Input for NetMHCpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCPAN.value, MaxLength.NETMHCPAN.value)]
# We only need a file with the bare sequences, one per line
df[['sequence']].to_csv(f'{args.prefix}_netmhcpan_input.tsv', sep="\t", header=False, index=False)

if "netmhciipan" in args.tools and args.mhc_class == "II":
logging.info("Input for NetMHCIIpan detected. Parsing input..")
df = df[df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)]
df[['sequence']].to_csv(f'{args.prefix}_netmhciipan_input.tsv', sep="\t", header=False, index=False)

if len(df) == 0:
raise ValueError("No peptides left after applying length filters! Aborting..")
else:
logging.info(f"{len(df)} peptides post-filtering will be predicted..")

# Parse versions
versions_this_module = {}
versions_this_module["${task.process}"] = Version.get_versions([argparse, pd])
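The tool-specific length filtering added in this template follows a simple pattern: constrain peptide length to the predictor's supported range, then write a bare one-column sequence list. Below is a self-contained sketch with invented sequences and an example output file name.

from enum import Enum
import pandas as pd

class MinLength(Enum):
    NETMHCPAN = 8
    NETMHCIIPAN = 8

class MaxLength(Enum):
    NETMHCPAN = 14
    NETMHCIIPAN = 25

# Made-up peptide table: 5, 14, and 30 residues long
df = pd.DataFrame({"sequence": ["SHORT", "PEPTIDEPEPTIDE", "P" * 30]})

# Keep only peptides within NetMHCIIpan's supported length range (8-25)
mask = df["sequence"].str.len().between(MinLength.NETMHCIIPAN.value, MaxLength.NETMHCIIPAN.value)

# Write the bare sequence list, one peptide per line, as the predictor expects
df[mask][["sequence"]].to_csv("example_netmhciipan_input.tsv", sep="\t", header=False, index=False)
print(f"{mask.sum()} of {len(df)} peptides kept")
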
6 changes: 3 additions & 3 deletions nextflow.config
@@ -38,9 +38,9 @@ params {

// Options: Peptides
min_peptide_length_classI = 8
max_peptide_length_classI = 11
min_peptide_length_classII = 15
max_peptide_length_classII = 16
max_peptide_length_classI = 14
min_peptide_length_classII = 8
max_peptide_length_classII = 25

// Options: Annotation
proteome = null
4 changes: 2 additions & 2 deletions nextflow_schema.json
@@ -96,8 +96,8 @@
"tools": {
"type": "string",
"default": "syfpeithi",
"pattern": "^(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1)(,(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.1|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1))*$",
"help_text": "Specifies the tool(s) to use. Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.1`.",
"pattern": "^(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.3|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1)(,(syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.3|netmhc_darwin-4.0|netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1))*$",
"help_text": "Specifies the tool(s) to use. Multiple tools can be combined in a list separated by comma.\nAvailable are: `syfpeithi`, `mhcflurry`, `mhcnuggets-class-1`, `mhcnuggets-class-2`,`netmhcpan-4.0`,`netmhcpan-4.1`,`netmhc-4.0`,`netmhciipan-4.3`.",
"description": "Specifies the prediction tool(s) to use."
},
"tool_thresholds": {
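The updated tools pattern can be sanity-checked locally with Python's re module; the parameter strings below are arbitrary examples.

import re

# Alternation copied from the updated nextflow_schema.json pattern
TOOLS = (
    "syfpeithi|mhcnuggets-class-1|mhcnuggets-class-2|mhcflurry|netmhc-4.0|"
    "netmhcpan-4.0|netmhcpan-4.1|netmhciipan-4.3|netmhc_darwin-4.0|"
    "netmhcpan_darwin-4.0|netmhcpan_darwin-4.1|netmhciipan_darwin-4.1"
)
pattern = re.compile(f"^({TOOLS})(,({TOOLS}))*$")

print(bool(pattern.match("syfpeithi,netmhciipan-4.3")))  # True
print(bool(pattern.match("netmhciipan-4.1")))            # False: only 4.3 is accepted now
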
6 changes: 3 additions & 3 deletions subworkflows/local/mhc_binding_prediction.nf
@@ -42,7 +42,7 @@ workflow MHC_BINDING_PREDICTION {
return [meta, file]
}
.set{ ch_prediction_input }
ch_prediction_input.netmhcpan.view()
ch_prediction_input.netmhciipan.view()
SYFPEITHI ( ch_prediction_input.syfpeithi )
ch_versions = ch_versions.mix(SYFPEITHI.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(SYFPEITHI.out.predicted)
@@ -67,12 +67,12 @@
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCPAN.out.predicted)
}

if ( "netmhciipan" in params.tools.tokenize(",") )
if ( "netmhciipan-4.3" in params.tools.tokenize(",") )
{
// TODO: External tools import for netmhciipan
NETMHCIIPAN_IMPORT( parse_netmhc_params("netmhciipan", "4.3") )
//TODO: Update netmhc container
NETMHCIIPAN ( ch_prediction_input.netmhcpan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
NETMHCIIPAN ( ch_prediction_input.netmhciipan.combine(NETMHCIIPAN_IMPORT.out.nonfree_tools) )
ch_versions = ch_versions.mix(NETMHCIIPAN.out.versions)
ch_binding_predictors_out = ch_binding_predictors_out.mix(NETMHCIIPAN.out.predicted)
}