diff --git a/README.rst b/README.rst index 55077f6a..79b0c7ad 100644 --- a/README.rst +++ b/README.rst @@ -57,6 +57,7 @@ The MS³PIP Python application can perform the following tasks: - ``predict-batch``: Predict fragmentation spectra for a batch of peptides. - ``predict-library``: Predict a spectral library from protein FASTA file. - ``correlate``: Compare predicted and observed intensities and optionally compute correlations. +- ``correlate-single``: Compare predicted and observed intensities for a single peptide spectrum. - ``get-training-data``: Extract feature vectors and target intensities from observed spectra for training. - ``annotate-spectra``: Annotate peaks in observed spectra. diff --git a/conversion_tools/MSF_to_MS2PIP_SpecLib.py b/conversion_tools/MSF_to_MS2PIP_SpecLib.py deleted file mode 100644 index a075ca0e..00000000 --- a/conversion_tools/MSF_to_MS2PIP_SpecLib.py +++ /dev/null @@ -1,298 +0,0 @@ -""" -## MSF and MGF to MS2PIP SpecLib -Reads an MSF file (SQLite DB), combines it with the matched (multiple) MGF files and writes a spectral library as 1 MS2PIP PEPREC and MGF file. Filters by a given FDR threshold, using q-values calculated from decoy hits or from Percolator. - -*MSF_to_MS2PIP_SpecLib.py* -**Input:** MSF and MGF files -**Output:** Matched PEPREC and MGF file - -``` -usage: MSF_to_MS2PIP_SpecLib.py [-h] [-s MSF_FOLDER] [-g MGF_FOLDER] - [-o OUTNAME] [-f FDR_CUTOFF] [-p] [-c] - -Convert Sequest MSF and MGF to MS2PIP spectral library. - -optional arguments: - -h, --help show this help message and exit - -s MSF_FOLDER, --msf MSF_FOLDER - Folder with Sequest MSF files (default: "msf") - -g MGF_FOLDER, --mgf MGF_FOLDER - Folder with MGF spectrum files (default: "mgf") - -o OUTNAME, --out OUTNAME - Name for output files (default: "SpecLib") - -f FDR_CUTOFF, --fdr FDR_CUTOFF - FDR cut-off value to filter PSMs (default: 0.01) - -p Use Percolator q-values instead of calculating them - from TDS (default: False) - -c Combine multiple MSF files into one spectral library - (default: False) -``` -""" - -# -------------- -# Import modules -# -------------- -import argparse -import re -import sqlite3 -import pandas as pd -from glob import glob -from os import getcwd - - -# ---------------------------------------------------------------- -# Calculate q-Value from PeptideScores using Target-Decoy strategy -# ---------------------------------------------------------------- -def CalcQVal(Peptides, conn): - PeptideScores = pd.read_sql("select * from 'PeptideScores';", conn) - PeptideScores_Decoy = pd.read_sql("select * from 'PeptideScores_Decoy';", conn) - if len(PeptideScores_Decoy) == 0: - print("No decoy PSMs found: cannot calculate q-value.") - exit(1) - PeptideScores['Decoy'] = False - PeptideScores_Decoy['Decoy'] = True - PeptideScores = PeptideScores.append(PeptideScores_Decoy) - - PeptideScores.sort_values('ScoreValue', ascending=False, inplace=True) - PeptideScores['CumSum_Decoys'] = PeptideScores['Decoy'].cumsum() - PeptideScores['CumSum_Targets'] = (~PeptideScores['Decoy']).cumsum() - PeptideScores['q-Value'] = PeptideScores['CumSum_Decoys'] / PeptideScores['CumSum_Targets'] - PeptideScores = PeptideScores[~PeptideScores['Decoy']][['PeptideID', 'q-Value']] - - Peptides = Peptides.merge(PeptideScores, on='PeptideID') - - return(Peptides) - - -# ------------------------------------------------------ -# Get Percolator q-value and PEP out of CustomDataFields -# ------------------------------------------------------ -def GetPercolatorQVal(Peptides, conn): - CustomDataFields = pd.read_sql("select FieldID, DisplayName from 'CustomDataFields';", conn) - CustomDataFields.index = CustomDataFields['FieldID'] - CustomDataFields_dict = CustomDataFields['DisplayName'].to_dict() - - CustomDataPeptides = pd.read_sql("select * from CustomDataPeptides;", conn) - CustomDataPeptides['FieldName'] = [CustomDataFields_dict[ID] for ID in CustomDataPeptides['FieldID']] - CustomDataPeptides = CustomDataPeptides.pivot(values='FieldValue', index='PeptideID', columns='FieldName').reset_index() - - Peptides = Peptides.merge(CustomDataPeptides, on='PeptideID') - - return(Peptides) - - -# -------- -# Get PTMs -# -------- -def ConcatPTMs(series): - return(series.str.cat(sep='|')) - - -def GetPTMs(Peptides, conn): - AminoAcidModifications = pd.read_sql("select * from 'AminoAcidModifications'", conn) - PeptidesAminoAcidModifications = pd.read_sql("select * from 'PeptidesAminoAcidModifications'", conn) - PeptidesTerminalModifications = pd.read_sql("select * from 'PeptidesTerminalModifications'", conn) - - # Get amino acid IDs and names in dict for easy access - AminoAcidModifications.index = AminoAcidModifications.AminoAcidModificationID - AminoAcidModifications_dict = AminoAcidModifications['ModificationName'].to_dict() - - # Combine terminal and normal PTMs into one dataframe - PeptidesTerminalModifications.columns = ['ProcessingNodeNumber', 'PeptideID', 'AminoAcidModificationID'] - PeptidesTerminalModifications['PositionMS2PIP'] = 0 - PeptidesAminoAcidModifications['PositionMS2PIP'] = PeptidesAminoAcidModifications['Position'] + 1 - Modifications = PeptidesAminoAcidModifications.append(PeptidesTerminalModifications, ignore_index=True) - - # Translate PTM IDs to names - Modifications['ModificationName'] = [AminoAcidModifications_dict[ID] for ID in Modifications['AminoAcidModificationID']] - - # Concatenate PTMs to MS2PIP notation per peptide - Modifications.sort_values('PositionMS2PIP', inplace=True) - Modifications['NotationMS2PIP'] = Modifications['PositionMS2PIP'].map(str) + '|' + Modifications['ModificationName'] - MS2PIPmods = pd.DataFrame(Modifications.groupby('PeptideID')['NotationMS2PIP'].apply(ConcatPTMs)) - MS2PIPmods.rename(columns={'NotationMS2PIP': 'Modifications'}, inplace=True) - MS2PIPmods.reset_index(inplace=True) - Peptides = Peptides.merge(MS2PIPmods, on='PeptideID') - - return(Peptides) - - -# ------------------------------------------ -# Get spectrum properties and RAW file names -# ------------------------------------------ -def GetSpectrumProperties(Peptides, conn): - cols = 'SpectrumID, Charge, Mass, LastScan, FirstScan, ScanNumbers, MassPeakID' - SpectrumHeaders = pd.read_sql("select {} from SpectrumHeaders".format(cols), conn) - - FileInfos = pd.read_sql("select FileID, FileName from FileInfos", conn) - FileInfos['FileName'] = FileInfos['FileName'].str.split('\\') - FileInfos['FileName'] = [name[-1].split('.raw')[0].split('.mzML')[0] for name in FileInfos['FileName']] - - MassPeaks = pd.read_sql("select MassPeakID, FileID from MassPeaks;", conn) - MassPeaks = MassPeaks.merge(FileInfos, on='FileID') - - SpectrumHeaders = SpectrumHeaders.merge(MassPeaks, on='MassPeakID') - Peptides = Peptides.merge(SpectrumHeaders, on='SpectrumID') - - return(Peptides) - - -# ------------------------------------------------------- -# Filter Peptide list for non-redundancy and low q-values -# ------------------------------------------------------- -def FilterForSpecLib(Peptides, FDR_CutOff): - # Filter on q-value - len_before = len(Peptides) - Peptides = Peptides[Peptides['q-Value'] <= FDR_CutOff].copy() - len_after = len(Peptides) - print("q-value cut-off {}: Kept {} out of {} peptides.".format(FDR_CutOff, len_after, len_before)) - - # Remove duplicate peptides: keep the one with the highest q-value - len_before = len(Peptides) - Peptides.sort_values('q-Value', inplace=True) - Peptides = Peptides[~Peptides.duplicated(['Sequence', 'Modifications', 'Charge'], keep='first')].copy() - len_after = len(Peptides) - print("Removed duplicate peptides: Kept {} out of {} peptides.".format(len_after, len_before)) - - # Remove multiple peptides mapping to the same spectrum: keep the one with the highest q-value - len_before = len(Peptides) - Peptides = Peptides[~Peptides.duplicated(['SpectrumID', 'FileName'], keep='first')].copy() - len_after = len(Peptides) - print("Removed duplicate spectra: Kept {} out of {} peptides.".format(len_after, len_before)) - - # Annotate tryptic peptides - Peptides['Tryptic'] = ((Peptides['Sequence'].str.endswith('K')) | (Peptides['Sequence'].str.endswith('R'))) - - # Set PeptideID as index - Peptides.index = Peptides['PeptideID'] - - return(Peptides) - - -# ------------------------------------------------------------------ -# Scan multiple MGF files for spectra present in Peptides data frame -# ------------------------------------------------------------------ -def ScanMGF(df_in, mgf_folder, outname='SpecLib.mgf'): - with open(outname, 'w') as out: - count_runs = 0 - count = 0 - runs = df_in['FileName'].unique() - print("Scanning MGF files: {} runs to do. Now working on run: ".format(len(runs)), end='') - for run in runs: - count_runs += 1 - if count_runs % 10 == 0: - print(str(count_runs), end='') - else: - print('.', end='') - spec_dict = dict((v, k) for k, v in df_in[(df_in['FileName'] == run)]['FirstScan'].to_dict().items()) - - # Parse file - found = False - with open('{}/{}.mgf'.format(mgf_folder, str(run)), 'r') as f: - for i, line in enumerate(f): - if 'TITLE' in line: - ScanNum = int(re.split('scan=|"', line)[-2]) - if ScanNum in spec_dict: - found = True - out.write("BEGIN IONS\n") - line = "TITLE=" + str(spec_dict[ScanNum]) + '\n' - count += 1 - if 'END IONS' in line: - if found: - out.write(line + '\n') - found = False - if found and line[-4:] != '0.0\n': - out.write(line) - - print("\n{}/{} spectra found and written to new MGF file.".format(count, len(df_in))) - - -# ------------------------ -# Write MS2PIP PEPREC file -# ------------------------ -def WritePEPREC(Peptides, outname='SpecLib.peprec'): - peprec = Peptides[['PeptideID', 'Modifications', 'Sequence', 'Charge']] - peprec.columns = ['spec_id', 'modifications', 'peptide', 'charge'] - peprec.to_csv(outname, sep=' ', na_rep='-', index=None) - - -# --------------- -# Argument parser -# --------------- -def ArgParse(): - parser = argparse.ArgumentParser(description='Convert Sequest MSF and MGF to MS2PIP spectral library.') - parser.add_argument('-s', '--msf', dest='msf_folder', action='store', default='msf', - help='Folder with Sequest MSF files (default: "msf")') - parser.add_argument('-g', '--mgf', dest='mgf_folder', action='store', default='mgf', - help='Folder with MGF spectrum files (default: "mgf")') - parser.add_argument('-o', '--out', dest='outname', action='store', default='SpecLib', - help='Name for output files (default: "SpecLib")') - parser.add_argument('-f', '--fdr', dest='FDR_CutOff', action='store', default=0.01, type=float, - help='FDR cut-off value to filter PSMs (default: 0.01)') - parser.add_argument('-p', dest='percolator', action='store_true', default=False, - help='Use Percolator q-values instead of calculating them from TDS (default: False)') - parser.add_argument('-c', dest='combine_msf', action='store_true', default=False, - help='Combine multiple MSF files into one spectral library (default: False)') - args = parser.parse_args() - - return(args) - - -# ---- -# Run! -# ---- -def run(): - args = ArgParse() - msf_files = glob("{}/*.msf".format(args.msf_folder)) - msf_count = 0 - - if args.combine_msf: - Peptides = pd.DataFrame() - for msf_filename in msf_files: - msf_count += 1 - print("Adding MSF file {} of {}...".format(msf_count, len(msf_files))) - conn = sqlite3.connect(msf_filename) - Peptides_tmp = pd.read_sql("select * from Peptides;", conn) - if args.percolator: - Peptides_tmp = GetPercolatorQVal(Peptides_tmp, conn) - else: - Peptides_tmp = CalcQVal(Peptides_tmp, conn) - Peptides_tmp = GetPTMs(Peptides_tmp, conn) - Peptides_tmp = GetSpectrumProperties(Peptides_tmp, conn) - Peptides = Peptides.append(Peptides_tmp) - - Peptides = FilterForSpecLib(Peptides, args.FDR_CutOff) - - outname_appendix = msf_files[0] - outname_appendix = outname_appendix.replace('/', '') - outname_appendix = outname_appendix.replace('.msf', '') - - ScanMGF(Peptides, args.mgf_folder, outname='{}_{}_Combined.mgf'.format(args.outname, outname_appendix)) - WritePEPREC(Peptides, outname='{}_{}_Combined.peprec'.format(args.outname, outname_appendix)) - - else: - for msf_filename in msf_files: - msf_count += 1 - print("\nWorking on MSF file {} of {}...".format(msf_count, len(msf_files))) - conn = sqlite3.connect(msf_filename) - Peptides = pd.read_sql("select * from Peptides;", conn) - if args.percolator: - Peptides = GetPercolatorQVal(Peptides, conn) - else: - Peptides = CalcQVal(Peptides, conn) - Peptides = GetPTMs(Peptides, conn) - Peptides = GetSpectrumProperties(Peptides, conn) - Peptides = FilterForSpecLib(Peptides, args.FDR_CutOff) - - outname_appendix = msf_filename - outname_appendix = outname_appendix.replace('/', '') - outname_appendix = outname_appendix.replace('.msf', '') - - ScanMGF(Peptides, args.mgf_folder, outname='{}_{}.mgf'.format(args.outname, outname_appendix)) - WritePEPREC(Peptides, outname='{}_{}.peprec'.format(args.outname, outname_appendix)) - - print("Ready!") - - -if __name__ == "__main__": - run() diff --git a/conversion_tools/Split_MS2PIP_SpecLib.py b/conversion_tools/Split_MS2PIP_SpecLib.py deleted file mode 100644 index b459a3ee..00000000 --- a/conversion_tools/Split_MS2PIP_SpecLib.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -## Split MS2PIP Spectral Library -Split MS2PIP spectral library (PEPREC and MGF file) into a train and test set. - -*Split_MS2PIP_SpecLib.py* -**Input:** PEPREC and MGF file -**Output:** PEPREC and MGF files for both train and test data set. - -``` -usage: Split_MS2PIP_SpecLib.py [-h] [-o OUT_FILENAME] [-f TEST_FRACTION] - peprec_file mgf_file - -Split MS2PIP spectral library (PEPREC and MGF file) into a train and test set. - -positional arguments: - peprec_file PEPREC file input - mgf_file MGF file input - -optional arguments: - -h, --help show this help message and exit - -o OUT_FILENAME Name for output files (default: "SpecLib") - -f TEST_FRACTION Fraction of input to use for test data set (default: 0.1) -``` -""" - -# -------------- -# Import modules -# -------------- -import argparse -import numpy as np -import pandas as pd - - -# -------------- -# Split PEPREC -# -------------- -def SplitPeprec(peprec, test_fraction): - msk = np.random.rand(len(peprec)) > test_fraction - peprec_train = peprec[msk] - peprec_test = peprec[~msk] - return(peprec_train, peprec_test) - - -# -------------- -# Split MGF file -# -------------- -def SplitMGF(peprec_train, peprec_test, mgf_filename, out_filename='SpecLib'): - count_train = 0 - count_test = 0 - print("{} spectra to go...".format(len(peprec_train) + len(peprec_test)), end='') - with open('{}_Train.mgf'.format(out_filename), 'w') as out_train: - with open('{}_Test.mgf'.format(out_filename), 'w') as out_test: - spec_ids_train = list(peprec_train['spec_id']) - spec_ids_test = list(peprec_test['spec_id']) - found_train = False - found_test = False - with open(mgf_filename, 'r') as mgf: - for i, line in enumerate(mgf): - if 'TITLE' in line: - spec_id = int(line.split('TITLE=')[1]) - if spec_id in spec_ids_train: - found_train = True - count_train += 1 - out_train.write("BEGIN IONS\n") - if count_train % 100 == 0: - print('.', end='') - elif spec_id in spec_ids_test: - found_test = True - count_test += 1 - out_test.write("BEGIN IONS\n") - if count_test % 100 == 0: - print('.', end='') - if 'END IONS' in line: - if found_train: - out_train.write(line + '\n') - found_train = False - elif found_test: - out_test.write(line + '\n') - found_test = False - if found_train: - out_train.write(line) - elif found_test: - out_test.write(line) - - print("\nReady! Wrote {}/{} spectra to {}_Train.mgf and {}/{} spectra to {}_Test.mgf.".format(count_train, len(peprec_train), - out_filename, count_test, len(peprec_test), - out_filename)) - - -# --------------- -# Argument parser -# --------------- -def ArgParse(): - parser = argparse.ArgumentParser(description='Split MS2PIP spectral library (PEPREC and MGF file) into a train and test set.') - parser.add_argument('peprec_file', action='store', help='PEPREC file input') - parser.add_argument('mgf_file', action='store', help='MGF file input') - parser.add_argument('-o', dest='out_filename', action='store', default='SpecLib', - help='Name for output files (default: "SpecLib")') - parser.add_argument('-f', dest='test_fraction', action='store', default=0.1, type=float, - help='Fraction of input to use for test data set (default: 0.1)') - args = parser.parse_args() - return(args) - - -# ---- -# Run! -# ---- -def run(): - args = ArgParse() - peprec = pd.read_csv(args.peprec_file, sep=' ') - (peprec_train, peprec_test) = SplitPeprec(peprec, args.test_fraction) - SplitMGF(peprec_train, peprec_test, args.mgf_file, args.out_filename) - peprec_train.to_csv('{}_Train.peprec'.format(args.out_filename), sep=' ', na_rep='-', index=False) - peprec_test.to_csv('{}_Test.peprec'.format(args.out_filename), sep=' ', na_rep='-', index=False) - - -if __name__ == "__main__": - run() diff --git a/conversion_tools/blib_converter.py b/conversion_tools/blib_converter.py deleted file mode 100755 index b38429b4..00000000 --- a/conversion_tools/blib_converter.py +++ /dev/null @@ -1,151 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys -import zlib -import numpy -import sqlalchemy -import pyteomics.mgf -import ms2pip.config_parser -import ms2pip.peptides - -from collections import defaultdict, namedtuple -from sqlalchemy import (MetaData, Table, Column, Integer, String, Float, - ForeignKey, TypeDecorator) -from sqlalchemy.sql import select -from sqlalchemy.dialects.sqlite import BLOB - - -class CompressedArray(TypeDecorator): - """Compressed binary arrays stored as BLOB in SQLite.""" - - impl = BLOB - - def __init__(self, dtype, *args, **kwargs): - super().__init__(*args, **kwargs) - self.dtype = dtype - - def process_bind_param(self, value, dialect): - # TODO: allow configuration of compression level - return zlib.compress(value.tobytes(), 3) - - def process_result_value(self, value, dialect): - decompressed = zlib.decompress(value) - return numpy.frombuffer(decompressed, dtype=self.dtype) - - def copy(self): - return CompressedArray(self.dtype, self.impl.length) - - -metadata = MetaData() - -RefSpectra = Table('RefSpectra', metadata, - Column('id', Integer, primary_key=True), - Column('peptideSeq', String(200)), - Column('precursorMZ', Float), - Column('precursorCharge', Integer), - Column('retentionTime', Float), - ) - -Modifications = Table('Modifications', metadata, - Column('id', Integer, primary_key=True), - Column('RefSpectraID', Integer, ForeignKey('RefSpectra.id')), - Column('position', Integer), - Column('mass', Float), - ) - -RefSpectraPeaks = Table('RefSpectraPeaks', metadata, - Column('RefSpectraID', Integer, ForeignKey('RefSpectra.id')), - Column('peakMZ', CompressedArray(numpy.double)), - Column('peakIntensity', CompressedArray(numpy.single)), - ) - -Modification = namedtuple("Modification", "name, amino_acid") - - -def open_sqlite(filename): - engine = sqlalchemy.create_engine(f"sqlite:///{filename}") - metadata.bind = engine - return engine.connect() - - -def get_modification_config(config_file): - mods = defaultdict(list) - - config = ms2pip.config_parser.ConfigParser(filepath=config_file) - modifications = ms2pip.peptides.Modifications() - modifications.add_from_ms2pip_modstrings(config.config['ms2pip']['ptm']) - - for name, mod in modifications._all_modifications: - mods[mod['mass_shift']].append(Modification(name, mod['amino_acid'])) - return mods - - -def read_peptides(filename, modifications): - with open_sqlite(filename) as connection: - for spec in connection.execute(RefSpectra.select()): - spec_mods = [] - for modification in connection.execute(Modifications.select().where(Modifications.c.RefSpectraID == spec.id)): - for mod in modifications[modification.mass]: - if mod.amino_acid == 'N-term' and modification.position == 1: - spec_mods.append(f"0|{mod.name}") - elif mod.amino_acid == 'C-term' and modification.position == len(spec.peptideSeq): - spec_mods.append(f"-1|{mod.name}") - elif mod.amino_acid == spec.peptideSeq[modification.position-1]: - spec_mods.append(f"{modification.position}|{mod.name}") - spec_mods = '|'.join(spec_mods) if spec_mods else '-' - yield f"{spec.id} {spec_mods} {spec.peptideSeq} {spec.precursorCharge}" - - -def read_spectra(filename): - with open_sqlite(filename) as connection: - for spec in connection.execute(select([RefSpectra, RefSpectraPeaks]).select_from(RefSpectra.join(RefSpectraPeaks, RefSpectra.c.id == RefSpectraPeaks.c.RefSpectraID))): - yield { - 'm/z array': spec.peakMZ, - 'intensity array': spec.peakIntensity, - 'params': { - 'title': spec.id, - 'pepmass': spec.precursorMZ, - 'charge': spec.precursorCharge, - 'rtinseconds': spec.retentionTime - } - } - - -def blib_to_peprec(blib_filename, peprec_file, modifications): - for peptide in read_peptides(blib_filename, modifications): - peprec_file.write(f"{peptide}\n") - peprec_file.flush() - - -def blib_to_mgf(blib_filename, mgf_file): - pyteomics.mgf.write(read_spectra(blib_filename), output=mgf_file) - - -def main(): - parser = argparse.ArgumentParser(description='Convert BiblioSpec Spectral Library to peprec and/or MGF files.') - parser.add_argument('blib_filename', - help='input blib file') - parser.add_argument('--peprec', nargs='?', type=argparse.FileType('w'), - const=sys.stdout, - help='write peprec file') - parser.add_argument('--mgf', nargs='?', type=argparse.FileType('w'), - const=sys.stdout, - help='write MGF file') - parser.add_argument('--config', - help='MS2PIP config file') - - args = parser.parse_args() - - if args.peprec and not args.config: - parser.error("--peprec requires --config") - - if args.peprec: - modifications = get_modification_config(args.config) - blib_to_peprec(args.blib_filename, args.peprec, modifications) - - if args.mgf: - blib_to_mgf(args.blib_filename, args.mgf) - - -if __name__ == "__main__": - main() diff --git a/conversion_tools/calc_correlations.py b/conversion_tools/calc_correlations.py deleted file mode 100644 index 4133304e..00000000 --- a/conversion_tools/calc_correlations.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Calculate correlations from a pred_and_emp.csv file or calculate median -correlations by ion type from correlations.csv -""" - - -import argparse -import pandas as pd -from ms2pipC import calc_correlations - - -def argument_parser(): - parser = argparse.ArgumentParser() - parser.add_argument("pae", metavar="", - help="pred_and_emp.csv or correlations.csv file") - args = parser.parse_args() - return args - - -def main(): - args = argument_parser() - all_preds = pd.read_csv(args.pae) - if args.pae.endswith('pred_and_emp.csv'): - print('Computing correlations...') - correlations = calc_correlations(all_preds) - output_filename = args.pae.replace('pred_and_emp.csv', 'correlations.csv') - print("Writing to {}".format(output_filename)) - correlations.to_csv("{}_correlations.csv".format(output_filename), index=True) - else: - correlations = all_preds - print("Median correlations: ") - print(correlations.groupby('ion')['pearsonr'].median()) - - -if __name__ == '__main__': - main() diff --git a/conversion_tools/calc_perc_found_peaks.py b/conversion_tools/calc_perc_found_peaks.py deleted file mode 100644 index 8afcd4ec..00000000 --- a/conversion_tools/calc_perc_found_peaks.py +++ /dev/null @@ -1,53 +0,0 @@ -""" -Calculate percentage of found peaks - -Returns the percentage of values in the series that ar not close to log2(0.001), -which is the value MS2PIP gives a peak if it was not found in the spectrum. -""" - -import sys -from math import isclose - -import numpy as np -import pandas as pd - - -def calc_percentage_found_peaks(series): - """ - Calculate percentage of found peaks - - Returns the percentage of values in the series that ar not close to log2(0.001), - which is the value MS2PIP gives a peak if it was not found in the spectrum. - """ - zero = np.log2(0.001) - try: - x = series.apply(lambda x: isclose(x, zero, abs_tol=0.001)).value_counts( - normalize=True - )[False] - except KeyError: - x = 0.0 - return x - - -def main(): - if sys.argv[1].endswith(".csv"): - vectors = pd.read_csv(sys.argv[1]) - elif sys.argv[1].endswith(".h5"): - vectors = pd.read_hdf(sys.argv[1], key="table") - elif sys.argv[1].endswith("pkl"): - vectors = pd.read_pkl(sys.argv[1]) - else: - print("Unknown file extension") - exit(1) - - for ion_type in [ - col.split("targets_")[1] - for col in vectors.columns - if col.startswith("targets_") - ]: - perc_found = calc_percentage_found_peaks(vectors["targets_" + ion_type]) - print("Percentage of found {}-ion peaks: {}".format(ion_type, perc_found)) - - -if __name__ == "__main__": - main() diff --git a/conversion_tools/extpsmreport_to_peprec.py b/conversion_tools/extpsmreport_to_peprec.py deleted file mode 100644 index d7b4c98b..00000000 --- a/conversion_tools/extpsmreport_to_peprec.py +++ /dev/null @@ -1,130 +0,0 @@ -""" -Extended PSM Report to PEPREC -extpsmreport_to_peprec.py - -Create a PEPREC file out of a PeptideShaker Extended PSM Report. Attention: The -resulting PEPREC file will contain all PSMs and is not filtered on FDR or decoy -hits. -""" - -__author__ = "Tim Van Den Bossche" -__credits__ = ["Tim Van Den Bossche", "Ralf Gabriels", "Sven Degroeve", "Lennart Martens"] -__license__ = "Apache License, Version 2.0" -__version__ = "0.1" -__email__ = "Ralf.Gabriels@ugent.be" - - -# Native libraries -import argparse - -# Third party libraries -import pandas as pd - - -def modifications(modified_seq): - # initiate variables for nterm, seq and cterm - mod_list = list() - nterm, seq, cterm = modified_seq.split("-") - - # initiatle variable for nterm - pyro_bool = False - - # initiate variables for seq - mod_index = 0 - mod_description = False # to check if it's an amino acid (False) or a description in < ... > (True) - - # check amino terminus for modifications - if nterm == "ace": - mod_list.append("0|Acetyl") - elif nterm == "pyro": - pyro_bool = True - elif nterm != "NH2": - print("Unknown N-terminal modification: {}".format(nterm)) - - # check internal sequence - for char in seq: - if char == "<": - mod_peprec = "{}|".format(mod_index) - mod_name = "" - mod_description = True - elif char == ">": - mod_description = False - if mod_name == 'ox': - mod_peprec += 'Oxidizedmethionine' # allow only oxidation of Met!! - elif mod_name == 'cmm': - mod_peprec += 'Carbamidomethylation' - elif mod_name == 'deam': - mod_peprec += 'Deamidated' - else: - print("Unknown internal modification: {}".format(mod_name)) - mod_list.append("{}".format(mod_peprec)) # peprec format - mod_peprec = "" - - else: - if pyro_bool: - if char == 'C': - mod_name = "Pyro-carbamidomethyl" - elif char == 'Q': - mod_name = "Gln->pyro-Glu" - elif char == 'E': - mod_name = "Glu->pyro-Glu" - elif char == 'P': - mod_name = "Pro->pyro-Glu" - else: - print("Unknown N-terminal pyro modification from {}".format(char)) - mod_list.append("1|{}".format(mod_name)) - pyro_bool = False - mod_index += 1 - mod_name = "" - else: - if mod_description: - mod_name += char - else: - mod_index += 1 - - mods_peprec = "|".join(mod_list) - if mods_peprec == "": - mods_peprec = "-" - return mods_peprec - - -def ArgParse(): - parser = argparse.ArgumentParser(description='Create a PEPREC file out of a PeptideShaker Extended PSM Report.') - parser.add_argument("input", metavar="", - help="Path to input Extended PSM Report") - parser.add_argument('-o', '--out', dest='outname', action='store', default='SpecLib', - help='Name for output file (default: "SpecLib")') - args = parser.parse_args() - - return(args) - - -def main(): - args = ArgParse() - - df = pd.read_excel(str(args.input)) - - spec_id = list() - mods_peprec = list() - peptide = list() - charge = list() - label = list() - - for index, row in df.iterrows(): - spec_id.append(row["Spectrum Title"]) - mods_peprec.append(modifications(row["Modified Sequence"])) - peptide.append(row["Sequence"]) - charge.append(row["Measured Charge"][:-1]) - if row["Decoy"] == 0: - label.append(1) - elif row["Decoy"] == 1: - label.append(-1) - else: - print("No target/decoy label available for {}".format(row["Spectrum Title"])) - - d = {"spec_id": spec_id, "modifications": mods_peprec, "peptide": peptide, "charge": charge, "label":label} - peprec_df = pd.DataFrame(data=d, columns=["spec_id", "modifications", "peptide", "charge", "label"]) - peprec_df.to_csv('{}.peprec'.format(args.outname), index=False, sep="\t", quotechar="'") - -if __name__ == "__main__": - main() diff --git a/conversion_tools/fasta_to_peprec.py b/conversion_tools/fasta_to_peprec.py deleted file mode 100644 index 02a95e06..00000000 --- a/conversion_tools/fasta_to_peprec.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -## Create PEPREC file from a FASTA file -Takes every protein in a FASTA file, generates a PEPREC file with all tryptic peptides (check global variables to set charge states, min/max lengths and number of missed cleavages) - -*fasta_to_peprec.py* -**Input:** FASTA file -**Output:** PEPREC file - -**Requirements:** Biopython to parse FASTA file; Pyteomics for in silico cleavage; tqdm for progress bar. - -``` -usage: fasta_to_peprec.py [-h] fasta_file - -Generate a PEPREC file for all proteins in a fasta file - -positional arguments: - fasta_file FASTA file with proteins of interest - -optional arguments: - -h, --help show this help message and exit -``` -""" - -# -------------- -# Import modules -# -------------- -from Bio import SeqIO -from pyteomics.parser import cleave -from pyteomics.parser import expasy_rules -import argparse -from tqdm import tqdm - -parser = argparse.ArgumentParser() - -parser.add_argument('fasta_file', help='FASTA file with all proteins of interest') - -args = parser.parse_args() - -# -------------- -# Global variables -# -------------- -CHARGES = [1, 2, 3] -NUM_MISSED = 2 -MIN_LEN = 8 -MAX_LEN = 50 - -peprec_file = open(args.fasta_file.replace('.fasta', '.PEPREC'), 'wt') -peprec_file.write('spec_id modifications peptide charge protein\n') - -n_prots = (sum(1 for x in SeqIO.parse(open(args.fasta_file),'fasta'))) -print('{} proteins in fasta file \n'.format(n_prots)) - -fasta_sequences = SeqIO.parse(open(args.fasta_file),'fasta') -aa = set(["A", "C", "D", "E", "F", "G", "H", "L", "I", "K", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "Y"]) - -i = 0 # identifier for each (peptide, charge) - -for fasta in tqdm(fasta_sequences, total=n_prots): - prot, seq = fasta.id, str(fasta.seq) - # seq = seq.replace('L','I') - RKcut = cleave(seq, '[KR]', NUM_MISSED, MIN_LEN) - # RKcut = cleave(seq, expasy_rules['trypsin'], NUM_MISSED, MIN_LEN) - for k in RKcut: - if (set(k) <= aa) & (len(k) <= MAX_LEN): # to exclude peptides with aa which aren't the standard - for charge in CHARGES: - i += 1 - to_write = '{} {} {} {} {}\n'.format(i, '', k, charge, prot) - peprec_file.write(to_write) - else: continue - -peprec_file.close() diff --git a/conversion_tools/maxquant_to_peprec.py b/conversion_tools/maxquant_to_peprec.py deleted file mode 100644 index 9162cb09..00000000 --- a/conversion_tools/maxquant_to_peprec.py +++ /dev/null @@ -1,191 +0,0 @@ -__author__ = "Ralf Gabriels" -__credits__ = ["Ralf Gabriels", "Sven Degroeve", "Lennart Martens"] -__license__ = "Apache License, Version 2.0" -__version__ = "0.1" -__email__ = "Ralf.Gabriels@ugent.be" - - -# Native libraries -import re -import logging -import argparse - -# Third party libraries -import pandas as pd -from tqdm import tqdm - - -def maxquant_to_peprec(evidence_file, msms_file, - ptm_mapping={'(ox)': 'Oxidation', '(ac)': 'Acetyl', '(cm)': 'Carbamidomethyl'}, - fixed_modifications=[]): - """ - Make an MS2PIP PEPREC file starting from the MaxQuant Evidence.txt and - MSMS.txt files. - - Positional arguments: - `evidence_file`: str with the file location of the Evidence.txt file - `msms_file`: str with the file location of the MSMS.txt file - - Keyword arguments: - `ptm_mapping` (dict) is used to convert the MaxQuant PTM labels to PSI-MS - modification names. For correct parsing, the key should always include the - two brackets. - `fixed_modifications` (list of tuples, [(aa, ptm)]) can contain fixed - modifications to be added to the peprec. E.g. `[('C', 'cm')]`. The first - tuple element contains the one-letter amino acid code. The second tuple - element contains a two-character label for the PTM. This PTM also needs - to be present in the `ptm_mapping` dictionary. - """ - - for (aa, mod) in fixed_modifications: - if re.fullmatch('[A-Z]|n_term', aa) is None: - raise ValueError("Fixed modification amino acid `{}` should be a single capital character (e.g. `C`) or `n_term`.".format(aa)) - if re.fullmatch('[a-z]{2}', mod) is None: - raise ValueError("Fixed modification label `{}` can only contain two non-capital characters. E.g. `cm`".format(mod)) - - logging.debug("Start converting MaxQuant output to MS2PIP PEPREC") - - # Read files and merge into one dataframe - evidence = pd.read_csv(evidence_file, sep='\t') - msms = pd.read_csv(msms_file, sep='\t', low_memory=False) - m = evidence.merge(msms[['Scan number', 'Scan index', 'id']], left_on='Best MS/MS', right_on='id') - - # Filter data - logging.debug("Removing decoys, contaminants, spectrum redundancy and peptide redundancy") - len_dict = { - 'len_original': len(m), - 'len_rev': len(m[m['Reverse'] == '+']), - 'len_con': len(m[m['Potential contaminant'] == '+']), - 'len_spec_red': len(m[m.duplicated(['Raw file', 'Scan number'], keep='first')]), - 'len_pep_red': len(m[m.duplicated(['Sequence', 'Modifications', 'Charge'], keep='first')]) - } - - m = m.sort_values('Score', ascending=False) - m = m[m['Reverse'] != '+'] - m = m[m['Potential contaminant'] != '+'] - m = m[~m.duplicated(['Raw file', 'Scan number'], keep='first')] - m = m[~m.duplicated(['Sequence', 'Modifications', 'Charge'], keep='first')] - m.sort_index(inplace=True) - - len_dict['len_filtered'] = len(m) - - print_psm_counts = """ - Original number of PSMs: {len_original} - - {len_rev} decoys - - {len_con} potential contaminants - - {len_spec_red} redundant spectra - - {len_pep_red} redundant peptides - = {len_filtered} PSMs in spectral library - """.format(**len_dict) - - logging.info(print_psm_counts) - logging.info("Spectral library FDR estimated from PEPs: {:.4f}".format(m['PEP'].sum()/len(m))) - - # Parse modifications - logging.debug("Parsing modifications") - for aa, mod in fixed_modifications: - if aa == 'n_term': - m['Modified sequence'] = ['_({})'.format(mod) + seq[1:] if seq[:2] != '_(' else seq for seq in m['Modified sequence']] - else: - m['Modified sequence'] = m['Modified sequence'].str.replace(aa, '{}({})'.format(aa, mod)) - - pattern = r'\([a-z].\)' - m['Parsed modifications'] = ['|'.join(['{}|{}'.format(m.start(0) - 1 - i*4, ptm_mapping[m.group()]) for i, m in enumerate(re.finditer(pattern, s))]) for s in m['Modified sequence']] - m['Parsed modifications'] = ['-' if mods == '' else mods for mods in m['Parsed modifications']] - - # Prepare PEPREC columns - logging.debug("Preparing PEPREC columns") - m['Protein list'] = m['Proteins'].str.split(';') - m['MS2PIP ID'] = range(len(m)) - peprec_cols = ['MS2PIP ID', 'Sequence', 'Parsed modifications', 'Charge', 'Protein list', - 'Retention time', 'Score', 'Delta score', 'PEP', 'Scan number', 'Scan index', 'Raw file'] - peprec = m[peprec_cols].sort_index().copy() - - col_mapping = { - 'MS2PIP ID': 'spec_id', - 'Sequence': 'peptide', - 'Parsed modifications': 'modifications', - 'Charge': 'charge', - 'Protein list': 'protein_list', - 'Retention time': 'rt', - 'Score': 'andromeda_score', - 'Delta score': 'andromeda_delta_score', - 'PEP': 'andromeda_pep', - 'Scan number': 'scan_number', - 'Scan index': 'scan_index', - 'Raw file': 'raw_file' - } - - peprec = peprec.rename(columns=col_mapping) - - logging.debug('PEPREC ready!') - - return peprec - - -def scan_mgf(peprec, mgf_folder, filename_col='raw_file', scan_num_col='scan_number', outname='SpecLib.mgf'): - with open(outname, 'w') as out: - count_runs = 0 - count = 0 - runs = peprec[filename_col].unique() - logging.info("Scanning MGF files...") - for run in tqdm(runs): - spec_dict = dict(('msmsid:F{:06d}'.format(v), k) for k, v in peprec[(peprec[filename_col] == run)][scan_num_col].to_dict().items()) - - # Parse file - found = False - with open('{}/{}.mgf'.format(mgf_folder, str(run)), 'r') as f: - for i, line in enumerate(f): - if 'TITLE' in line: - scan_num = re.split('TITLE=|,', line)[1] # Edit to match MGF title notation - if scan_num in spec_dict: - found = True - out.write("BEGIN IONS\n") - line = "TITLE=" + str(spec_dict[scan_num]) + '\n' - count += 1 - if 'END IONS' in line: - if found: - out.write(line + '\n') - found = False - if found and line[-4:] != '0.0\n': - out.write(line) - logging.info("%i/%i spectra found and written to new MGF file.", count, len(peprec)) - - -def ArgParse(): - parser = argparse.ArgumentParser(description='Convert MaxQuant txt files and MGF to MS2PIP spectral library.') - parser.add_argument('-t', '--txt', dest='txt_folder', action='store', default='txt', - help='Folder with MaxQuant txt files (default: "msf")') - parser.add_argument('-g', '--mgf', dest='mgf_folder', action='store', default='mgf', - help='Folder with MGF spectrum files (default: "mgf")') - parser.add_argument('-o', '--out', dest='outname', action='store', default='SpecLib', - help='Name for output files (default: "SpecLib")') - args = parser.parse_args() - - return(args) - - -def main(): - args = ArgParse() - - logging.basicConfig( - format='%(asctime)s - %(levelname)s - %(message)s', - datefmt='%Y-%m-%d %H:%M:%S', - level=logging.DEBUG - ) - - peprec = maxquant_to_peprec( - '{}/evidence.txt'.format(args.txt_folder), - '{}/msms.txt'.format(args.txt_folder), - ptm_mapping={'(ox)': 'Oxidation', '(ac)': 'Acetyl', '(cm)': 'Carbamidomethyl', '(tn)': 'TMT6plexN', '(tk)': 'TMT6plex'}, - fixed_modifications=[('C', 'cm'), ('n_term', 'tn'), ('K', 'tk')] - ) - - peprec.to_csv('{}.peprec'.format(args.outname), sep=' ', index=False) - - scan_mgf(peprec, args.mgf_folder, outname='{}.mgf'.format(args.outname)) - - - -if __name__ == '__main__': - main() diff --git a/conversion_tools/misc_functions.py b/conversion_tools/misc_functions.py deleted file mode 100644 index f9ef1102..00000000 --- a/conversion_tools/misc_functions.py +++ /dev/null @@ -1,85 +0,0 @@ -""" -Miscellaneous functions regarding MS2PIP file conversions. -""" - - -import re -import pandas as pd - - -def add_fixed_mods(peprec, fixed_mods=None, n_term=None, c_term=None): - """ - Add 'fixed' modifications to all peptides in an MS2PIP PEPREC file. - Return list with MS2PIP modifications with fixed mods added. - - Positional arguments: - peprec - MS2PIP PEPREC DataFrame - - Keyword arguments: - fixed_mods - List of tuples. First tuple element is amino acid, second tuple - element is modification name. E.g. `[('K', 'TMT6plex')]` - n_term - Name of fixed N-terminal modification to add - c_term - Name of fixed C-terminal modification to add - """ - - if not fixed_mods: - fixed_mods = [] - - result = [] - - for _, row in peprec.iterrows(): - mods = row['modifications'] - if mods == '-': - mods = [] - else: - mods = mods.split('|') - - current_mods = list(zip([int(i) for i in mods[::2]], mods[1::2])) - - for aa, mod in fixed_mods: - current_mods.extend([(m.start()+1, mod) for m in re.finditer(aa, row['peptide'])]) - - if n_term and not 0 in [i for i, n in current_mods]: - current_mods.append((0, n_term)) - if c_term and not -1 in [i for i, n in current_mods]: - current_mods.append((-1, c_term)) - - current_mods = sorted(current_mods, key=lambda x: x[0]) - current_mods = '|'.join(['|'.join(m) for m in [(str(i), n) for i, n in current_mods]]) - result.append(current_mods) - - return result - - -def peprec_add_charges(peprec_filename, mgf_filename, overwrite=False): - """ - Get precursor charges from MGF file and add them to a PEPREC - """ - peprec = pd.read_csv(peprec_filename, sep=' ', index_col=None) - - if not overwrite and 'charge' in peprec.columns: - print('Charges already in PEPREC') - return None - - spec_count = 0 - charges = {} - with open(mgf_filename, 'rt') as f: - for line in f: - if line.startswith('TITLE='): - title = line[6:].strip() - spec_count += 1 - if line.startswith('CHARGE='): - charge = line[7:].strip() - charges[title] = charge - - if not spec_count == len(charges): - print('Something went wrong') - return None - - peprec['charge'] = peprec['spec_id'].map(charges) - - new_peprec_filename = re.sub('\.peprec$|\.PEPREC$', '', peprec_filename) + '_withcharges.peprec' - peprec.to_csv(new_peprec_filename, sep=' ', index=False) - - print('PEPREC with charges written to ' + new_peprec_filename) - return peprec \ No newline at end of file diff --git a/conversion_tools/mztab_to_peprec.py b/conversion_tools/mztab_to_peprec.py deleted file mode 100644 index a08e6301..00000000 --- a/conversion_tools/mztab_to_peprec.py +++ /dev/null @@ -1,115 +0,0 @@ -""" -Convert mzTab to MS2PIP PEPREC spectral library (only unique peptides) -""" - -import argparse -import pandas as pd - - -def argument_parser(): - parser = argparse.ArgumentParser( - description='Convert mzTab to MS2PIP PEPREC spectral library (only\ - unique peptides)' - ) - parser.add_argument( - 'filename', metavar="", - help='filename of mzTab to convert.' - ) - args = parser.parse_args() - return args - - -def get_peptide_table_start(filename): - """ - Get line number of mzTab peptide table start. - """ - with open(filename, 'rt') as f: - for num, line in enumerate(f, 1): - if line[:3] == 'PSH': - return num - - -def parse_mods(modifications, psimod_mapping=None): - """ - Convert mzTab modifications to MS2PIP PEPREC modifications. - - N-term and C-term modifications are not supported! - Entries without modifications should be given as '-' - - modifications - Pandas Series containing mzTab-formatted modifications - psimod_mapping - dictionary that maps PSI-MOD entries to desired modification names - """ - if not psimod_mapping: - psimod_mapping = psimod_mapping = { - 'MOD:00394': 'Acetyl', - 'MOD:00400': 'Deamidated', - 'MOD:00425': 'Oxidation', - 'MOD:01214': 'Carbamidomethyl' - } - - parsed_mods = [] - for mods in modifications.fillna('-').str.split(',|-'): - if mods == ['', '']: - parsed_mods.append('-') - else: - pos = [str(int(p) + 1) for p in mods[::2]] - names = [psimod_mapping[n] for n in mods[1::2]] - parsed_mods.append('|'.join(['|'.join(i) for i in list(zip(pos, names))])) - - return parsed_mods - - -def main(): - args = argument_parser() - - start = get_peptide_table_start(args.filename) - mztab = pd.read_csv(args.filename, sep='\t', skiprows=start-1) - - # Filter unique peptides - unique_count = mztab.duplicated(['sequence', 'modifications', 'charge']).value_counts().to_dict() - if False in unique_count: - unique_count = unique_count[False] - else: - unique_count = 0 - print("mzTab contains {} unique peptides".format(unique_count)) - - mztab.sort_values('search_engine_score[1]', ascending=False, inplace=True) - mztab = mztab[~mztab.duplicated(['sequence', 'modifications', 'charge'])].copy() - - - # Check modifications present in mzTab - flatten = lambda l: [item for sublist in l for item in sublist] - unique_mods = set(flatten([mods[1::2] for mods in mztab['modifications'].dropna().str.split(',|-')])) - print("mzTab contains these modifications: {}".format(unique_mods)) - - mztab['parsed_modifications'] = parse_mods(mztab['modifications']) - - peprec_cols = [ - 'PSM_ID', - 'sequence', - 'parsed_modifications', - 'charge', - 'retention_time', - 'calc_mass_to_charge', - 'search_engine_score[1]', - ] - - peprec_col_mapping = { - 'PSM_ID': 'spec_id', - 'sequence': 'peptide', - 'parsed_modifications': 'modifications', - 'charge': 'charge', - 'retention_time': 'rt', - 'calc_mass_to_charge': 'mz', - 'search_engine_score[1]': 'score' - } - - peprec = mztab[peprec_cols] - peprec = peprec.rename(columns=peprec_col_mapping) - - filename_stripped = '.'.join(args.filename.split('.')[:-1]) - peprec.to_csv(filename_stripped + '.peprec', sep=' ', index=False) - - -if __name__ == '__main__': - main() diff --git a/conversion_tools/peprec_add_phospho_suffix.py b/conversion_tools/peprec_add_phospho_suffix.py deleted file mode 100644 index 3d59deff..00000000 --- a/conversion_tools/peprec_add_phospho_suffix.py +++ /dev/null @@ -1,91 +0,0 @@ -""" -## PEPREC: Add Phospho Suffix -Adds amino acid suffix to "Phospho" modifications in PEPREC file. "Phospho" becomes, -for instance, "PhosphoY". Also, for unmodified peptides, a hyphen is added to the -PEPREC file. - -*peprec_add_phospho_suffix.py* -**Input:** Folder with PEPREC files -**Output:** PEPREC files with amino acid suffix added to "Phospho" modifications -usage: peprec_add_phospho_suffix.py [-h] [-f PEPREC_FOLDER] [-r] - -Add amino acid suffix to "Phospho" in modifications column in PEPREC file(s). - -optional arguments: - -h, --help show this help message and exit - -f PEPREC_FOLDER Folder with input PEPREC files (default: "") - -r Replace the original PEPREC files instead of writing a new - file (default: False) -""" - -# -------------- -# Import modules -# -------------- -import argparse -import pandas as pd -from glob import glob - - -# --------------------------------------------------------- -# Find all PEPREC files in folder in a case-insensitive way -# --------------------------------------------------------- -def FindFiles(pattern): - def either(c): - return '[%s%s]' % (c.lower(), c.upper()) if c.isalpha() else c - return glob(''.join(map(either, pattern))) - - -# ------------------------------------ -# Add AA suffix to 'Phospho' in PEPREC -# ------------------------------------ -def AddPhosphoSuffix(peprec): - peprec = peprec.fillna('-') - peprec['modifications'] = [[x for x in zip([x for (i, x) in enumerate(mods) if i % 2 == 0], [x for (i, x) in enumerate(mods) if i % 2 != 0])] for mods in peprec['modifications'].str.split('|')] - res = [] - for _, row in peprec.iterrows(): - if row['modifications']: - res.append([(mod[0], mod[1] + row['peptide'][int(mod[0]) - 1]) if mod[1] == 'Phospho' else mod for mod in row['modifications']]) - else: - res.append('-') - res = ['|'.join(['|'.join(mod) for mod in row]) for row in res] - peprec['modifications'] = res - return(peprec) - - -# --------------- -# Argument parser -# --------------- -def ArgParse(): - parser = argparse.ArgumentParser(description='Add amino acid suffix to "Phospho" in modifications column in PEPREC file(s).') - parser.add_argument('-f', dest='peprec_folder', action='store', default='', - help='Folder with input PEPREC files (default: "")') - parser.add_argument('-r', dest='replace', action="store_true", default=False, - help='Replace the original PEPREC files instead of writing a new file (default: False)') - args = parser.parse_args() - return(args) - - -# ---- -# Run! -# ---- -def run(): - args = ArgParse() - if args.peprec_folder: - files = FindFiles("{}/*.peprec".format(args.peprec_folder)) - else: - files = FindFiles("*.peprec") - - if not files: - print("No PEPREC files found.") - else: - for peprec_file in files: - peprec = pd.read_csv(peprec_file, sep=' ') - peprec_out = AddPhosphoSuffix(peprec) - if not args.replace: - peprec_file = "{}_WithSuffix.peprec".format(peprec_file.strip('.peprec')) - peprec_out.to_csv(peprec_file, index=None, sep=' ', na_rep='-') - print("Ready!") - - -if __name__ == "__main__": - run() diff --git a/conversion_tools/predictions_to_ms2.py b/conversion_tools/predictions_to_ms2.py deleted file mode 100644 index 10c2b85a..00000000 --- a/conversion_tools/predictions_to_ms2.py +++ /dev/null @@ -1,95 +0,0 @@ -""" -## Create MS2+SSL files (SkyLine spectral library formats) from MS2PIP predictions -Takes spectra predicted by MS2PIP and write SkyLine spectral library format MS2+SSL - -*predictions_to_ms2.py* -**Input:** MS2PIP predictions file -**Output:** MS2 and SSL files - -**Requirements:** Pyteomics for mass calculations; tqdm for progress bar. - -``` -usage: predictions_to_ms2.py [-h] pep_file - -Generate MS2 and SSL files from MS2PIP predictions - -positional arguments: - pep_file PEPREC file used to generate predictions - -optional arguments: - -h, --help show this help message and exit -``` -""" - -# -------------- -# Import modules -# -------------- -import sys -import argparse -from time import localtime, strftime -from operator import itemgetter -from pyteomics import mass -from tqdm import tqdm - -parser = argparse.ArgumentParser() - -parser.add_argument("pep_file", help="PEPREC file used to get the ms2pip predictions") -args = parser.parse_args() - -pep_dict = {} -# pep_dic[spec_id] = [sequence, charge] -pep = open(args.pep_file) -for line in pep: - split_line = line.split(' ') - # first line must be the header, and first header must be spec_id: - if line.startswith('spec_id'): - seq_id = split_line.index('peptide') - ch_id = split_line.index('charge') - continue - pep_dict[split_line[0]] = [split_line[2], int(split_line[3])] - -pred_dict = {} -# pred_dict[spec_id] = [[mz_list], [int_list]] -pred = open(args.pep_file.replace(".PEPREC", "_predictions.csv")) -for line in pred: - if line.startswith('spec_id'): continue - split_line = line.strip().split(',') - if split_line[0] in pred_dict.keys(): - pred_dict[split_line[0]][0].append(float(split_line[5])) - pred_dict[split_line[0]][1].append(float(split_line[6])) - else: - pred_dict[split_line[0]] = [[float(split_line[5])], [float(split_line[6])]] - -ssl_output = open(args.pep_file + ".ssl", "w+") -ssl_output.write('file\tscan\tcharge\tsequence\n') - -sys.stdout.write("writing {} file\n".format(args.pep_file.replace(".PEPREC", ".ssl"))) - -peptides = pred_dict.keys() - -ms2_name = args.pep_file.replace(".PEPREC",".ms2") -sys.stdout.write("writing {} file\n".format(ms2_name)) -ms2_output = open(ms2_name, "w+") -ms2_output.write("H\tCreationDate\t{}\n".format(strftime("%Y-%m-%d %H:%M:%S", localtime()))) -ms2_output.write("H\tExtractor\tMS2PIP Predictions\n") - -for i, peptide in tqdm(enumerate(pep_dict.keys()), total=len(pep_dict.keys())): - intensities = list(map(lambda x: (2**x)+0.001, pred_dict[peptide][1])) - spectrum = zip(pred_dict[peptide][0], intensities) - spectrum = sorted(spectrum, key=itemgetter(0)) - - seq = pep_dict[peptide][0] - prec_mass = mass.calculate_mass(sequence=seq) - charge = pep_dict[peptide][1] - - ssl_output.write("{}\t{}\t{}\t{}\n".format(ms2_name, peptide, charge, seq)) - ms2_output.write("S\t{}\t{}\n".format(peptide, prec_mass)) - ms2_output.write("Z\t{}\t{}\n".format(int(charge), int(charge)*prec_mass)) - ms2_output.write("D\tseq\t{}\n".format(seq)) - - for mz, inte in spectrum: - ms2_output.write("{}\t{}\n".format(mz, inte)) - -ms2_output.close() - -ssl_output.close() diff --git a/conversion_tools/sample_peptides_n.py b/conversion_tools/sample_peptides_n.py deleted file mode 100644 index 05686477..00000000 --- a/conversion_tools/sample_peptides_n.py +++ /dev/null @@ -1,110 +0,0 @@ -#!/usr/bin/env python -import sys -import pandas as pd -#import numpy as np -#import cPickle as pickle - - -def return_len(x): - return len(x) - - -def is_mod(x): - if "|" in str(x): - tmp = x.split("|") - ok = True - for i in range(1, len(tmp), 2): - if (tmp[i] != "CAM") & (tmp[i] != "Oxidation"): - ok = False - if ok: - return 0 - else: - return 1 - return 0 - - -def return_charge(x): - return tmap[x] - - -def is_tryp(x): - if (x[-1] == 'R') | (x[-1] == 'K'): - return 1 - return 0 - - -def add_pepmass(x): - return tmap[x] - - -def add_cpepmass(x): - return 18.010601+sum([AminoMass[a] for a in x[2]])+x[6]*57.02146 - - -def num_CAM(x): - n = 0 - if "|" in str(x): - tmp = x.split("|") - # ok = True - for i in range(1, len(tmp), 2): - if tmp[i] == "CAM": - n += 1 - return n - - -def isA(x): - return x[0] - - -tmap = {} - -AminoMass = { - 'A': 71.037114, 'C': 103.009185, 'D': 115.026943, 'E': 129.042593, 'F': 147.068414, 'G': 57.021464, - 'H': 137.058912, 'I': 113.084064, 'K': 128.094963, 'L': 113.084064, 'M': 131.040485, 'N': 114.042927, - 'P': 97.052764, 'Q': 128.058578, 'R': 156.101111, 'S': 87.032028, 'T': 101.047679, 'V': 99.068414, - 'W': 186.079313, 'Y': 163.063329, -} - -f = open(sys.argv[2]) -title = "" -# pepmass = 0 -charge = 0 -while True: - rows = f.readlines(1000000) - if not rows: - break - for row in rows: - row = row.rstrip() - if row == "": - continue - if row[:5] == "TITLE": - title = row[6:] - elif row[:6] == "CHARGE": - charge = int(row[7:9].replace("+", "")) - # elif row[:7] == "PEPMASS": - # pepmass = float(row[8:].split()[0]) - elif row[:8] == "END IONS": - # tmap[title] = (float(pepmass) * (charge)) - ((charge)*1.007825035) - tmap[title] = charge - -# pickle.dump( tmap, open( "tmap.p", "wb" ) ) -# tmap = pickle.load( open( "tmap.p", "rb" ) ) - -sys.stderr.write("done\n") - -sys.stderr.write('reading file\n') -data = pd.read_csv(sys.argv[1], sep=' ') -data['peplen'] = data['peptide'].apply(return_len) -data['mod'] = data['modifications'].apply(is_mod) -data['charge'] = data['spec_id'].apply(return_charge) - -# data['numcam'] = data['modifications'].apply(num_CAM) -# data['pepmass'] = data['spec_id'].apply(add_pepmass) -# data['cpepmass'] = data.apply(add_cpepmass,axis=1) -# data['diff'] = np.abs(data['pepmass']-data['cpepmass']) - -data = data[data.peplen >= 8] -data = data[data['mod'] == 0] - -datatmp = data.drop_duplicates(subset=['peptide', 'charge'], keep="last") -datatmp[['spec_id', 'modifications', 'peptide', 'charge']].to_csv(sys.argv[1] + ".ms2pip", index=False, sep=" ") diff --git a/conversion_tools/scan_mgf.py b/conversion_tools/scan_mgf.py deleted file mode 100644 index 1dbecf92..00000000 --- a/conversion_tools/scan_mgf.py +++ /dev/null @@ -1,148 +0,0 @@ -""" -Scan MGF files in a given folder for spectra present in a given PEPREC file and -write those spectra to a new MGF file. -""" - - -__author__ = "Ralf Gabriels" -__credits__ = ["Ralf Gabriels", "Sven Degroeve", "Lennart Martens"] -__license__ = "Apache License, Version 2.0" -__version__ = "0.1" -__email__ = "Ralf.Gabriels@ugent.be" - - -import argparse -import mmap -import os - -import pandas as pd - -try: - from tqdm import tqdm -except ImportError: - USE_TQDM = False -else: - USE_TQDM = True - - -def argument_parser(): - parser = argparse.ArgumentParser( - description="Scan MGF files in a given folder for spectra present in a\ - given PEPREC file and write those spectra to a new MGF file." - ) - parser.add_argument( - "mgf", - metavar="", - help="Path to MGF file or directory containing MGF files. If a\ - directory is given, the peprec file must have an additional column\ - `mgf_filename` that contains the respective MGF filename.", - ) - parser.add_argument( - "peprec", - metavar="", - help="Path to PEPREC file that contains the peptides to scan for.", - ) - # parser.add_argument( - # "--tqdm", action="store_true", dest='use_tqdm', default=False, - # help="Use tqdm to display progress bar." - # ) - args = parser.parse_args() - return args - - -def get_num_lines(file_path): - fp = open(file_path, "r+") - buf = mmap.mmap(fp.fileno(), 0) - lines = 0 - while buf.readline(): - lines += 1 - return lines - - -def scan_mgf( - df_in, - mgf_folder, - outname="scan_mgf_result.mgf", - filename_col="mgf_filename", - spec_title_col="spec_id", - use_tqdm=False, -): - if df_in[filename_col].iloc[0][-4:] in [".mgf", ".MGF"]: - file_suffix = "" - else: - file_suffix = ".mgf" - - with open(outname, "w") as out: - count_runs = 0 - count = 0 - runs = df_in[filename_col].unique() - print( - "Scanning MGF files: {} runs to do. Now working on run: ".format(len(runs)), - end="", - ) - for run in runs: - count_runs += 1 - if count_runs % 10 == 0: - print(str(count_runs), end="") - else: - print(".", end="") - - spec_dict = dict( - (v, k) - for k, v in df_in[(df_in[filename_col] == run)][spec_title_col] - .to_dict() - .items() - ) - - found = False - current_mgf_file = "{}/{}{}".format(mgf_folder, str(run), file_suffix) - with open(current_mgf_file, "r") as f: - if use_tqdm: - mgf_iterator = tqdm(f, total=get_num_lines(current_mgf_file)) - else: - mgf_iterator = f - for line in mgf_iterator: - if "TITLE" in line: - title = line[6:].strip() - if title in spec_dict: - found = True - out.write("BEGIN IONS\n") - # line = "TITLE=" + str(spec_dict[title]) + "\n" - count += 1 - if "END IONS" in line: - if found: - out.write(line + "\n") - found = False - if found and line[-4:] != "0.0\n": - out.write(line) - - print( - "\n{}/{} spectra found and written to new MGF file.".format(count, len(df_in)) - ) - - -def main(): - args = argument_parser() - - peprec = pd.read_csv(args.peprec, sep=" ") - - if os.path.isfile(args.mgf): - mgf = os.path.dirname(args.mgf) - peprec["mgf_filename"] = os.path.basename(args.mgf) - elif os.path.isdir(args.mgf): - assert ( - "mgf_file" in peprec.columns - ), "Path to MGF directory was given, but PEPREC does not contain a `mgf_file` \ -column." - mgf = args.mgf - else: - print(f"{args.mgf} does not exist.") - exit(1) - - outname = os.path.splitext(args.mgf)[0] + "_scanned.mgf" - - scan_mgf(peprec, mgf, outname=outname, use_tqdm=USE_TQDM) - - -if __name__ == "__main__": - main() diff --git a/conversion_tools/speclib_to_mgf.py b/conversion_tools/speclib_to_mgf.py deleted file mode 100644 index aebf7b7f..00000000 --- a/conversion_tools/speclib_to_mgf.py +++ /dev/null @@ -1,236 +0,0 @@ -#!/usr/bin/env python3 -""" -Convert MSP and SPTXT spectral library files. - -Writes three files: mgf with the spectra; PEPREC with the peptide sequences; -meta with additional metainformation. - -Arguments: - arg1 path to spectral library file - arg2 prefix for spec_id - -""" - -import re -import sys -import logging - - -AMINO_MASSES = { - "A": 71.037114, - "C": 103.009185, - "D": 115.026943, - "E": 129.042593, - "F": 147.068414, - "G": 57.021464, - "H": 137.058912, - "I": 113.084064, - "K": 128.094963, - "L": 113.084064, - "M": 131.040485, - "N": 114.042927, - "P": 97.052764, - "Q": 128.058578, - "R": 156.101111, - "S": 87.032028, - "T": 101.047679, - "V": 99.068414, - "W": 186.079313, - "Y": 163.063329, -} -PROTON_MASS = 1.007825035 -WATER_MASS = 18.010601 - - -def setup_logging(): - """Initiate logging.""" - root_logger = logging.getLogger() - handler = logging.StreamHandler() - handler.setFormatter( - logging.Formatter("%(asctime)s %(levelname)s %(module)s %(message)s") - ) - root_logger.addHandler(handler) - root_logger.setLevel(logging.INFO) - - -def parse_peprec_mods(mods, ptm_list): - """Parse PEPREC modification string out of MSP Mod string.""" - if mods.split("/")[0] != "0": - num_mods = mods[0] - mod_list = [mod.split(",") for mod in mods.split("/")[1:]] - - peprec_mods = [] - for location, aa, name in mod_list: - if not (location == "0" and name == "iTRAQ"): - location = str(int(location) + 1) - peprec_mods.append(location) - peprec_mods.append(name) - - if name not in ptm_list: - ptm_list[name] = 1 - else: - ptm_list[name] += 1 - - peprec_mods = "|".join(peprec_mods) - - else: - peprec_mods = "-" - - return peprec_mods - - -def validate(spec_id, peptide, charge, mods, reported_mw): - """Validate amino acids and reported peptide mass.""" - invalid_aas = ["B", "J", "O", "U", "X", "Z"] - if any(aa in invalid_aas for aa in peptide): - logging.warning("Peptide with non-canonical amino acid found: %s", peptide) - - elif ( - mods.split("/")[0] == "0" - ): # Cannot validate mass of peptide with unknown modification - calculated = WATER_MASS + sum([AMINO_MASSES[x] for x in peptide]) - reported = float(reported_mw) * float(charge) - float(charge) * PROTON_MASS - if abs(calculated - reported) > 0.5: - logging.warning( - "Reported MW does not match calculated mass for spectrum %s", spec_id - ) - - -def parse_speclib(speclib_filename, title_prefix, speclib_format="msp"): - """Parse MSP file.""" - filename = ".".join(speclib_filename.split(".")[:-1]) - fpip = open(filename + ".peprec", "w") - fpip.write("spec_id modifications peptide charge\n") - fmgf = open(filename + ".mgf", "w") - fmeta = open(filename + ".meta", "w") - - with open(speclib_filename) as f: - mod_dict = {} - spec_id = 1 - peak_sep = None - peptide = None - charge = None - parentmz = None - mods = None - purity = None - HCDenergy = None - read_spec = False - mgf = "" - - for row in f: - if read_spec: - # Infer peak int/mz separator - if not peak_sep: - if "\t" in row: - peak_sep = "\t" - elif " " in row: - peak_sep = " " - else: - raise ValueError("Invalid peak separator") - - line = row.rstrip().split(peak_sep) - - # Read all peaks, so save to output files and set read_spec to False - if row[0].isdigit(): - # Continue reading spectrum - mgf += " ".join([line[0], line[1]]) + "\n" - continue - - # Last peak reached, finish up spectrum - else: - validate(spec_id, peptide, charge, mods, parentmz) - peprec_mods = parse_peprec_mods(mods, mod_dict) - fpip.write( - "{}{} {} {} {}\n".format( - title_prefix, spec_id, peprec_mods, peptide, charge - ) - ) - fmeta.write( - "{}{} {} {} {} {} {}\n".format( - title_prefix, - spec_id, - charge, - peptide, - parentmz, - purity, - HCDenergy, - ) - ) - - buf = "BEGIN IONS\n" - buf += "TITLE=" + title_prefix + str(spec_id) + "\n" - buf += "CHARGE=" + str(charge) + "\n" - buf += "PEPMASS=" + parentmz + "\n" - fmgf.write("{}{}END IONS\n\n".format(buf, mgf)) - - spec_id += 1 - read_spec = False - mgf = "" - - if row.startswith("Name:"): - line = row.rstrip().split(" ") - tmp = line[1].split("/") - peptide = tmp[0].replace("(O)", "") - if speclib_format == "sptxt": - peptide = re.sub(r"\[\d*\]|[a-z]", "", peptide) - charge = tmp[1].split("_")[0] - continue - - elif row.startswith("Comment:"): - line = row.rstrip().split(" ") - for i in range(1, len(line)): - if line[i].startswith("Mods="): - tmp = line[i].split("=") - mods = tmp[1] - if line[i].startswith("Parent="): - tmp = line[i].split("=") - parentmz = tmp[1] - if line[i].startswith("Purity="): - tmp = line[i].split("=") - purity = tmp[1] - if line[i].startswith("HCD="): - tmp = line[i].split("=") - HCDenergy = tmp[1].replace("eV", "") - continue - - elif row.startswith("Num peaks:") or row.startswith("NumPeaks:"): - read_spec = True - continue - - fmgf.close() - fpip.close() - fmeta.close() - - return spec_id, mod_dict - - -def main(): - """Run CLI.""" - # Get arguments - speclib_filename = sys.argv[1] - title_prefix = sys.argv[2] - - speclib_ext = speclib_filename.split(".")[-1] - if speclib_ext.lower() == "sptxt": - speclib_format = "sptxt" - elif speclib_ext.lower() == "msp": - speclib_format = "msp" - else: - raise ValueError("Unknown spectral library format: `%s`" % speclib_ext) - - logging.info("Converting %s to MGF, PEPREC and meta file", speclib_filename) - - num_peptides, mod_dict = parse_speclib( - speclib_filename, title_prefix, speclib_format=speclib_format - ) - - logging.info( - "Finished!\nSpectral library contains %i peptides and the following modifications: %s", - num_peptides, - mod_dict, - ) - - -if __name__ == "__main__": - setup_logging() - main() diff --git a/docs/source/index.rst b/docs/source/index.rst index 39e1974e..9020a820 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -11,6 +11,7 @@ installation usage prediction-models + webserver-api .. toctree:: diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 119fd7d4..7b63206f 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -148,8 +148,14 @@ Any unresolvable modification will result in an error. If needed, PSM files can Spectrum file ------------- -In the :ref:`correlate` and :ref:`get-training-data` usage modes, an MGF or mzML file with observed -spectra must be provided to MS²PIP. +In the :ref:`correlate` and :ref:`get-training-data` usage modes, a spectrum file with observed +spectra must be provided to MS²PIP. Spectrum files in mzML, MGF, Thermo raw, and Bruker raw formats +are supported. + +.. note:: + + To read Thermo raw files, the + `.NET runtime `_ must be installed. Make sure that the PSM file ``spectrum_id`` matches the MGF ``TITLE`` field or mzML ``nativeID`` fields. If the values of these fields are different, but the PSM file ``spectrum_id`` is embedded diff --git a/docs/source/webserver-api.md b/docs/source/webserver-api.md new file mode 100644 index 00000000..67ad0f7c --- /dev/null +++ b/docs/source/webserver-api.md @@ -0,0 +1,110 @@ +# MS2PIP Server API +MS2PIP Server can be accessed through the webpage or through a RESTful API. While this page provides an example Python script for contacting the API, the Swagger-generated documentation can be found here: https://iomics.ugent.be/ms2pip/api/v2. + +The following Python function contacts the MS2PIP Server RESTful API: + +```python +def run_ms2pip_server(peptides, frag_method, ptm_list, url='https://iomics.ugent.be/ms2pip/api/v2'): + # Check if all columns are present in dataframe + for col in ['spec_id', 'peptide', 'charge', 'modifications']: + if col not in peptides.columns: + print("{} is missing from peptides DataFrame".format(col)) + return None + + # Split-up into batches of 100 000 peptides (maximum MS2PIP Server accepts per request) + batch_size = 100000 + result = pd.DataFrame() + for i in list(range(0, len(peptides), batch_size)): + print("Working on batch {}/{}".format(int(i / batch_size + 1), len(peptides) // batch_size + 1)) + peptides_batch = peptides.iloc[i:i+batch_size, :] + + # Combine data into dictionary for json post request + input_data = { + "params": { + "frag_method": frag_method, + "ptm": ptm_list + }, + "peptides": peptides_batch.to_dict(orient='list') + } + + # Post data to server and get task id + response = requests.post('{}/task'.format(url), json=input_data) + if 'task_id' not in response.json(): + if 'error' in response.json(): + print("Server error: {}".format(response.json()['error'])) + else: + print("Something went wrong") + return None + task_id = response.json()['task_id'] + print("Received task id: {}".format(task_id)) + + # Check server task status and get result when ready + sleep(1) + response = requests.get('{}/task/{}/status'.format(url, task_id)) + state = response.json()['state'] + + if state != 'SUCCESS': + print("Check MS2PIP Server status every 5 seconds", end='') + state = 'PENDING' + pending_count = 0 + + while state == 'PENDING' or state == 'PROGRESS': + sleep(5) + print('.', end='') + response = requests.get('{}/task/{}/status'.format(url, task_id)) + state = response.json()['state'] + + # Do not keep looping if task state is stuck on PENDING, it might have failed silently + if state == 'PENDING': + pending_count += 1 + if pending_count > 24: + print("\nSomething went wrong") + return None + print('') + + if state == 'SUCCESS': + response = requests.get('{}/task/{}/result'.format(url, task_id)) + result_batch = pd.DataFrame.from_dict(response.json())['ms2pip_out'] + result_batch = result_batch[['spec_id', 'charge', 'ion', 'ionnumber', 'mz', 'prediction']] + result = result.append(result_batch) + print("Result received", end='\n\n') + else: + error_message = response.json()['status'] + print("Something went wrong: {}".format(error_message)) + return None + + print("Finished with all batches!") + + return result +``` + +The function takes three arguments: `peptides`, `frag_method` and `ptm_list`. `peptides` is an MS2PIP PEPREC-formatted Pandas DataFrame. `ptm_list` is a list of MS2PIP formatted PTM definitions. A detailed explanation of both data structures can be found on the [MS2PIP Server webpage](https://iomics.ugent.be/ms2pip/#howto). `frag_method` is the specific model with which you want to predict peak intensities (e.g. HCD, CID, iTRAQ...) Checkout [README.md](https://github.com/compomics/ms2pip_c/#ms2pip-models) for a list of all available models. + +The function also takes an argument `url`, in which you can provide a custom URL to the server. By default this is `https://iomics.ugent.be/ms2pip/api`. + +An example for running the function: +```python +# Define arguments +frag_method = 'HCD' +ptm_list = [ + 'Oxidation,15.994915,M', + 'Carbamidomethyl,57.021464,C', + 'PhosphoS,79.966331,S', + 'PhosphoT,79.966331,T', + 'PhosphoY,79.966331,Y', + 'Glu->pyro-Glu,-18.010565,E', + 'Gln->pyro-Glu,-17.026549,Q', + 'Pyro-carbamidomethyl,39.994915,C', + 'Deamidated,0.984016,N', + 'iTRAQ,144.102063,N-term' +] +peptides = pd.DataFrame(data={ + 'spec_id': ['peptide1', 'peptide2'], + 'peptide': ['STCINTFWLIVK', 'GRLNTFILK'], + 'modifications': ['3|Carbamidomethyl', '-'], + 'charge': [2, 3] +}) + +# Run function +result = run_ms2pip_server(peptides, frag_method, ptm_list) +``` diff --git a/train_scripts/train_xgboost_c.py b/train_scripts/train_xgboost_c.py deleted file mode 100644 index 7fa84174..00000000 --- a/train_scripts/train_xgboost_c.py +++ /dev/null @@ -1,370 +0,0 @@ -import os -import argparse -import pandas as pd -import numpy as np -import xgboost as xgb -from math import ceil -from operator import itemgetter -from itertools import product -from scipy.stats import pearsonr -import matplotlib.pyplot as plt - - -def evalerror_pearson(preds, dtrain): - labels = dtrain.get_label() - return 'pearsonr', pearsonr(preds, labels)[0] - - -def convert_model_to_c(bst, args, numf, filename): - bst.dump_model("{}_dump.raw.txt".format(filename)) - num_nodes = [] - mmax = 0 - with open("{}_dump.raw.txt".format(filename)) as f: - for row in f: - if row.startswith('booster'): - if row.startswith('booster[0]'): - mmax = 0 - else: - num_nodes.append(mmax + 1) - mmax = 0 - continue - l = int(row.rstrip().replace(' ', '').split(':')[0]) - if l > mmax: - mmax = l - num_nodes.append(mmax + 1) - forest = [] - tree = None - b = 0 - with open("{}_dump.raw.txt".format(filename)) as f: - for row in f: - if row.startswith('booster'): - if row.startswith('booster[0]'): - tree = [0] * num_nodes[b] - b += 1 - else: - forest.append(tree) - tree = [0] * num_nodes[b] - b += 1 - continue - l = row.rstrip().replace(' ', '').split(':') - if l[1][:4] == "leaf": - tmp = l[1].split('=') - tree[int(l[0])] = [-1, float(tmp[1]), -1, -1] # !!!! - else: - tmp = l[1].split('yes=') - tmp[0] = tmp[0].replace('[Features', '') - tmp[0] = tmp[0].replace('[Feature', '') - tmp[0] = tmp[0].replace('[f', '') - tmp[0] = tmp[0].replace(']', '') - tmp2 = tmp[0].split('<') - if float(tmp2[1]) < 0: - tmp2[1] = 1 - tmp3 = tmp[1].split(",no=") - tmp4 = tmp3[1].split(',') - tree[int(l[0])] = [int(tmp2[0]), int(ceil(float(tmp2[1]))), int(tmp3[0]), int(tmp4[0])] - forest.append(tree) - - with open('{}.c'.format(filename), 'w') as fout: - fout.write("static float score_{}_{}(unsigned int* v){{\n".format(args.model_name, args.type)) - fout.write("float s = 0.;\n") - for tt in range(len(forest)): - fout.write(tree_to_code(forest[tt], 0, 1)) - fout.write("\nreturn s;}\n") - - """ - with open('{}.pyx'.format(filename), 'w') as fout: - fout.write("cdef extern from \"{}.c\":\n".format(filename)) - fout.write("\tfloat score_{}(short unsigned short[{}] v)\n\n".format(args.type, numf)) - fout.write("def myscore(sv):\n") - fout.write("\tcdef unsigned short[{}] v = sv\n".format(numf)) - fout.write("\treturn score_{}(v)\n".format(args.type)) - """ - - os.remove("{}_dump.raw.txt".format(filename)) - - -def tree_to_code(tree, pos, padding): - p = "\t" * padding - if tree[pos][0] == -1: - if tree[pos][1] < 0: - return p + "s = s {};\n".format(tree[pos][1]) - else: - return p + "s = s + {};\n".format(tree[pos][1]) - return p + "if (v[{}]<{}){{\n{}}}\n{}else{{\n{}}}".format(tree[pos][0], tree[pos][1], tree_to_code(tree, tree[pos][2], padding + 1), p, tree_to_code(tree, tree[pos][3], padding + 1)) - - -def load_data(vector_filename, ion_type): - # Read file - if vector_filename.split('.')[-1] == 'pkl': - vectors = pd.read_pickle(vector_filename) - elif vector_filename.split('.')[-1] == 'h5': - #vectors = pd.read_hdf(vector_filename, key='table', stop=1000) - vectors = pd.read_hdf(vector_filename, key='table') - else: - print("Unsuported feature vector format") - exit(1) - print("{} contains {} feature vectors".format(args.vectors, len(vectors))) - - #vectors = vectors[vectors["ce"]==2] - - # Extract targets for given ion type - target_names = list(vectors.columns[vectors.columns.str.contains('targets')]) - if not 'targets_{}'.format(ion_type) in target_names: - print("Targets for {} could not be found in vector file.".format(ion_type)) - print("Vector file only contains these targets: {}".format(target_names)) - exit(1) - targets = vectors.pop('targets_{}'.format(ion_type)) - target_names.remove('targets_{}'.format(ion_type)) - vectors.drop(labels=target_names, axis=1, inplace=True) - - # Get psmids - psmids = vectors.pop('psmid') - # Cast vectors to numpy float for XGBoost - vectors.astype(np.float32) - - return(vectors, targets, psmids) - - -def get_params_combinations(params): - keys, values = zip(*params.items()) - combinations = [dict(zip(keys, v)) for v in product(*values)] - return(combinations) - - -def get_best_params(df, params_grid): - params = {} - best = df[df['test-rmse-mean'] == df['test-rmse-mean'].min()] - for p in params_grid.keys(): - params[p] = best[p].iloc[0] - # num_boost_round = best['boosting-round'].iloc[0] - return(params) - - -def gridsearch(xtrain, params, params_grid): - cols = ['boosting-round', 'test-rmse-mean', 'test-rmse-std', 'train-rmse-mean', 'train-rmse-std'] - cols.extend(sorted(params_grid.keys())) - result = pd.DataFrame(columns=cols) - - count = 1 - combinations = get_params_combinations(params_grid) - - for param_overrides in combinations: - print("Working on combination {}/{}".format(count, len(combinations))) - count += 1 - params.update(param_overrides) - tmp = xgb.cv(params, xtrain, nfold=5, num_boost_round=200, early_stopping_rounds=10, verbose_eval=10) - tmp['boosting-round'] = tmp.index - for param in param_overrides.keys(): - tmp[param] = param_overrides[param] - result = result.append(tmp) - - print("Grid search ready!\n") - - return(result) - - -def print_logo(): - logo = """ - _____ _____ _____ _____ _____ - |_ _| __ | _ | | | | - | | | -| |- -| | | | - |_| |__|__|__|__|_____|_|___| - - """ - print(logo) - - -if __name__ == "__main__": - print_logo() - print("Using XGBoost version {}".format(xgb.__version__)) - - parser = argparse.ArgumentParser(description='XGBoost training') - parser.add_argument('vectors', metavar='<_vectors.pkl>', - help='feature vector file') - parser.add_argument('model_name', metavar='', - help='model name (e.g. HCD') - parser.add_argument('type', metavar='', - help='model type') - parser.add_argument('-c', metavar='INT', action="store", dest='num_cpu', default=24, - help='number of CPUs to use') - parser.add_argument('-t', metavar='INT', action="store", dest='num_trees', default=30, - help='number of trees in XGBoost model') - parser.add_argument('-e', metavar='FILE', action="store", dest='vectorseval', - help='additional evaluation file') - parser.add_argument("-p", action="store_true", dest='make_plots', default=False, - help="output plots") - parser.add_argument("-g", action="store_true", dest='gridsearch', default=False, - help="perform Grid Search CV to select best parameters") - args = parser.parse_args() - - np.random.seed(1) - - filename = "{}_{}_{}".format(args.vectors.split('.')[-2], args.num_trees, args.type) - print("Using output filename {}".format(filename)) - - print("Loading train and test data...") - vectors, targets, psmids = load_data(args.vectors, args.type) - - - print("Splitting up into train and test set...") - upeps = psmids.unique() - np.random.shuffle(upeps) - test_psms = upeps[:int(len(upeps) * 0.2)] - - train_vectors = vectors[~psmids.isin(test_psms)] - train_targets = targets[~psmids.isin(test_psms)] - train_psmids = psmids[~psmids.isin(test_psms)] - - #train_targets.hist(bins=50) - #plt.show() - - test_vectors = vectors[psmids.isin(test_psms)] - test_targets = targets[psmids.isin(test_psms)] - test_psmids = psmids[psmids.isin(test_psms)] - - """ - train_vectors = train_vectors[train_targets > 0] - train_psmids = train_psmids[train_targets > 0] - train_targets = np.log2(train_targets[train_targets > 0]) - test_vectors = test_vectors[test_targets > 0] - test_psmids = test_psmids[test_targets > 0] - test_targets = np.log2(test_targets[test_targets > 0]) - train_targets = [1 if x==0 else 0 for x in train_targets] - test_targets = [1 if x==0 else 0 for x in test_targets] - """ - numf = len(train_vectors.columns.values) - - - # Rename features to understand decision tree dump - train_vectors.columns = ['Feature' + str(i) for i in range(len(train_vectors.columns))] - test_vectors.columns = ['Feature' + str(i) for i in range(len(test_vectors.columns))] - - print(train_vectors.shape) - - print("Creating train and test DMatrix...") - xtrain = xgb.DMatrix(train_vectors, label=train_targets) - xtest = xgb.DMatrix(test_vectors, label=test_targets) - evallist = [(xtest, 'test'),(xtrain, 'train')] - - # If needed, repeat for evaluation vectors, and create evallist - if args.vectorseval: - print("Loading eval data...") - eval_vectors, eval_targets, eval_psmids = load_data(args.vectorseval, args.type) - eval_vectors.columns = ['Feature' + str(i) for i in range(len(eval_vectors.columns))] - print("Creating eval DMatrix...") - xeval = xgb.DMatrix(eval_vectors, label=eval_targets) - del eval_vectors - - # Remove items to save memory - del vectors, targets, psmids, train_vectors, train_targets, train_psmids, test_vectors - - # Set XGBoost default parameters - params = { - "nthread": int(args.num_cpu), - "objective": "reg:linear", - #"objective": "binary:logistic", - "eval_metric": 'mae', - #"eval_metric": 'aucpr', - "silent": 1, - "eta": 0.5, - "max_depth": 9, - "grow_policy":"lossguide", - "max_leaves":100, - "min_child_weight": 50, - "gamma": 0.1, - "subsample": 1, - #"lambda" : 2, - # "colsample_bytree": 1, - # "max_delta_step": 0, - } - - if args.gridsearch: - print("Performing GridSearchCV...") - params_grid = { - 'max_depth': [7, 8, 9], - 'min_child_weight': [300, 350, 400], - 'gamma': [0, 1], - 'max_delta_step': [0] - } - - gs = gridsearch(xtrain, params, params_grid) - gs.to_csv('{}_GridSearchCV.csv'.format(filename)) - best_params = get_best_params(gs, params_grid) - params.update(best_params) - print("Using best parameters: {}".format(best_params)) - - print("Training XGBoost model...") - #bst = xgb.train(params, xtrain, int(args.num_trees), evallist, early_stopping_rounds=10, maximize=False) - bst = xgb.train(params, xtrain, int(args.num_trees), evallist, maximize=False) - - bst.save_model("{}.xgboost".format(filename)) - - # Load previously saved model here, if necessary - # bst = xgb.Booster({'nthread':23}) #init model - # bst.load_model(filename+'.xgboost') # load data - - print("Writing model to C code...") - convert_model_to_c(bst, args, numf, filename) - - print("Analyzing newly made model...") - # Get feature importances - importance = bst.get_fscore() - importance = sorted(importance.items(), key=itemgetter(1)) - importance_list = [] - with open("{}_importance.csv".format(filename), "w") as f: - f.write("Name,F-score\n") - for feat, n in importance[:]: - importance_list.append(feat) - f.write("{},{}\n".format(feat, str(n))) - - # Print feature importances - print_feature_importances = False - if print_feature_importances: - print('[') - for l in importance_list: - print("'{}',".format(l), end='') - print(']') - - # Use new model to make predictions on eval/test set and write to csv - tmp = pd.DataFrame() - if args.vectorseval: - predictions = bst.predict(xeval) - tmp['target'] = list(eval_targets.values) - tmp['predictions'] = predictions - tmp['psmid'] = list(eval_psmids.values) - else: - predictions = bst.predict(xtest) - tmp['target'] = list(test_targets.values) - tmp['predictions'] = predictions - tmp['psmid'] = list(test_psmids.values) - tmp.to_csv("{}_predictions.csv".format(filename), index=False) - # tmp.to_pickle("{}_predictions.pkl".format(filename)) - - """ - for ch in range(8, 20): - print("len {}".format(ch)) - n1 = 0 - n2 = 0 - tmp3 = tmp[tmp.peplen == ch] - for pid in tmp3.psmid.unique().values: - tmp2 = tmp3[tmp3.psmid == pid] - print(pid) - for (t, p) in zip(tmp2.target, tmp2.predictions): - print("{} {}".format(t, p)) - # n1 += pearsonr(tmp2.target, tmp2.predictions)[0] - n1 += np.mean(np.abs(tmp2.target - tmp2.predictions)) - # print(n1) - n2 += 1 - print(n1 / n2) - """ - - if args.make_plots: - plt.figure() - plt.scatter(x=test_targets, y=predictions) - plt.title('Test set') - plt.xlabel('Target') - plt.ylabel('Prediction') - plt.savefig("{}_test.png".format(filename)) - plt.show() - - print("Ready!")