From 3a9ca9d65a96f151e52c4aea6a89d30f3c1e5f2d Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 9 Aug 2023 14:50:48 +0200 Subject: [PATCH 01/11] bump calisp and add script for filtering and peptide summary based on https://github.com/kinestetika/Calisp/blob/master/benchmarking/sif-benchmarking.ipynb --- tools/calisp/benchmarking.py | 214 +++++++++++++++++++++++++++++++++++ tools/calisp/calisp.xml | 42 +++++-- tools/calisp/feather2tsv.py | 35 ------ 3 files changed, 246 insertions(+), 45 deletions(-) create mode 100644 tools/calisp/benchmarking.py delete mode 100755 tools/calisp/feather2tsv.py diff --git a/tools/calisp/benchmarking.py b/tools/calisp/benchmarking.py new file mode 100644 index 000000000..901889e6e --- /dev/null +++ b/tools/calisp/benchmarking.py @@ -0,0 +1,214 @@ +import argparse +import os + +import pandas as pd +import numpy as np + +# Define the ArgumentParser +parser = argparse.ArgumentParser("List of natural abundances of the isotopes") + +# Add arguments +parser.add_argument( + "-f", + "--factor", + type=float, + metavar="F", + help="Correction factor (natural abundances), e.g.:" + "C: 0.9889434148335, H: 0.99988, N: 0.996323567, O: 0.997574195, S: 0.9493", +) +parser.add_argument( + "--input", type=str, metavar="data", help="Input file/folder", required=True +) +parser.add_argument( + "--out_summary", type=str, metavar="output", help="Peptide summary output", required=True +) +parser.add_argument( + "--out_filtered", type=str, metavar="output", help="Filtered output", required=True +) + +# Indicate end of argument definitions and parse args +args = parser.parse_args() + +# Benchmarking section +# the functions for optimising calis-p data + + +def load_calisp_data(filename, factor): + # (1) load data + file_count = 1 + if os.path.isdir(filename): + file_data = [] + file_count = len(os.listdir(filename)) + for f in os.listdir(filename): + f = os.path.join(filename, f) + file_data.append(pd.read_feather(f)) + base, _ = os.path.splitext(f) + file_data[-1].to_csv(f"{base}.tsv", sep="\t", index=False) + data = pd.concat(file_data) + else: + data = pd.read_feather(filename) + base, _ = os.path.splitext(filename) + data.to_csv(f"{base}.tsv", sep="\t", index=False) + + file_success_count = len(data["ms_run"].unique()) + # (2) calculate deltas + # ((1-f)/f) - 1 == 1/f -2 + + data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000 + data["delta_fft"] = data["ratio_fft"] / ((1 / factor) - 2) * 1000 + print( + f"Loaded {len(data.index)} isotopic patterns from {file_success_count}/{file_count} file(s)" + ) + return data + + +def filter_calisp_data(data, target): + if target.lower() == "na": + subdata = data.loc[ + lambda df: (df["flag_peak_at_minus_one_pos"] == False) + & (df["flag_pattern_is_wobbly"] == False) + & (df["flag_psm_has_low_confidence"] == False) + & (df["flag_psm_is_ambiguous"] == False) + & (df["flag_pattern_is_contaminated"] == False) + & (df["flag_peptide_assigned_to_multiple_bins"] == False), + :, + ] + elif target.lower() == "fft": + subdata = data.loc[ + lambda df: (df["error_fft"] < 0.001) + & (df["flag_peptide_assigned_to_multiple_bins"] == False), + :, + ] + elif target.lower() == "clumpy": + subdata = data.loc[ + lambda df: (df["error_clumpy"] < 0.001) + & (df["flag_peptide_assigned_to_multiple_bins"] == False), + :, + ] + + print( + f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." + ) + + return subdata + + +def estimate_clumpiness(data): + subdata = data.loc[lambda df: df["error_clumpy"] < 0.001, :] + clumpiness = [] + for c in ["c1", "c2", "c3", "c4", "c5", "c6"]: + try: + count, division = np.histogram(subdata[c], bins=50) + count = count[1:-1] + opt = 0.02 * np.where(count == count.max())[0][0] / 0.96 + clumpiness.append(opt) + except ValueError: + pass + return clumpiness / sum(clumpiness) + + +# the function for benchmarking +def benchmark_sip_mock_community_data(data, factor): + background_isotope = 1 - factor + background_unlabelled = factor + + # For false positive discovery rates we set the threshold at the isotope/unlabelled associated with 1/4 of a generation + # of labeling. The E. coli values (1.7, 4.2 and 7.1) are for 1 generation at 1, 5 and 10% label, + # and we take the background (1.07) into account as well. + thresholds = { + 1: 1.07 + (1.7 - 1.07) / 4, + 5: 1.07 + (4.2 - 1.07) / 4, + 10: 1.07 + (7.1 - 1.07) / 4, + } + + filenames = data["ms_run"].unique() + bin_names = data["bins"].unique() + peptide_sequences = data["peptide"].unique() + benchmarking = pd.DataFrame( + columns=[ + "file", + "bins", + "% label", + "ratio", + "peptide", + "psm_mz", + "n(patterns)", + "mean intensity", + "ratio_NA median", + "N mean", + "ratio_NA SEM", + "ratio_FFT median", + "ratio_FFT SEM", + "False Positive", + ] + ) + false_positives = 0 + for p in peptide_sequences: + pep_data = data.loc[lambda df: df["peptide"] == p, :] + for b in bin_names: + bindata = data.loc[lambda df: df["bins"] == b, :] + for f in filenames: + words = f.split("_") + # TODO what is nominal_value doing? it uses int(the 4th component of the filename/runname) + # if the filename hase more than 5 components, and the nominal_value=0 otherwise + nominal_value = 0 + if len(words) > 5: + nominal_value = int(words[3].replace("rep", "")) + unlabeled_fraction = 1 - nominal_value / 100 + U = unlabeled_fraction * background_unlabelled + I = nominal_value / 100 + unlabeled_fraction * background_isotope + ratio = I / U * 100 + pepfiledata = pep_data.loc[lambda df: df["ms_run"] == f, :] + is_false_positive = 0 + try: + if ( + b != "K12" + and pepfiledata["ratio_na"].median() > thresholds[nominal_value] + ): + is_false_positive = 1 + false_positives += 1 + except KeyError: + pass + benchmarking.loc[len(benchmarking)] = [ + f, + b, + nominal_value, + ratio, + p, + pepfiledata["psm_mz"].median(), + len(pepfiledata.index), + pepfiledata["pattern_total_intensity"].mean(), + pepfiledata["ratio_na"].median(), + pepfiledata["psm_neutrons"].mean(), + pepfiledata["ratio_na"].sem(), + pepfiledata["ratio_fft"].median(), + pepfiledata["ratio_fft"].sem(), + is_false_positive, + ] + + benchmarking = benchmarking.sort_values(["bins", "peptide"]) + benchmarking = benchmarking.reset_index(drop=True) + return benchmarking + + +# cleaning and filtering data +data = load_calisp_data(args.input, args.factor) +data = filter_calisp_data(data, "na") + +data["peptide_clean"] = data["peptide"] +data["peptide_clean"] = data["peptide_clean"].replace("'Oxidation'", "", regex=True) +data["peptide_clean"] = data["peptide_clean"].replace("'Carbamidomethyl'", "", regex=True) +data["peptide_clean"] = data["peptide_clean"].replace("\s*\[.*\]", "", regex=True) + +data["ratio_na"] = data["ratio_na"] * 100 +data["ratio_fft"] = data["ratio_fft"] * 100 +data["psm_neutrons"] = data["psm_neutrons"] * 100 + +# The column "% label" indicates the amount of label applied (percentage of label in the glucose). The amount of +# labeled E. coli cells added corresponded to 1 generation of labeling (50% of E. coli cells were labeled in +# all experiments except controls) + +benchmarks = benchmark_sip_mock_community_data(data, args.factor) +benchmarks.to_csv(args.out_summary, sep='\t', index=False) + +data.to_csv(args.out_filtered, sep='\t', index=False) diff --git a/tools/calisp/calisp.xml b/tools/calisp/calisp.xml index ed097f038..ac80db94d 100644 --- a/tools/calisp/calisp.xml +++ b/tools/calisp/calisp.xml @@ -1,7 +1,7 @@ Estimate isotopic composition of peptides from proteomics mass spectrometry data - 3.0.10 + 3.0.12 0 https://raw.githubusercontent.com/kinestetika/Calisp/208d495674e2b52fe56cf23457c833d1c2527242 @@ -30,8 +30,25 @@ calisp --bin_delimiter '$bin_delimiter' --threads "\${GALAXY_SLOTS:-1}" --isotope $isotope - $compute_clumps && -'$__tool_directory__/feather2tsv.py' --calisp_output calisp-output/ + $compute_clumps +#if $isotope_abundance_matrix + --isotope_abundance_matrix '$isotope_abundance_matrix' +&& +ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" +#else +&& +ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" +#end if + +## row numbers in the factor matrix +## C=1 N=2 O=3 H=4 S=5 +#set rowcol = {"13C": (1, 2), "14C": (1, 3), "15N": (2, 2), "17O": (3, 2), "18O": (3, 3), "2H": (4, 2), "3H": (4, 3), "33S": (5, 2), "34S": (5, 3), "36S": (5, 4)} +&& +#set selected_isotope = str($isotope) +FACTOR=\$(grep -v "^#" "\$ISOTOPE_ABUNDANCE_MATRIX" | sed 's/ \+/\t/g'| cut -f$rowcol[str($selected_isotope)][1] | head -n $rowcol[$selected_isotope][0] | tail -n 1) +&& echo "FACTOR \$FACTOR" + +&& python '$__tool_directory__/benchmarking.py' --input calisp-output/ --out_filtered '$filtered' --out_summary '$summary' --factor "\$FACTOR" ]]> @@ -59,18 +76,21 @@ calisp + + + + + + @@ -80,13 +100,15 @@ calisp - + + + - --> + 10.1101/2021.03.29.437612 10.1093/bioinformatics/bty046 - \ No newline at end of file + diff --git a/tools/calisp/feather2tsv.py b/tools/calisp/feather2tsv.py deleted file mode 100755 index e43101f58..000000000 --- a/tools/calisp/feather2tsv.py +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env python -""" -based on https://github.com/kinestetika/Calisp/blob/master/benchmarking/sip%20benchmarking.ipynb -""" - -import argparse -import os - -import pandas as pd - - -def load_calisp_data(filename): - - # (1) load data - if os.path.isdir(filename): - file_data = [] - for f in os.listdir(filename): - if not f.endswith(".feather"): - continue - f = os.path.join(filename, f) - file_data.append(pd.read_feather(f)) - base, _ = os.path.splitext(f) - file_data[-1].to_csv(f"{base}.tsv", sep="\t") - data = pd.concat(file_data) - else: - data = pd.read_feather(filename) - base, _ = os.path.splitext(filename) - data.to_csv(f"{base}.tsv", sep="\t") - - -parser = argparse.ArgumentParser(description='feather2tsv') -parser.add_argument('--calisp_output', required=True, help='feather file') -args = parser.parse_args() - -data = load_calisp_data(args.calisp_output) From 1eb4d7040366dc948e5ebf5862dbdab1ac4d0b9c Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Thu, 31 Aug 2023 19:04:49 +0200 Subject: [PATCH 02/11] cont doc --- tools/calisp/calisp.xml | 55 +++++++++++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/tools/calisp/calisp.xml b/tools/calisp/calisp.xml index ac80db94d..2b820207a 100644 --- a/tools/calisp/calisp.xml +++ b/tools/calisp/calisp.xml @@ -33,22 +33,26 @@ calisp $compute_clumps #if $isotope_abundance_matrix --isotope_abundance_matrix '$isotope_abundance_matrix' -&& -ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" -#else -&& -ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" #end if -## row numbers in the factor matrix -## C=1 N=2 O=3 H=4 S=5 -#set rowcol = {"13C": (1, 2), "14C": (1, 3), "15N": (2, 2), "17O": (3, 2), "18O": (3, 3), "2H": (4, 2), "3H": (4, 3), "33S": (5, 2), "34S": (5, 3), "36S": (5, 4)} -&& -#set selected_isotope = str($isotope) -FACTOR=\$(grep -v "^#" "\$ISOTOPE_ABUNDANCE_MATRIX" | sed 's/ \+/\t/g'| cut -f$rowcol[str($selected_isotope)][1] | head -n $rowcol[$selected_isotope][0] | tail -n 1) -&& echo "FACTOR \$FACTOR" - -&& python '$__tool_directory__/benchmarking.py' --input calisp-output/ --out_filtered '$filtered' --out_summary '$summary' --factor "\$FACTOR" +#if benchmark + #if $isotope_abundance_matrix + && ISOTOPE_ABUNDANCE_MATRIX="$isotope_abundance_matrix" + #else + && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" + #end if + ## row numbers in the factor matrix + ## C=1 N=2 O=3 H=4 S=5 + #set rowcol = {"13C": (1, 2), "14C": (1, 3), "15N": (2, 2), "17O": (3, 2), "18O": (3, 3), "2H": (4, 2), "3H": (4, 3), "33S": (5, 2), "34S": (5, 3), "36S": (5, 4)} + #set selected_isotope = str($isotope) + && FACTOR=\$(grep -v "^#" "\$ISOTOPE_ABUNDANCE_MATRIX" | sed 's/ \+/\t/g'| cut -f$rowcol[str($selected_isotope)][1] | head -n $rowcol[$selected_isotope][0] | tail -n 1) + && echo "Using factor \$FACTOR" + && python '$__tool_directory__/benchmarking.py' + --input calisp-output/ + --out_filtered '$filtered' + --out_summary '$summary' + --factor "\$FACTOR" +#end if ]]> @@ -77,13 +81,18 @@ FACTOR=\$(grep -v "^#" "\$ISOTOPE_ABUNDANCE_MATRIX" | sed 's/ \+/\t/g'| cut -f$r + - - + + benchmark + + + benchmark + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Date: Mon, 11 Sep 2023 15:28:29 +0200 Subject: [PATCH 08/11] move logic to python script --- setup.cfg | 2 +- tools/calisp/benchmarking.py | 56 +++++++++++++++++++++-------- tools/calisp/calisp.xml | 9 ++--- tools/calisp/test-data/filtered.tsv | 24 +++++++++++++ tools/calisp/test-data/summary.tsv | 4 +++ 5 files changed, 72 insertions(+), 23 deletions(-) create mode 100644 tools/calisp/test-data/filtered.tsv create mode 100644 tools/calisp/test-data/summary.tsv diff --git a/setup.cfg b/setup.cfg index 675f87768..b748bd0b8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E501 +ignore = D,E501,E741,W503 # For flake8-import-order # https://github.com/PyCQA/flake8-import-order/blob/master/tests/test_cases/complete_smarkets.py import-order-style = smarkets diff --git a/tools/calisp/benchmarking.py b/tools/calisp/benchmarking.py index ddb349c5f..90f4441e0 100644 --- a/tools/calisp/benchmarking.py +++ b/tools/calisp/benchmarking.py @@ -7,17 +7,23 @@ # Define the ArgumentParser parser = argparse.ArgumentParser("List of natural abundances of the isotopes") -# Add arguments parser.add_argument( - "-f", - "--factor", - type=float, - metavar="F", - help="Correction factor (natural abundances), e.g.:" - "C: 0.9889434148335, H: 0.99988, N: 0.996323567, O: 0.997574195, S: 0.9493", + "--input", type=str, metavar="data", help="Input file/folder", required=True ) + parser.add_argument( - "--input", type=str, metavar="data", help="Input file/folder", required=True + "--isotope_abundance_matrix", + type=str, + metavar="data", + help="Isotope abundance matrix", + required=True, +) +parser.add_argument( + "--isotope", + type=str, + metavar="ISOTOPE", + help="Isotope", + required=True, ) parser.add_argument( "--out_summary", @@ -180,10 +186,6 @@ def benchmark_sip_mock_community_data(data, factor, nominal_values): for b in bin_names: # bindata = data.loc[lambda df: df["bins"] == b, :] for f in filenames: - words = f.split("_") - # TODO what is nominal_value doing? it uses int(the 4th component of the filename/runname) - # if the filename hase more than 5 components, and the nominal_value=0 otherwise - # USE table nominal_value = nominal_values.get(fname, 0) unlabeled_fraction = 1 - nominal_value / 100 U = unlabeled_fraction * background_unlabelled @@ -222,8 +224,32 @@ def benchmark_sip_mock_community_data(data, factor, nominal_values): return benchmarking +rowcol = { + "13C": (0, 1), + "14C": (0, 2), + "15N": (1, 1), + "17O": (2, 1), + "18O": (2, 2), + "2H": (3, 1), + "3H": (3, 2), + "33S": (4, 1), + "34S": (4, 2), + "36S": (4, 3), +} +with open(args.isotope_abundance_matrix) as iamf: + matrix = [] + for line in iamf: + line = line.strip() + line = line.split("#")[0] + if line == "": + continue + matrix.append([float(x) for x in line.split()]) +factor = matrix[rowcol[args.isotope][0]][rowcol[args.isotope][1]] +print(f"Using factor {factor}") + + # cleaning and filtering data -data = load_calisp_data(args.input, args.factor) +data = load_calisp_data(args.input, factor) if args.out_filtered: data = filter_calisp_data(data, "na") @@ -232,7 +258,7 @@ def benchmark_sip_mock_community_data(data, factor, nominal_values): data["peptide_clean"] = data["peptide_clean"].replace( "'Carbamidomethyl'", "", regex=True ) - data["peptide_clean"] = data["peptide_clean"].replace("\s*\[.*\]", "", regex=True) + data["peptide_clean"] = data["peptide_clean"].replace(r"\s*\[.*\]", "", regex=True) data["ratio_na"] = data["ratio_na"] * 100 data["ratio_fft"] = data["ratio_fft"] * 100 @@ -244,5 +270,5 @@ def benchmark_sip_mock_community_data(data, factor, nominal_values): if args.out_summary: nominal_values = parse_nominal_values(args.nominal_values) - benchmarks = benchmark_sip_mock_community_data(data, args.factor, nominal_values) + benchmarks = benchmark_sip_mock_community_data(data, factor, nominal_values) benchmarks.to_csv(args.out_summary, sep="\t", index=False) diff --git a/tools/calisp/calisp.xml b/tools/calisp/calisp.xml index f78cb91f1..22de6d31a 100644 --- a/tools/calisp/calisp.xml +++ b/tools/calisp/calisp.xml @@ -40,16 +40,11 @@ calisp #else && ISOTOPE_ABUNDANCE_MATRIX="\$(python -c 'import site; print(f"{site.getsitepackages()[0]}/calisp/isotope_matrix.txt")')" #end if -## row numbers in the factor matrix -## C=1 N=2 O=3 H=4 S=5 -#set rowcol = {"13C": (1, 2), "14C": (1, 3), "15N": (2, 2), "17O": (3, 2), "18O": (3, 3), "2H": (4, 2), "3H": (4, 3), "33S": (5, 2), "34S": (5, 3), "36S": (5, 4)} -#set selected_isotope = str($isotope) -&& FACTOR=\$(grep -v "^#" "\$ISOTOPE_ABUNDANCE_MATRIX" | sed 's/ \+/\t/g'| cut -f$rowcol[str($selected_isotope)][1] | head -n $rowcol[$selected_isotope][0] | tail -n 1) -&& echo "Using factor \$FACTOR" && python '$__tool_directory__/benchmarking.py' --input calisp-output/ - --factor "\$FACTOR" + --isotope_abundance_matrix "\$ISOTOPE_ABUNDANCE_MATRIX" + --isotope $isotope #if $benchmark_cond.benchmark == 'yes' --out_filtered '$filtered' --out_summary '$summary' diff --git a/tools/calisp/test-data/filtered.tsv b/tools/calisp/test-data/filtered.tsv new file mode 100644 index 000000000..f3e2b0e8a --- /dev/null +++ b/tools/calisp/test-data/filtered.tsv @@ -0,0 +1,24 @@ +experiment ms_run bins proteins peptide peptide_mass C N O H S psm_id psm_mz psm_charge psm_neutrons psm_rank psm_precursor_id psm_precursor_mz pattern_charge pattern_precursor_id pattern_total_intensity pattern_peak_count pattern_median_peak_spacing pattern_mass_irregularity ratio_na ratio_fft error_fft error_clumpy flag_peptide_contains_sulfur flag_peptide_has_modifications flag_peptide_assigned_to_multiple_bins flag_peptide_assigned_to_multiple_proteins flag_peptide_mass_and_elements_undefined flag_psm_has_low_confidence flag_psm_is_ambiguous flag_pattern_is_contaminated flag_pattern_is_wobbly flag_peak_at_minus_one_pos i0 i1 i2 i3 i4 i5 i6 i7 i8 i9 i10 i11 i12 i13 i14 i15 i16 i17 i18 i19 m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12 m13 m14 m15 m16 m17 m18 m19 c1 c2 c3 c4 c5 c6 delta_na delta_fft peptide_clean +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5023 423972.552734375 5 1.0014495849609375 8.421018719673157e-07 1.293745308200513 1.2093344883383923 0.0006659466122922349 True True False False False False False False False False 0.44394272703762094 0.3148879400069139 0.14210273699957185 0.07040158071111131 0.028665015244782 439.5346984863281 439.8691101074219 440.202880859375 440.5367431640625 440.87158203125 0.14627873849960357 0.13673473616244913 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5039 2552931.005859375 5 1.0029144287109375 3.7936493754386903e-06 0.9331397273613753 0.983704127441109 0.00016296070621992734 True True False False False False False False False False 0.50433956971218 0.31463918263220236 0.12821113721787344 0.040752111156242804 0.012057999281501404 439.53472900390625 439.86907958984375 440.203125 440.5376892089844 440.8695068359375 0.10550647124830428 0.11122359084655085 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5055 4528080.1640625 5 1.0013580322265625 2.766028046607971e-07 1.1137463132487007 1.2048850184659758 0.0007066586931116304 True True False False False False False False False False 0.44868667876613083 0.34631545118077006 0.14091221652037716 0.05091043163382875 0.013175221898893164 439.5347900390625 439.8689880371094 440.2029113769531 440.53656005859375 440.870849609375 0.12592695384318583 0.13623165194965742 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5071 3585135.21484375 5 1.001861572265625 2.8962269425392153e-06 0.973706224145174 0.9923920677561815 4.189626353316104e-05 True True False False False False False False False False 0.5011505891217285 0.3029469071356449 0.1411451820296418 0.04201873422410525 0.012738587488879525 439.5347900390625 439.86907958984375 440.2027587890625 440.5369873046875 440.8688049316406 0.11009316689641141 0.11220590238916531 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5087 2536973.255859375 5 1.0016326904296875 1.321546733379364e-06 0.9497254947012664 0.9973952682331323 0.00012217211822969365 True True False False False False False False False False 0.5006110123812825 0.31486575514940235 0.13072462401171767 0.041495180849606586 0.012303427607990981 439.5348205566406 439.869140625 440.20281982421875 440.5368957519531 440.869384765625 0.10738175930395859 0.11277159476276466 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5103 1284353.626953125 5 1.000213623046875 9.59448516368866e-07 1.005826035272335 0.9296079243051336 0.0016341333203192983 True True False False False False False False False False 0.513846653016916 0.26831059629387966 0.15602610005111445 0.04900510684745169 0.012811543790638232 439.5347595214844 439.86920166015625 440.2029113769531 440.5360107421875 440.8699951171875 0.11372482872563323 0.10510714404501212 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5119 850503.681640625 5 1.0009002685546875 2.687796950340271e-07 1.048453350907777 0.9549092559948614 0.001962976183331938 True True False False False False False False False False 0.5057472963200563 0.31020000935337294 0.09786231336699759 0.05034405565994057 0.035846325299632595 439.5347900390625 439.8690490722656 440.20281982421875 440.53631591796875 440.8700866699219 0.11854453312746013 0.10796786698520291 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5135 552709.8154296875 5 1.003143310546875 1.5936791896820069e-06 1.0040066410095079 1.0069942065094228 0.00039783172002031634 True True False False False False False False False False 0.49602390548621744 0.31631307943044984 0.12126168482351878 0.046744794319491766 0.019656535940322198 439.5346984863281 439.868896484375 440.2027893066406 440.53765869140625 440.8705749511719 0.11351911690900841 0.11385690929344641 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5151 370754.9375 4 0.99957275390625 1.4842953532934189e-06 0.756338358392029 0.8146499777536352 0.00020295507145469405 True True False False False False False False False False 0.5534717676551509 0.2777179594202977 0.1330019901117298 0.03580828281282161 439.53460693359375 439.8688659667969 440.2020568847656 440.5361633300781 0.08551622969619299 0.09210929717710928 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P13645 NHEEEMKDLR ['Oxidation'] [5] 1315.5826638283597 52 17 21 85 1 5030 439.534698486328 3 0.0 0 5023 439.0 3 5167 313023.962890625 4 1.004150390625 2.2582244127988815e-06 0.45064317136308785 0.5407485244526391 0.003741691287093788 True True False False False False False False False False 0.6447993270583087 0.26181284830623586 0.04788995693214856 0.04549786770330688 439.5347595214844 439.8687744140625 440.2034912109375 440.53662109375 0.050952466611949666 0.06114032762172897 NHEEEMKDLR +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 2 5231 539140.189453125 4 1.0037841796875 4.122033715248108e-06 0.8084537858830131 0.945651523215571 0.001257057371164605 False False False True False False False False False False 0.41876179315997636 0.35274368447269105 0.19445271468973835 0.03404180767759423 921.94482421875 922.4465942382812 922.948486328125 923.447509765625 0.09140871791735004 0.10692113122992151 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 3 5231 532287.23828125 4 1.00140380859375 1.0952353477478027e-06 0.9935866375027093 1.0623248628838955 0.0016081767103566236 False False False True False False False False False False 0.38864290062256607 0.3245897845642302 0.2029177253502542 0.08384958946294951 614.966552734375 615.3012084960938 615.635009765625 615.9679565429688 0.11234096773352967 0.12011293091030585 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 4 5231 1711730.5859375 5 1.00311279296875 3.8370490074157715e-07 1.206717846857389 1.3108322224571687 0.002574304851973422 False False False True False False False False False False 0.2956395075588592 0.3938560443089588 0.21349358538210017 0.06843886953497332 0.028571993215108532 461.4766845703125 461.72711181640625 461.9777526855469 462.2286682128906 462.4788818359375 0.13643888271083018 0.14821068928348002 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 2 5247 608586.04296875 4 1.00244140625 1.5273690223693848e-06 0.837527481680081 0.9460216877607668 0.00678454049412297 False False False True False False False False False False 0.4268736727968962 0.3101983313306015 0.23347104389197432 0.02945695198052797 921.9454345703125 922.4449462890625 922.9461669921875 923.447021484375 0.09469596736108457 0.10696298429200823 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 3 5247 740210.6279296875 5 1.001678466796875 8.873641490936279e-07 0.89225994138517 0.9391360431906918 0.010531410030580671 False False False True False False False False False False 0.42486678032387487 0.29477163055638556 0.24531911005910198 0.01918211669067489 0.01586036236996273 614.9664916992188 615.3005981445312 615.635009765625 615.9683837890625 616.3013305664062 0.10088435321252912 0.10618445130326398 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 4 5247 1838239.728515625 5 1.00286865234375 7.227063179016113e-08 0.9229311651701914 1.0161266422773116 0.0013760781360197894 False False False True False False False False False False 0.38252886122084645 0.3829030983180696 0.16760613168163346 0.05667263578789228 0.010289272991558147 461.4764099121094 461.7270812988281 461.97784423828125 462.228515625 462.47894287109375 0.10435222891810549 0.11488947820410278 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 2 5263 464765.828125 4 1.0020751953125 7.897615432739258e-07 0.8767404472179741 0.9279231261890819 0.0004103165570351758 False False False True False False False False False False 0.4205397724990068 0.3387635433206904 0.16604480892309578 0.07465187525720698 921.9442138671875 922.4458618164062 922.9468994140625 923.4490356445312 0.09912962450778352 0.10491665048999753 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 4 5263 1322493.66015625 5 1.00152587890625 2.913177013397217e-07 1.1010656704692936 1.0514592047523068 0.0026813136743757578 False False False True False False False False False False 0.38359850714903443 0.310106519680061 0.20619829471831383 0.06091621829815187 0.03918046015443889 461.4764099121094 461.7273254394531 461.977783203125 462.22808837890625 462.4787292480469 0.12449320299796286 0.11888439330372896 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 4 5279 711034.515625 4 1.0008544921875 9.853392839431763e-07 0.9964145730244355 1.0001027048487163 0.003224005582612932 False False False True False False False False False False 0.3985974749072154 0.3306744800473272 0.15877753077607498 0.11195051426938245 461.47625732421875 461.7271728515625 461.9773864746094 462.2283020019531 0.11266071137863065 0.11307771406629753 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 3 5215 356586.994140625 5 1.00286865234375 3.2857060432434083e-07 1.3803353534465017 1.2686969927674814 0.004905104558178289 False False False True False False False False False False 0.3257182469734044 0.3117354432833358 0.21255310238575253 0.06776577097647997 0.08222743638102731 614.9671020507812 615.3006591796875 615.635009765625 615.96923828125 616.3035888671875 0.15606913735548386 0.1434466231212063 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO E9PND2 E9PP21 E9PS42 P21291 HEEAPGHRPTTNPNASK [] [] 1841.8768716895202 76 27 27 119 0 5235 461.476684570313 4 0.0 0 5231 461.0 4 5215 636583.953125 4 1.004638671875 6.94766640663147e-07 1.0745410331371468 1.2348126109082105 0.0058880061628108216 False False False True False False False False False False 0.3181153713204511 0.41939877676680165 0.16460995313044555 0.09787589878230171 461.4763488769531 461.72674560546875 461.9779052734375 462.2287292480469 0.12149416565768245 0.1396154481582554 HEEAPGHRPTTNPNASK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P15924 HKQSLEEAAK [] [] 1139.5934862127199 48 15 17 81 0 5363 380.871520996094 3 0.0 0 5359 380.0 3 5359 398100.75244140625 5 1.0032806396484375 1.1237338185310363e-06 1.2706333054645902 1.3478553692900508 0.0016870581736247586 False False False False False False False False False False 0.46901694823468454 0.36980694201694175 0.12886000669529102 0.013655931294019978 0.018660171759062726 380.87152099609375 381.20587158203125 381.54052734375 381.8747253417969 382.20782470703125 0.14366555444940393 0.15239675216592818 HKQSLEEAAK +calisp_test_data_TargetPeptideSpectrumMatch.txt MKH_260min_1800ng HOMO P15924 HKQSLEEAAK [] [] 1139.5934862127199 48 15 17 81 0 5363 380.871520996094 3 0.0 0 5359 380.0 3 5375 676275.6352539062 5 1.002685546875 7.247552275657654e-07 1.129920041404187 1.0681299513263602 0.00035141829865935705 False False False False False False False False False False 0.534627931130854 0.31148213874497005 0.1028064193624199 0.04424334939630989 0.006840161365446044 380.8716735839844 381.2059326171875 381.5401611328125 381.8743896484375 382.20751953125 0.1277556542345409 0.12076928963010944 HKQSLEEAAK diff --git a/tools/calisp/test-data/summary.tsv b/tools/calisp/test-data/summary.tsv new file mode 100644 index 000000000..453e5b9eb --- /dev/null +++ b/tools/calisp/test-data/summary.tsv @@ -0,0 +1,4 @@ +file bins % label ratio peptide psm_mz n(patterns) mean intensity ratio_NA median N mean ratio_NA SEM ratio_FFT median ratio_FFT SEM False Positive +MKH_260min_1800ng HOMO 0 8944.383821398755 HEEAPGHRPTTNPNASK [] [] 461.476684570313 11 860150.8512961648 0.9935866375027093 0.0 0.05207457939795477 1.0161266422773116 0.04270105864239165 0 +MKH_260min_1800ng HOMO 0 8944.383821398755 HKQSLEEAAK [] [] 380.871520996094 2 537188.1938476562 1.2002766734343886 0.0 0.0703566320302016 1.2079926603082054 0.13986270898184525 0 +MKH_260min_1800ng HOMO 0 8944.383821398755 NHEEEMKDLR ['Oxidation'] [5] 439.534698486328 10 1699843.8217773438 0.988856432577341 0.0 0.07057874972895482 0.9880480975986452 0.06010014927442129 0 From 29b3080dbb6f32f3ba3c3a2367c5ceb2454dc3d5 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Mon, 11 Sep 2023 15:42:19 +0200 Subject: [PATCH 09/11] ignore some flake8 errors using `is False` seems not to work --- tools/calisp/benchmarking.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/calisp/benchmarking.py b/tools/calisp/benchmarking.py index 90f4441e0..e887002bb 100644 --- a/tools/calisp/benchmarking.py +++ b/tools/calisp/benchmarking.py @@ -99,24 +99,24 @@ def load_calisp_data(filename, factor): def filter_calisp_data(data, target): if target.lower() == "na": subdata = data.loc[ - lambda df: (df["flag_peak_at_minus_one_pos"] == False) - & (df["flag_pattern_is_wobbly"] == False) - & (df["flag_psm_has_low_confidence"] == False) - & (df["flag_psm_is_ambiguous"] == False) - & (df["flag_pattern_is_contaminated"] == False) - & (df["flag_peptide_assigned_to_multiple_bins"] == False), + lambda df: (df["flag_peak_at_minus_one_pos"] == False) # noqa: E712 + & (df["flag_pattern_is_wobbly"] == False) # noqa: E712 + & (df["flag_psm_has_low_confidence"] == False) # noqa: E712 + & (df["flag_psm_is_ambiguous"] == False) # noqa: E712 + & (df["flag_pattern_is_contaminated"] == False) # noqa: E712 + & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 :, ] elif target.lower() == "fft": subdata = data.loc[ lambda df: (df["error_fft"] < 0.001) - & (df["flag_peptide_assigned_to_multiple_bins"] == False), + & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 :, ] elif target.lower() == "clumpy": subdata = data.loc[ lambda df: (df["error_clumpy"] < 0.001) - & (df["flag_peptide_assigned_to_multiple_bins"] == False), + & (df["flag_peptide_assigned_to_multiple_bins"] == False), # noqa: E712 :, ] From a2cd91641519a51cfbab33b1c0e3ff83cf58a0ac Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 12 Sep 2023 16:26:14 +0200 Subject: [PATCH 10/11] fix python linting --- tools/calisp/benchmarking.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/calisp/benchmarking.py b/tools/calisp/benchmarking.py index e887002bb..ced0c42c9 100644 --- a/tools/calisp/benchmarking.py +++ b/tools/calisp/benchmarking.py @@ -1,8 +1,8 @@ import argparse import os -import pandas as pd import numpy as np +import pandas as pd # Define the ArgumentParser parser = argparse.ArgumentParser("List of natural abundances of the isotopes") From ce0e16c3258c12f91eda19455da2f0602055ccec Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 13 Sep 2023 16:24:53 +0200 Subject: [PATCH 11/11] fix tests and comment them (because they will run only for 23.1 anyway) --- tools/calisp/benchmarking.py | 4 ---- tools/calisp/calisp.xml | 30 ++++++++++++++++++++--------- tools/calisp/test-data/filtered.tsv | 24 ----------------------- 3 files changed, 21 insertions(+), 37 deletions(-) delete mode 100644 tools/calisp/test-data/filtered.tsv diff --git a/tools/calisp/benchmarking.py b/tools/calisp/benchmarking.py index ced0c42c9..f546a1db9 100644 --- a/tools/calisp/benchmarking.py +++ b/tools/calisp/benchmarking.py @@ -84,8 +84,6 @@ def load_calisp_data(filename, factor): data.to_csv(f"{base}.tsv", sep="\t", index=False) file_success_count = len(data["ms_run"].unique()) - # sort proteins to always get the same output - data["proteins"] = data["proteins"].sort_values() # (2) calculate deltas # ((1-f)/f) - 1 == 1/f -2 data["delta_na"] = data["ratio_na"] / ((1 / factor) - 2) * 1000 @@ -123,8 +121,6 @@ def filter_calisp_data(data, target): print( f"{len(subdata.index)} ({len(subdata.index)/len(data.index)*100:.1f}%) remaining after filters." ) - subdata["proteins"] = subdata["proteins"].sort_values() - return subdata diff --git a/tools/calisp/calisp.xml b/tools/calisp/calisp.xml index 22de6d31a..cb190dc25 100644 --- a/tools/calisp/calisp.xml +++ b/tools/calisp/calisp.xml @@ -105,7 +105,7 @@ calisp - + - + (using the same as the built in => same results) + + TODO: test will only work with 23.1 tool-utils package available --> + - +