diff --git a/tools/proteore_filter_keywords_values/.shed.yml b/tools/proteore_filter_keywords_values/.shed.yml new file mode 100644 index 000000000..ddaad82e5 --- /dev/null +++ b/tools/proteore_filter_keywords_values/.shed.yml @@ -0,0 +1,5 @@ +categories: [Proteomics] +description: ProteoRE - Filter a file by keywords or values +long_description: Filter a file by keywords or values +name: proteore_filter_keywords_values +owner: proteore diff --git a/tools/proteore_filter_keywords_values/README.rst b/tools/proteore_filter_keywords_values/README.rst new file mode 100644 index 000000000..bcf21be0f --- /dev/null +++ b/tools/proteore_filter_keywords_values/README.rst @@ -0,0 +1,102 @@ +Wrapper for Filter by keywords or numerical values Tool +======================================================= + +**Authors** + +T.P. Lien Nguyen, David Christiany, Florence Combes, Yves Vandenbrouck CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR + +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform + +This work has been partially funded through the French National Agency for Research (ANR) IFB project. + +Contact support@proteore.org for any questions or concerns about the Galaxy implementation of this tool. + +------------------------------------------------------- + +This tool allows to remove unneeded data (e.g. contaminants, non-significant values) from a proteomics results file (e.g. MaxQuant or Proline output). + +**Filter by keyword(s)** + +Several options can be used. For each option, you can fill in the field or upload a file which contains the keywords. 
+ +- If you choose to fill in the field, the keywords should be separated by ";", for example: A8K2U0;Q5TA79;O43175 + +- If you choose to upload a file, it must be a text file in which each line is a keyword, for example: + +REV + +TRYP_PIG + +ALDOA_RABBIT + +**Lines that contain these keywords will be removed from the input file.** + +**Keyword search can use either an exact match or a partial match, selected with the following option** + +- If you choose **Yes**, only the fields that contain exactly the same content will be removed. + +- If you choose **No**, all the fields containing the keyword will be removed. + +For example: + +**Yes** option (exact match) selected using the keyword "kinase": only lines which contain exactly "kinase" are removed. + +**No** option (partial match) for "kinase": not only lines which contain "kinase" but also lines with "alpha-kinase" (and so on) are removed. + +------------------------------------------------------- + +**Filter by values** + +You can filter your data by a column of numerical values. +Enter the column to be used and select one operator in the list: + +- = +- != +- < +- <= +- > +- >= + +Then enter the value to filter and specify the column to apply that option. +If a row contains a value that corresponds to your settings, it will be filtered out. + +------------------------------------------------------- + +**Filter by a range of values** + +You can also set a range of values to filter your file. +In contrast to the value filter, rows with values inside the defined range are kept. + +Rows with values outside of the defined range will be filtered out. + +------------------------------------------------------- + +**AND/OR operator** + +Since you can add as many filters as you want, you can choose how filters apply to your data. 
+ +AND or OR operator option works on all filters : + +- OR : only one filter to be satisfied to remove one row +- AND : all filters must be satisfied to remove one row + +------------------------------------------------------- + +**Sort the results files** + +You can sort the result file if you wish, it can help you to check results. + +In order to do so : enter the column to be used, all columns will be sorted according to the one filled in. + +Rows stay intact, just in different order like excel. +You can also choose ascending or descending order, by default descending order is set. + +------------------------------------------------------- + +**Output** + +The tool will produce 2 output files. + +* A text file containing the resulting filtered input file. + +* A text file containing the rows removed from the input file. \ No newline at end of file diff --git a/tools/proteore_filter_keywords_values/filter_kw_val.py b/tools/proteore_filter_keywords_values/filter_kw_val.py new file mode 100644 index 000000000..950785ca7 --- /dev/null +++ b/tools/proteore_filter_keywords_values/filter_kw_val.py @@ -0,0 +1,458 @@ +import argparse +import csv +import re +import sys + + +def options(): + """ + Parse options: + -i, --input Input filename and boolean value if the file contains header ["filename,true/false"] # noqa 501 + --kw Keyword to be filtered, the column number where this filter applies, + boolean value if the keyword should be filtered in exact ["keyword,ncol,true/false"]. 
+ This option can be repeated: --kw "kw1,c1,true" --kw "kw2,c1,false" --kw "kw3,c2,true" + --kwfile A file that contains keywords to be filter, the column where this filter applies and + boolean value if the keyword should be filtered in exact ["filename,ncol,true/false"] + --value The value to be filtered, the column number where this filter applies and the + operation symbol ["value,ncol,=/>/>=/", "Equal-or-higher": ">=", "Lower": "<", "Equal-or-lower": "<=", "Different": "!="} # noqa 501 + + if args.kw: + keywords = args.kw + for k in keywords: + results_dict = filter_keyword(csv_file, + header, + results_dict, + k[0], + k[1], + k[2]) + + if args.kw_file: + key_files = args.kw_file + for kf in key_files: + header = str_to_bool(kf[1]) + ncol = column_from_txt(kf[2], csv_file) + keywords = read_keywords_file(kf[0], header, ncol) + results_dict = filter_keyword(csv_file, header, results_dict, + keywords, kf[3], kf[4]) + + if args.value: + for v in args.value: + v[0] = v[0].replace(",", ".") + v[2] = operator_dict[v[2]] + if is_number("float", v[0]): + csv_file = comma_number_to_float(csv_file, + column_from_txt( + v[1], csv_file), header) + results_dict = filter_value(csv_file, header, + results_dict, v[0], v[1], v[2]) + else: + raise ValueError("Please enter a number in filter by value") + + if args.values_range: + for vr in args.values_range: + vr[:2] = [value.replace(",", ".") for value in vr[:2]] + csv_file = comma_number_to_float(csv_file, + column_from_txt( + vr[2], csv_file), header) + if (is_number("float", vr[0]) or is_number("int", vr[0])) and (is_number("float", vr[1]) or is_number("int", vr[1])): # noqa 501 + results_dict = filter_values_range(csv_file, + header, results_dict, + vr[0], vr[1], vr[2], vr[3]) + + remaining_lines = [] + filtered_lines = [] + + if header is True: + remaining_lines.append(csv_file[0]) + filtered_lines.append(csv_file[0]) + + if results_dict == {}: # no filter used + remaining_lines.extend(csv_file[1:]) + else: + for id_line, 
line in enumerate(csv_file): + if id_line in results_dict: # skip header and empty lines + if args.operator == 'OR': + if any(results_dict[id_line]): + filtered_lines.append(line) + else: + remaining_lines.append(line) + + elif args.operator == "AND": + if all(results_dict[id_line]): + filtered_lines.append(line) + else: + remaining_lines.append(line) + + # sort results by column + if args.sort_col: + sort_col = args.sort_col.split(",")[0] + sort_col = column_from_txt(sort_col, csv_file) + reverse = str_to_bool(args.sort_col.split(",")[1]) + remaining_lines = sort_by_column(remaining_lines, sort_col, + reverse, header) + filtered_lines = sort_by_column(filtered_lines, sort_col, + reverse, header) + + # swap lists of lines (files) if 'keep' option selected + if args.operation == "keep": + swap = remaining_lines, filtered_lines + remaining_lines = swap[1] + filtered_lines = swap[0] + + # Write results to output + with open(args.output, "w") as output: + writer = csv.writer(output, delimiter="\t") + writer.writerows(remaining_lines) + + # Write filtered lines to filtered_output + with open(args.discarded_lines, "w") as filtered_output: + writer = csv.writer(filtered_output, delimiter="\t") + writer.writerows(filtered_lines) + +# function to sort the csv_file by value in a specific column + + +def sort_by_column(tab, sort_col, reverse, header): + + if len(tab) > 1: # if there's more than just a header or 1 row + if header: + head = tab[0] + tab = tab[1:] + + # indices of lines whose sort-column cell is empty or NA (not sortable) + unsortable_lines = [i for i, line in enumerate(tab) if (line[sort_col]=='' or line[sort_col] == 'NA')] # noqa 501 + unsorted_tab = [tab[i] for i in unsortable_lines] + tab = [line for i, line in enumerate(tab) if i not in unsortable_lines] + + if only_number(tab, sort_col) and any_float(tab, sort_col): + tab = comma_number_to_float(tab, sort_col, False) + tab = sorted(tab, key=lambda row: float(row[sort_col]), + reverse=reverse) + elif only_number(tab, sort_col): + 
tab = sorted(tab, key=lambda row: int(row[sort_col]), + reverse=reverse) + else: + tab = sorted(tab, key=lambda row: row[sort_col], reverse=reverse) + + tab.extend(unsorted_tab) + if header is True: + tab = [head] + tab + + return tab + + +# replace all blank cells ("", " " or "NaN") with NA + + +def blank_to_NA(csv_file): + + tmp = [] + for line in csv_file: + line = ["NA" if cell=="" or cell==" " or cell=="NaN" else cell for cell in line ] # noqa 501 + tmp.append(line) + + return tmp + +# replace decimal commas with dots in a column (cells stay strings) + + +def comma_number_to_float(csv_file, ncol, header): + if header: + tmp = [csv_file[0]] + csv_file = csv_file[1:] + else: + tmp = [] + + for line in csv_file: + line[ncol] = line[ncol].replace(",", ".") + tmp.append(line) + + return (tmp) + +# return True if there is at least one float in the column + + +def any_float(tab, col): + + for line in tab: + if is_number("float", line[col].replace(",", ".")): + return True + + return False + + +def only_number(tab, col): + for line in tab: + if not (is_number("float", line[col].replace(",", ".")) or is_number("int", line[col].replace(",", "."))): # noqa 501 + return False + return True + +# Read the keywords file to extract the list of keywords + + +def read_keywords_file(filename, header, ncol): + with open(filename, "r") as csv_file: + lines = csv.reader(csv_file, delimiter='\t') + lines = blank_to_NA(lines) + if (len(lines[0])) > 1: + keywords = [line[ncol] for line in lines] + else: + keywords = ["".join(key) for key in lines] + if header: + keywords = keywords[1:] + keywords = list(set(keywords)) + + return keywords + +# Read input file + + +def read_file(filename): + with open(filename, "r") as f: + reader = csv.reader(f, delimiter="\t") + tab = list(reader) + + # Remove empty lines (contain only space or new line or "") + # [tab.remove(blank) for blank in tab if blank.isspace() or blank == ""] + tab = [line for line in tab if len("".join(line).replace(" ", "")) != 0] # noqa 501 + + return tab + +# seek for keywords in 
rows of csvfile, return a dictionary of boolean +# (true if keyword found, false otherwise) + + +def filter_keyword(csv_file, header, results_dict, keywords, ncol, match): + match = str_to_bool(match) + ncol = column_from_txt(ncol, csv_file) + if type(keywords) != list: + keywords = keywords.upper().split() # uppercase, then split the keyword string on whitespace + + for id_line, line in enumerate(csv_file): + if header is True and id_line == 0: + continue + keyword_inline = line[ncol].replace('"', "").split(";") + + # exact match (True) or substring match (False) + if match is True: + found_in_line = any(pid.upper() in keywords for pid in keyword_inline) # noqa 501 + else: + found_in_line = any(ft in pid.upper() for pid in keyword_inline for ft in keywords) # noqa 501 + + # record the result (found or not) for this line + if id_line in results_dict: + results_dict[id_line].append(found_in_line) + else: + results_dict[id_line] = [found_in_line] + + return results_dict + +# filter by a determined value in rows of csvfile, return a dictionary +# of boolean (true if value filtered, false otherwise) + + +def filter_value(csv_file, header, results_dict, filter_value, ncol, opt): + + filter_value = float(filter_value) + ncol = column_from_txt(ncol, csv_file) + nb_string = 0 + + for id_line, line in enumerate(csv_file): + if header is True and id_line == 0: + continue + value = line[ncol].replace('"', "").replace(",", ".").strip() + if value.replace(".", "", 1).isdigit(): + to_filter = value_compare(value, filter_value, opt) + + # adding the result to the dictionary + if id_line in results_dict: + results_dict[id_line].append(to_filter) + else: + results_dict[id_line] = [to_filter] + + # impossible to treat (ex : "" instead of a number; signed values such as "-1.5" also fail the isdigit() test), + # we keep the line by default + else: + nb_string += 1 + if id_line in results_dict: + results_dict[id_line].append(False) + else: + results_dict[id_line] = [False] + + # number of lines in the csv file + if header: + nb_lines = len(csv_file) - 1 + else: + nb_lines = len(csv_file) + + # if there's no 
numeric value in the column + if nb_string == nb_lines: + print('No numeric values found in the column ' + str(ncol + 1)) + print('The filter "'+str(opt)+' '+str(filter_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 + + return results_dict + +# filter by a range of values in rows of csvfile, return a dictionary +# of boolean (true if value is outside the range, false otherwise) + + +def filter_values_range(csv_file, header, results_dict, bottom_value, top_value, ncol, inclusive): # noqa 501 + inclusive = str_to_bool(inclusive) + bottom_value = float(bottom_value) + top_value = float(top_value) + ncol = column_from_txt(ncol, csv_file) + nb_string = 0 + + for id_line, line in enumerate(csv_file): + if header is True and id_line == 0: + continue + value = line[ncol].replace('"', "").replace(",", ".").strip() + if value.replace(".", "", 1).isdigit(): + value = float(value) + if inclusive is True: + in_range = not (bottom_value <= value <= top_value) + else: + in_range = not (bottom_value < value < top_value) + + # adding the result to the dictionary + if id_line in results_dict: + results_dict[id_line].append(in_range) + else: + results_dict[id_line] = [in_range] + + # impossible to treat (ex : "" instead of a number; signed values such as "-1.5" also fail the isdigit() test), + # we keep the line by default + else: + nb_string += 1 + if id_line in results_dict: + results_dict[id_line].append(False) + else: + results_dict[id_line] = [False] + + # number of lines in the csv file + if header: + nb_lines = len(csv_file) - 1 + else: + nb_lines = len(csv_file) + + # if there's no numeric value in the column + if nb_string == nb_lines: + print('No numeric values found in the column ' + str(ncol + 1)) + if inclusive: + print ('The filter "'+str(bottom_value)+' <= x <= '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 + else: + print ('The filter "'+str(bottom_value)+' < x < '+str(top_value)+'" can not be applied on the column '+str(ncol+1)) # noqa 501 + + return results_dict + + +def 
column_from_txt(ncol, file): + if is_number("int", ncol.replace("c", "")): + ncol = int(ncol.replace("c", "")) - 1 + else: + raise ValueError("Please specify the column where " + "you would like to apply the filter " + "with valid format") + + proper_ncol(ncol, file) + + return ncol + +# return True if value is in the determined values, false otherwise + + +def value_compare(value, filter_value, opt): + test_value = False + + if opt == "<": + if float(value) < filter_value: + test_value = True + elif opt == "<=": + if float(value) <= filter_value: + test_value = True + elif opt == ">": + if float(value) > filter_value: + test_value = True + elif opt == ">=": + if float(value) >= filter_value: + test_value = True + elif opt == "=": + if float(value) == filter_value: + test_value = True + elif opt == "!=": + if float(value) != filter_value: + test_value = True + + return test_value + + +if __name__ == "__main__": + options() diff --git a/tools/proteore_filter_keywords_values/filter_kw_val.xml b/tools/proteore_filter_keywords_values/filter_kw_val.xml new file mode 100644 index 000000000..322a9ebf7 --- /dev/null +++ b/tools/proteore_filter_keywords_values/filter_kw_val.xml @@ -0,0 +1,280 @@ + + + + + + + + + + + + + + + + + + + + [c]{0,1}[0-9]+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + [c]{0,1}[0-9]+ + + + + + + + + + + + + + + [c]{0,1}[0-9]+ + + + + + + + + + + [c]{0,1}[0-9] + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (greater than) +- >= (greater than or equal to) + +Then enter the numerical threshold to apply by filling the "Value" box. +If you choose > 10, each row containing a numerical value (in the chosen column of your input file) that correspond to your settings will be kept or discarded (based on operation parameter). + +----- + +**Filter by a range of values**: You can also set a range of values to filter your file. 
+Unlike the numeric filter, rows with numerical values within the defined range will be kept while rows with values out of this range will be discarded (or the other way around based on the operation parameter). + +----- + +**Sort by column ?** +Clicking the "Yes" button lets you "Sort result files by:" a column number. This can be done in ascending (default value) or descending order by entering the column number on which to sort the data. + +----- + +**Output** + +The tool returns two output files. + +* A text file containing the results that satisfy your filters (i.e. "keep" mode). + +* A text file containing the rows removed from the input file (i.e. "discard" mode). + +----- + +.. class:: infomark + +**Authors** + +David Christiany, Lien Nguyen, Florence Combes, Yves Vandenbrouck - CEA, INSERM, CNRS, Grenoble-Alpes University, BIG Institute, FR + +Sandra Dérozier, Olivier Rué, Christophe Caron, Valentin Loux - INRA, Paris-Saclay University, MAIAGE Unit, Migale Bioinformatics platform, FR + +This work has been partially funded through the French National Agency for Research (ANR) IFB project. + +Help: contact@proteore.org for any questions or concerns about this tool. 
+ + ]]> + + doi:10.1186/1471-2105-15-293 + @book{toolsProteoRE, + journal = {GitHub repository}, + publisher = {GitHub}, + title = {ProteoRE tools}, + url = {https://github.com/galaxyproteomics/tools-galaxyp}, + author = {Lien Nguyen, David Chistiany, Florence Combes,Christophe Caron, Valentin Loux Yves Vandenbrouck}, + date = {2021}, + year = {2021}, + } + diff --git a/tools/proteore_filter_keywords_values/releases_notes.rst b/tools/proteore_filter_keywords_values/releases_notes.rst new file mode 100644 index 000000000..4a3d9fee0 --- /dev/null +++ b/tools/proteore_filter_keywords_values/releases_notes.rst @@ -0,0 +1,20 @@ +=================================== +Filter by keywords - Releases Notes +=================================== + +================== +2019-03-15 Release +================== + +New +--- + +Improvements +------------ + +- more intuitive outputs : + 1. Filtered + 2. Filtered - discarded lines + +Fixes +----- diff --git a/tools/proteore_filter_keywords_values/test-data/Lacombe_et_al_2017_OK.txt b/tools/proteore_filter_keywords_values/test-data/Lacombe_et_al_2017_OK.txt new file mode 100644 index 000000000..188c01b91 --- /dev/null +++ b/tools/proteore_filter_keywords_values/test-data/Lacombe_et_al_2017_OK.txt @@ -0,0 +1,165 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) +P15924 Desmoplakin 69 +P02538 Keratin, type II cytoskeletal 6A 53 +P02768 Serum albumin 44 +P08779 Keratin, type I cytoskeletal 16 29 +Q02413 Desmoglein-1 24 +P07355 "Annexin A2;Putative annexin A2-like protein" 22 +P14923 Junction plakoglobin 22 +P02788 Lactotransferrin 21 +Q9HC84 Mucin-5B 21 +P29508 Serpin B3 20 +P63261 Actin, cytoplasmic 2 19 +Q8N1N4 Keratin, type II cytoskeletal 78 18 +Q04695 Keratin, type I cytoskeletal 17 18 +P01876 Ig alpha-1 chain C region 16 +Q01469 Fatty acid-binding protein 5, epidermal 15 +P31944 Caspase-14 15 +P01833 Polymeric immunoglobulin receptor 15 +P06733 Alpha-enolase 15 +P25311 Zinc-alpha-2-glycoprotein 15 
+Q15149 Plectin 15 +P19013 Keratin, type II cytoskeletal 4 13 +Q6KB66 Keratin, type II cytoskeletal 80 13 +Q08188 Protein-glutamine gamma-glutamyltransferase E 12 +P13646 Keratin, type I cytoskeletal 13 11 +Q86YZ3 Hornerin 11 +P04259 Keratin, type II cytoskeletal 6B 10 +P02545 "Prelamin-A/C;Lamin-A/C" 10 +P04083 Annexin A1 10 +P11021 78 kDa glucose-regulated protein 10 +P02787 Serotransferrin 9 +P04040 Catalase 9 +P31151 Protein S100-A7 9 +P31947 14-3-3 protein sigma 9 +Q96P63 Serpin B12 9 +P14618 Pyruvate kinase PKM 9 +P60174 Triosephosphate isomerase 9 +Q06830 Peroxiredoxin-1 9 +P01040 Cystatin-A 8 +P05089 Arginase-1 8 +P01834 Ig kappa chain C region 8 +P04406 Glyceraldehyde-3-phosphate dehydrogenase 8 +P0DMV9 Heat shock 70 kDa protein 1B 8 +P13639 Elongation factor 2 8 +P35579 Myosin-9 8 +P68371 Tubulin beta-4B chain 8 +Q8WVV4 Protein POF1B 8 +O75635 Serpin B7 7 +P01857 Ig gamma-1 chain C region 7 +P61626 Lysozyme C 7 +P68363 Tubulin alpha-1B chain 7 +P01009 "Alpha-1-antitrypsin;Short peptide from AAT" 6 +P07900 Heat shock protein HSP 90-alpha 6 +Q9NZH8 Interleukin-36 gamma 6 +O43707 "Alpha-actinin-4;Alpha-actinin-1" 6 +O75223 Gamma-glutamylcyclotransferase 6 +P00338 L-lactate dehydrogenase A chain 6 +P07339 Cathepsin D 6 +P62987 Ubiquitin-60S ribosomal protein L40 6 +P10599 Thioredoxin 6 +Q9UGM3 Deleted in malignant brain tumors 1 protein 6 +Q9UI42 Carboxypeptidase A4 6 +P47929 Galectin-7 5 +Q13867 Bleomycin hydrolase 5 +Q6P4A8 Phospholipase B-like 1 5 +O75369 Filamin-B 5 +P00441 Superoxide dismutase [Cu-Zn] 5 +P04792 Heat shock protein beta-1 5 +P11142 Heat shock cognate 71 kDa protein 5 +P58107 Epiplakin 5 +P60842 Eukaryotic initiation factor 4A-I 5 +P62937 Peptidyl-prolyl cis-trans isomerase A 5 +P63104 14-3-3 protein zeta/delta 5 +Q92820 Gamma-glutamyl hydrolase 5 +O75342 Arachidonate 12-lipoxygenase, 12R-type 4 +P09211 Glutathione S-transferase P 4 +P31025 Lipocalin-1 4 +P48594 Serpin B4 4 +Q14574 Desmocollin-3 4 +Q5T750 Skin-specific protein 32 4 +Q6UWP8 
Suprabasin 4 +O60911 Cathepsin L2 4 +P00558 Phosphoglycerate kinase 1 4 +P04075 Fructose-bisphosphate aldolase A 4 +P07384 Calpain-1 catalytic subunit 4 +P0CG05 Ig lambda-2 chain C regions 4 +P18206 Vinculin 4 +P62258 14-3-3 protein epsilon 4 +P68871 Hemoglobin subunit beta 4 +Q9C075 Keratin, type I cytoskeletal 23 4 +A8K2U0 Alpha-2-macroglobulin-like protein 1 3 +P00738 Haptoglobin 3 +P01011 Alpha-1-antichymotrypsin 3 +P02763 Alpha-1-acid glycoprotein 1 3 +P18510 Interleukin-1 receptor antagonist protein 3 +P22528 Cornifin-B 3 +P30740 Leukocyte elastase inhibitor 3 +P80188 Neutrophil gelatinase-associated lipocalin 3 +Q15828 Cystatin-M 3 +Q9HCY8 Protein S100-A14 3 +P01623 Ig kappa chain V-III region 3 +P01877 Ig alpha-2 chain C region 3 +P06396 Gelsolin 3 +P14735 Insulin-degrading enzyme 3 +P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3 +P25788 Proteasome subunit alpha type-3 3 +P26641 Elongation factor 1-gamma 3 +P36952 Serpin B5 3 +P40926 Malate dehydrogenase, mitochondrial 3 +Q9Y6R7 IgGFc-binding protein 3 +O95274 Ly6/PLAUR domain-containing protein 3 2 +P00491 Purine nucleoside phosphorylase 2 +P04080 Cystatin-B 2 +P09972 Fructose-bisphosphate aldolase C 2 +P19012 Keratin, type I cytoskeletal 15 2 +P20930 Filaggrin 2 +Q96FX8 p53 apoptosis effector related to PMP-22 2 +Q9UIV8 Serpin B13 2 +P01625 Ig kappa chain V-IV region Len 2 +P01765 Ig heavy chain V-III region TIL 2 +P01766 Ig heavy chain V-III region BRO 2 +P01860 Ig gamma-3 chain C region 2 +P01871 Ig mu chain C region 2 +P05090 Apolipoprotein D 2 +P06870 Kallikrein-1 2 +P07858 Cathepsin B 2 +P08865 40S ribosomal protein SA 2 +P11279 Lysosome-associated membrane glycoprotein 1 2 +P13473 Lysosome-associated membrane glycoprotein 2 2 +P19971 Thymidine phosphorylase 2 +P23284 Peptidyl-prolyl cis-trans isomerase B 2 +P23396 40S ribosomal protein S3 2 +P25705 ATP synthase subunit alpha, mitochondrial 2 +P27482 Calmodulin-like protein 3 2 +P31949 Protein S100-A11 2 +P40121 Macrophage-capping protein 2 
+P42357 Histidine ammonia-lyase 2 +P47756 F-actin-capping protein subunit beta 2 +P48637 Glutathione synthetase 2 +P49720 Proteasome subunit beta type-3 2 +P50395 Rab GDP dissociation inhibitor beta 2 +P59998 Actin-related protein 2/3 complex subunit 4 2 +P61160 Actin-related protein 2 2 +P61916 Epididymal secretory protein E1 2 +P04745 Alpha-amylase 1 23 +Q9NZT1 Calmodulin-like protein 5 8 +P12273 Prolactin-inducible protein 6 +Q96DA0 Zymogen granule protein 16 homolog B 5 +P01036 Cystatin-S 5 +Q8TAX7 Mucin-7 2 +P01037 Cystatin-SN 2 +P09228 Cystatin-SA 2 +P04264 Keratin, type II cytoskeletal 1 61 +P35908 Keratin, type II cytoskeletal 2 epidermal 40 +P13645 Keratin, type I cytoskeletal 10 40 +Q5D862 Filaggrin-2 14 +Q5T749 Keratinocyte proline-rich protein 13 +Q8IW75 Serpin A12 3 +P81605 Dermcidin 3 +P22531 Small proline-rich protein 2E 3 +P59666 Neutrophil defensin 3 2 +P78386 Keratin, type II cuticular Hb5 2 + + + diff --git a/tools/proteore_filter_keywords_values/test-data/discarded_lines.tsv b/tools/proteore_filter_keywords_values/test-data/discarded_lines.tsv new file mode 100644 index 000000000..0bb9d57b2 --- /dev/null +++ b/tools/proteore_filter_keywords_values/test-data/discarded_lines.tsv @@ -0,0 +1,21 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) +P15924 Desmoplakin 69 +P02538 Keratin, type II cytoskeletal 6A 53 +P02768 Serum albumin 44 +P08779 Keratin, type I cytoskeletal 16 29 +Q02413 Desmoglein-1 24 +P07355 Annexin A2;Putative annexin A2-like protein 22 +P14923 Junction plakoglobin 22 +P02788 Lactotransferrin 21 +Q9HC84 Mucin-5B 21 +P04745 Alpha-amylase 1 23 +P04264 Keratin, type II cytoskeletal 1 61 +P35908 Keratin, type II cytoskeletal 2 epidermal 40 +P13645 Keratin, type I cytoskeletal 10 40 +Q5D862 Filaggrin-2 14 +Q5T749 Keratinocyte proline-rich protein 13 +Q8IW75 Serpin A12 3 +P81605 Dermcidin 3 +P22531 Small proline-rich protein 2E 3 +P59666 Neutrophil defensin 3 2 +P78386 Keratin, type II cuticular Hb5 2 
diff --git a/tools/proteore_filter_keywords_values/test-data/output.tsv b/tools/proteore_filter_keywords_values/test-data/output.tsv new file mode 100644 index 000000000..8682b80b8 --- /dev/null +++ b/tools/proteore_filter_keywords_values/test-data/output.tsv @@ -0,0 +1,142 @@ +Protein accession number (UniProt) Protein name Number of peptides (razor + unique) +P29508 Serpin B3 20 +P63261 Actin, cytoplasmic 2 19 +Q8N1N4 Keratin, type II cytoskeletal 78 18 +Q04695 Keratin, type I cytoskeletal 17 18 +P01876 Ig alpha-1 chain C region 16 +Q01469 Fatty acid-binding protein 5, epidermal 15 +P31944 Caspase-14 15 +P01833 Polymeric immunoglobulin receptor 15 +P06733 Alpha-enolase 15 +P25311 Zinc-alpha-2-glycoprotein 15 +Q15149 Plectin 15 +P19013 Keratin, type II cytoskeletal 4 13 +Q6KB66 Keratin, type II cytoskeletal 80 13 +Q08188 Protein-glutamine gamma-glutamyltransferase E 12 +P13646 Keratin, type I cytoskeletal 13 11 +Q86YZ3 Hornerin 11 +P04259 Keratin, type II cytoskeletal 6B 10 +P02545 Prelamin-A/C;Lamin-A/C 10 +P04083 Annexin A1 10 +P11021 78 kDa glucose-regulated protein 10 +P02787 Serotransferrin 9 +P04040 Catalase 9 +P31151 Protein S100-A7 9 +P31947 14-3-3 protein sigma 9 +Q96P63 Serpin B12 9 +P14618 Pyruvate kinase PKM 9 +P60174 Triosephosphate isomerase 9 +Q06830 Peroxiredoxin-1 9 +P01040 Cystatin-A 8 +P05089 Arginase-1 8 +P01834 Ig kappa chain C region 8 +P04406 Glyceraldehyde-3-phosphate dehydrogenase 8 +P0DMV9 Heat shock 70 kDa protein 1B 8 +P13639 Elongation factor 2 8 +P35579 Myosin-9 8 +P68371 Tubulin beta-4B chain 8 +Q8WVV4 Protein POF1B 8 +O75635 Serpin B7 7 +P01857 Ig gamma-1 chain C region 7 +P61626 Lysozyme C 7 +P68363 Tubulin alpha-1B chain 7 +P01009 Alpha-1-antitrypsin;Short peptide from AAT 6 +P07900 Heat shock protein HSP 90-alpha 6 +Q9NZH8 Interleukin-36 gamma 6 +O43707 Alpha-actinin-4;Alpha-actinin-1 6 +O75223 Gamma-glutamylcyclotransferase 6 +P00338 L-lactate dehydrogenase A chain 6 +P07339 Cathepsin D 6 +P62987 Ubiquitin-60S ribosomal protein 
L40 6 +P10599 Thioredoxin 6 +Q9UGM3 Deleted in malignant brain tumors 1 protein 6 +Q9UI42 Carboxypeptidase A4 6 +P47929 Galectin-7 5 +Q13867 Bleomycin hydrolase 5 +Q6P4A8 Phospholipase B-like 1 5 +O75369 Filamin-B 5 +P00441 Superoxide dismutase [Cu-Zn] 5 +P04792 Heat shock protein beta-1 5 +P11142 Heat shock cognate 71 kDa protein 5 +P58107 Epiplakin 5 +P60842 Eukaryotic initiation factor 4A-I 5 +P62937 Peptidyl-prolyl cis-trans isomerase A 5 +P63104 14-3-3 protein zeta/delta 5 +Q92820 Gamma-glutamyl hydrolase 5 +O75342 Arachidonate 12-lipoxygenase, 12R-type 4 +P09211 Glutathione S-transferase P 4 +P31025 Lipocalin-1 4 +P48594 Serpin B4 4 +Q14574 Desmocollin-3 4 +Q5T750 Skin-specific protein 32 4 +Q6UWP8 Suprabasin 4 +O60911 Cathepsin L2 4 +P00558 Phosphoglycerate kinase 1 4 +P04075 Fructose-bisphosphate aldolase A 4 +P07384 Calpain-1 catalytic subunit 4 +P0CG05 Ig lambda-2 chain C regions 4 +P18206 Vinculin 4 +P62258 14-3-3 protein epsilon 4 +P68871 Hemoglobin subunit beta 4 +Q9C075 Keratin, type I cytoskeletal 23 4 +A8K2U0 Alpha-2-macroglobulin-like protein 1 3 +P00738 Haptoglobin 3 +P01011 Alpha-1-antichymotrypsin 3 +P02763 Alpha-1-acid glycoprotein 1 3 +P18510 Interleukin-1 receptor antagonist protein 3 +P22528 Cornifin-B 3 +P30740 Leukocyte elastase inhibitor 3 +P80188 Neutrophil gelatinase-associated lipocalin 3 +Q15828 Cystatin-M 3 +Q9HCY8 Protein S100-A14 3 +P01623 Ig kappa chain V-III region 3 +P01877 Ig alpha-2 chain C region 3 +P06396 Gelsolin 3 +P14735 Insulin-degrading enzyme 3 +P20933 N(4)-(beta-N-acetylglucosaminyl)-L-asparaginase 3 +P25788 Proteasome subunit alpha type-3 3 +P26641 Elongation factor 1-gamma 3 +P36952 Serpin B5 3 +P40926 Malate dehydrogenase, mitochondrial 3 +Q9Y6R7 IgGFc-binding protein 3 +O95274 Ly6/PLAUR domain-containing protein 3 2 +P00491 Purine nucleoside phosphorylase 2 +P04080 Cystatin-B 2 +P09972 Fructose-bisphosphate aldolase C 2 +P19012 Keratin, type I cytoskeletal 15 2 +P20930 Filaggrin 2 +Q96FX8 p53 apoptosis effector 
related to PMP-22 2 +Q9UIV8 Serpin B13 2 +P01625 Ig kappa chain V-IV region Len 2 +P01765 Ig heavy chain V-III region TIL 2 +P01766 Ig heavy chain V-III region BRO 2 +P01860 Ig gamma-3 chain C region 2 +P01871 Ig mu chain C region 2 +P05090 Apolipoprotein D 2 +P06870 Kallikrein-1 2 +P07858 Cathepsin B 2 +P08865 40S ribosomal protein SA 2 +P11279 Lysosome-associated membrane glycoprotein 1 2 +P13473 Lysosome-associated membrane glycoprotein 2 2 +P19971 Thymidine phosphorylase 2 +P23284 Peptidyl-prolyl cis-trans isomerase B 2 +P23396 40S ribosomal protein S3 2 +P25705 ATP synthase subunit alpha, mitochondrial 2 +P27482 Calmodulin-like protein 3 2 +P31949 Protein S100-A11 2 +P40121 Macrophage-capping protein 2 +P42357 Histidine ammonia-lyase 2 +P47756 F-actin-capping protein subunit beta 2 +P48637 Glutathione synthetase 2 +P49720 Proteasome subunit beta type-3 2 +P50395 Rab GDP dissociation inhibitor beta 2 +P59998 Actin-related protein 2/3 complex subunit 4 2 +P61160 Actin-related protein 2 2 +P61916 Epididymal secretory protein E1 2 +Q9NZT1 Calmodulin-like protein 5 8 +P12273 Prolactin-inducible protein 6 +Q96DA0 Zymogen granule protein 16 homolog B 5 +P01036 Cystatin-S 5 +Q8TAX7 Mucin-7 2 +P01037 Cystatin-SN 2 +P09228 Cystatin-SA 2