From 307917b20c1c0831531da9486da67731088a4e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?=
Date: Tue, 31 Oct 2023 14:51:36 +0100
Subject: [PATCH] Fix filtering by categories

---
 Review notes (free text placed before the diffstat, outside any hunk):
 - Adds format_list_column() and makes export_tools() call it for the
   "ToolShed categories", "EDAM operation", "EDAM topic" and
   "Galaxy tool ids" columns instead of four inline apply() calls.
 - filter_tools() now uses the local `name` (instead of `tool.name`) to
   compute "Reviewed", and returns the filtered list.
 - The filtertools command reads the tools table with sep="\t" and
   keep_default_na=False.
 - The microgalaxy script is renamed, creates results/microgalaxy and
   passes --filtered_tools.
 - Reviewer change relative to the original commit: format_list_column()
   checks isinstance(x, list) on each cell. The original checked the
   pandas Series itself, which is never a list, so list cells were
   returned unjoined. The number of added lines is unchanged; the blob
   ids on the `index` line predate this change.

 bin/extract_galaxy_tools.py                   | 23 ++++++++++++++-----
 ...y_tools.sh => filter_microgalaxy_tools.sh} |  3 +++
 2 files changed, 20 insertions(+), 6 deletions(-)
 rename bin/{extract_microgalaxy_tools.sh => filter_microgalaxy_tools.sh} (88%)

diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py
index e0656f5c..15dcfb24 100644
--- a/bin/extract_galaxy_tools.py
+++ b/bin/extract_galaxy_tools.py
@@ -327,6 +327,16 @@ def parse_tools(repo):
     return tools
 
 
+def format_list_column(col):
+    """
+    Format a column that could be a list before exporting
+    """
+    # col is a pandas Series: test each cell, not the Series itself
+    return col.apply(
+        lambda x: ", ".join(str(i) for i in x) if isinstance(x, list) else x
+    )
+
+
 def export_tools(tools: list, output_fp: str) -> None:
     """
     Export tool metadata to tsv output file
@@ -335,10 +345,10 @@ def export_tools(tools: list, output_fp: str) -> None:
     :param output_fp: path to output file
     """
     df = pd.DataFrame(tools)
-    df["ToolShed categories"] = df["ToolShed categories"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["EDAM operation"] = df["EDAM operation"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["EDAM topic"] = df["EDAM topic"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["Galaxy tool ids"] = df["Galaxy tool ids"].apply(lambda x: ", ".join([str(i) for i in x]))
+    df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
+    df["EDAM operation"] = format_list_column(df["EDAM operation"])
+    df["EDAM topic"] = format_list_column(df["EDAM topic"])
+    df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
     df.to_csv(output_fp, sep="\t", index=False)
 
 
@@ -356,13 +366,14 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
         # filter ToolShed categories and leave function if not in expected categories
         if check_categories(tool["ToolShed categories"], ts_cat):
             name = tool["Galaxy wrapper id"]
-            tool["Reviewed"] = tool.name in keep_tools or tool.name in excluded_tools
+            tool["Reviewed"] = name in keep_tools or name in excluded_tools
             tool["To keep"] = None
             if name in keep_tools:
                 tool["To keep"] = True
             elif name in excluded_tools:
                 tool["To keep"] = False
             filtered_tools.append(tool)
+    return filtered_tools
 
 
 if __name__ == "__main__":
@@ -408,7 +419,7 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
         export_tools(tools, args.all_tools)
         print()
     elif args.command == "filtertools":
-        tools = pd.read_csv(Path(args.tools)).to_dict("records")
+        tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
         # get categories and tools to exclude
         categories = read_file(args.categories)
         excl_tools = read_file(args.exclude)
diff --git a/bin/extract_microgalaxy_tools.sh b/bin/filter_microgalaxy_tools.sh
similarity index 88%
rename from bin/extract_microgalaxy_tools.sh
rename to bin/filter_microgalaxy_tools.sh
index 12bf25ce..b7597276 100644
--- a/bin/extract_microgalaxy_tools.sh
+++ b/bin/filter_microgalaxy_tools.sh
@@ -10,9 +10,12 @@ curl \
     "https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \
     -o "data/microgalaxy/tools_to_exclude"
 
+mkdir -p 'results/microgalaxy'
+
 python bin/extract_galaxy_tools.py \
     filtertools \
     --tools 'results/all_tools.csv' \
+    --filtered_tools 'results/microgalaxy/tools.csv' \
     --categories "data/microgalaxy/categories" \
     --exclude "data/microgalaxy/tools_to_exclude" \
     --keep "data/microgalaxy/tools_to_keep"
\ No newline at end of file