From 307917b20c1c0831531da9486da67731088a4e69 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?B=C3=A9r=C3=A9nice=20Batut?= <bebatut@gmail.com>
Date: Tue, 31 Oct 2023 14:51:36 +0100
Subject: [PATCH] Fix filtering by categories

---
 bin/extract_galaxy_tools.py                   | 23 ++++++++++++++-----
 ...y_tools.sh => filter_microgalaxy_tools.sh} |  3 +++
 2 files changed, 20 insertions(+), 6 deletions(-)
 rename bin/{extract_microgalaxy_tools.sh => filter_microgalaxy_tools.sh} (88%)

diff --git a/bin/extract_galaxy_tools.py b/bin/extract_galaxy_tools.py
index e0656f5c..15dcfb24 100644
--- a/bin/extract_galaxy_tools.py
+++ b/bin/extract_galaxy_tools.py
@@ -327,6 +327,16 @@ def parse_tools(repo):
     return tools
 
 
+def format_list_column(col):
+    """
+    Join list cells of a pandas Series into ", "-separated strings.
+
+    The check is per cell: `col` is a Series (never a list), and cells that are
+    already strings (e.g. reloaded from the TSV) are returned unchanged.
+    """
+    return col.apply(lambda x: ", ".join(str(i) for i in x) if isinstance(x, list) else x)
+
+
 def export_tools(tools: list, output_fp: str) -> None:
     """
     Export tool metadata to tsv output file
@@ -335,10 +345,10 @@ def export_tools(tools: list, output_fp: str) -> None:
     :param output_fp: path to output file
     """
     df = pd.DataFrame(tools)
-    df["ToolShed categories"] = df["ToolShed categories"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["EDAM operation"] = df["EDAM operation"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["EDAM topic"] = df["EDAM topic"].apply(lambda x: ", ".join([str(i) for i in x]))
-    df["Galaxy tool ids"] = df["Galaxy tool ids"].apply(lambda x: ", ".join([str(i) for i in x]))
+    df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
+    df["EDAM operation"] = format_list_column(df["EDAM operation"])
+    df["EDAM topic"] = format_list_column(df["EDAM topic"])
+    df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
     df.to_csv(output_fp, sep="\t", index=False)
 
 
@@ -356,13 +366,14 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
         # filter ToolShed categories and leave function if not in expected categories
         if check_categories(tool["ToolShed categories"], ts_cat):
             name = tool["Galaxy wrapper id"]
-            tool["Reviewed"] = tool.name in keep_tools or tool.name in excluded_tools
+            tool["Reviewed"] = name in keep_tools or name in excluded_tools
             tool["To keep"] = None
             if name in keep_tools:
                 tool["To keep"] = True
             elif name in excluded_tools:
                 tool["To keep"] = False
             filtered_tools.append(tool)
+    return filtered_tools
 
 
 if __name__ == "__main__":
@@ -408,7 +419,7 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
             export_tools(tools, args.all_tools)
             print()
     elif args.command == "filtertools":
-        tools = pd.read_csv(Path(args.tools)).to_dict("records")
+        tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
         # get categories and tools to exclude
         categories = read_file(args.categories)
         excl_tools = read_file(args.exclude)
diff --git a/bin/extract_microgalaxy_tools.sh b/bin/filter_microgalaxy_tools.sh
similarity index 88%
rename from bin/extract_microgalaxy_tools.sh
rename to bin/filter_microgalaxy_tools.sh
index 12bf25ce..b7597276 100644
--- a/bin/extract_microgalaxy_tools.sh
+++ b/bin/filter_microgalaxy_tools.sh
@@ -10,9 +10,12 @@ curl \
         "https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \
         -o "data/microgalaxy/tools_to_exclude"
 
+mkdir -p 'results/microgalaxy'
+
 python bin/extract_galaxy_tools.py \
         filtertools \
         --tools 'results/all_tools.csv' \
+        --filtered_tools 'results/microgalaxy/tools.csv' \
         --categories "data/microgalaxy/categories" \
         --exclude "data/microgalaxy/tools_to_exclude" \
         --keep "data/microgalaxy/tools_to_keep"
\ No newline at end of file