Skip to content

Commit

Permalink
Fix filtering by categories
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut committed Oct 31, 2023
1 parent b806367 commit 307917b
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 6 deletions.
23 changes: 17 additions & 6 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,16 @@ def parse_tools(repo):
return tools


def format_list_column(col):
    """
    Format a column that could hold lists before exporting

    Every cell that is a list is replaced by a single string made of its
    items joined by ", ". Cells that are not lists (e.g. strings coming from
    an already exported file) are returned unchanged.

    :param col: pandas Series (a dataframe column) to format
    :return: pandas Series where list cells are comma-separated strings
    """
    # The type check must be done per cell: the column itself is a pandas
    # Series, never a ``list``, so testing the column would skip formatting.
    return col.apply(lambda x: ", ".join(str(i) for i in x) if isinstance(x, list) else x)


def export_tools(tools: list, output_fp: str) -> None:
"""
Export tool metadata to tsv output file
Expand All @@ -335,10 +345,10 @@ def export_tools(tools: list, output_fp: str) -> None:
:param output_fp: path to output file
"""
df = pd.DataFrame(tools)
df["ToolShed categories"] = df["ToolShed categories"].apply(lambda x: ", ".join([str(i) for i in x]))
df["EDAM operation"] = df["EDAM operation"].apply(lambda x: ", ".join([str(i) for i in x]))
df["EDAM topic"] = df["EDAM topic"].apply(lambda x: ", ".join([str(i) for i in x]))
df["Galaxy tool ids"] = df["Galaxy tool ids"].apply(lambda x: ", ".join([str(i) for i in x]))
df["ToolShed categories"] = format_list_column(df["ToolShed categories"])
df["EDAM operation"] = format_list_column(df["EDAM operation"])
df["EDAM topic"] = format_list_column(df["EDAM topic"])
df["Galaxy tool ids"] = format_list_column(df["Galaxy tool ids"])
df.to_csv(output_fp, sep="\t", index=False)


Expand All @@ -356,13 +366,14 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
# filter ToolShed categories and leave function if not in expected categories
if check_categories(tool["ToolShed categories"], ts_cat):
name = tool["Galaxy wrapper id"]
tool["Reviewed"] = tool.name in keep_tools or tool.name in excluded_tools
tool["Reviewed"] = name in keep_tools or name in excluded_tools
tool["To keep"] = None
if name in keep_tools:
tool["To keep"] = True
elif name in excluded_tools:
tool["To keep"] = False
filtered_tools.append(tool)
return filtered_tools


if __name__ == "__main__":
Expand Down Expand Up @@ -408,7 +419,7 @@ def filter_tools(tools, ts_cat, excluded_tools, keep_tools):
export_tools(tools, args.all_tools)
print()
elif args.command == "filtertools":
tools = pd.read_csv(Path(args.tools)).to_dict("records")
tools = pd.read_csv(Path(args.tools), sep="\t", keep_default_na=False).to_dict("records")
# get categories and tools to exclude
categories = read_file(args.categories)
excl_tools = read_file(args.exclude)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@ curl \
"https://docs.google.com/spreadsheets/d/1Nq_g-CPc8t_eC4M1NAS9XFJDflA7yE3b9hfSg3zu9L4/export?format=tsv&gid=672552331" \
-o "data/microgalaxy/tools_to_exclude"

mkdir -p 'results/microgalaxy'

python bin/extract_galaxy_tools.py \
filtertools \
--tools 'results/all_tools.csv' \
--filtered_tools 'results/microgalaxy/tools.csv' \
--categories "data/microgalaxy/categories" \
--exclude "data/microgalaxy/tools_to_exclude" \
--keep "data/microgalaxy/tools_to_keep"

0 comments on commit 307917b

Please sign in to comment.