Skip to content

Commit

Permalink
Export all tools to JSON (#105)
Browse files Browse the repository at this point in the history
  • Loading branch information
bebatut authored Jun 3, 2024
1 parent a4c7c5c commit 5646420
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 17 deletions.
5 changes: 4 additions & 1 deletion .github/workflows/fetch_all_tools.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,13 @@ jobs:
ref: main #pull latest code produced by job 1, not the revision that started the workflow (https://github.com/actions/checkout/issues/439)
- uses: actions/setup-python@v5
- name: Install requirement
run: python -m pip install -r requirements.txt
run: |
python -m pip install -r requirements.txt
apt-get install jq
- name: Merge all tools
run: | #merge files with only one header -> https://stackoverflow.com/questions/16890582/unixmerge-multiple-csv-files-with-same-header-by-keeping-the-header-of-the-firs
awk 'FNR==1 && NR!=1{next;}{print}' results/repositories*.list_tools.tsv > results/all_tools.tsv
jq -s '.' results/repositories*.list_tools.json > results/all_tools.json
- name: Wordcloud and interactive table
run: |
bash ./bin/extract_all_tools_downstream.sh
Expand Down
8 changes: 4 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -105,10 +105,10 @@ The script will generate a TSV file with each tool found in the list of GitHub r
4. Run the tool extractor script

```
$ python bin/extract_galaxy_tools.py filtertools \
--tools <Path to CSV file with all extracted tools> \
--ts_filtered_tools <Path to output TSV with tools filtered based on ToolShed category>
--filtered_tools <Path to output TSV with filtered tools based on ToolShed category and manual curation> \
$ python bin/extract_galaxy_tools.py filtertools \
--tools <Path to JSON file with all extracted tools> \
--ts-filtered-tools <Path to output TSV with tools filtered based on ToolShed category> \
--filtered-tools <Path to output TSV with filtered tools based on ToolShed category and manual curation> \
[--categories <Path to ToolShed category file>] \
[--status <Path to a TSV file with tool status - 3 columns: ToolShed ids of tool suites, Boolean with True to keep and False to exclude, Boolean with True if deprecated and False if not>]
```
Expand Down
3 changes: 2 additions & 1 deletion bin/extract_all_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ mkdir -p 'results/'
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools 'results/all_tools.tsv'
--all-tools 'results/all_tools.tsv' \
--all-tools-json 'results/all_tools.json'

python bin/create_interactive_table.py \
--table "results/all_tools.tsv" \
Expand Down
4 changes: 3 additions & 1 deletion bin/extract_all_tools_stepwise.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@

mkdir -p 'results/'

output="results/${1}_tools.tsv"
tsv_output="results/${1}_tools.tsv"
json_output="results/${1}_tools.json"

if [[ $1 =~ "01" ]]; then
python bin/extract_galaxy_tools.py \
extractools \
--api $GITHUB_API_KEY \
--all-tools $output \
--all-tools-json $json_output \
--planemo-repository-list $1
else
python bin/extract_galaxy_tools.py \
Expand Down
4 changes: 3 additions & 1 deletion bin/extract_all_tools_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,14 @@

# Test run of the tool extraction for one planemo repository list.
# Usage: extract_all_tools_test.sh <repository list>
# The first argument is used both to name the output files under results/
# and as the value of --planemo-repository-list.
# GITHUB_API_KEY must be set in the environment.

mkdir -p 'results/'

# Both outputs share the same prefix derived from the first argument.
tsv_output="results/${1}_tools.tsv"
json_output="results/${1}_tools.json"

# Expansions are quoted so that empty values or values containing spaces
# are still passed as single arguments.
python bin/extract_galaxy_tools.py \
    extractools \
    --api "$GITHUB_API_KEY" \
    --all-tools "$tsv_output" \
    --all-tools-json "$json_output" \
    --planemo-repository-list "$1" \
    --test

30 changes: 22 additions & 8 deletions bin/extract_galaxy_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import argparse
import base64
import json
import sys
import time
import xml.etree.ElementTree as et
Expand Down Expand Up @@ -246,7 +247,7 @@ def check_categories(ts_categories: str, ts_cat: List[str]) -> bool:
return True
if not ts_categories:
return False
ts_cats = ts_categories.split(", ")
ts_cats = ts_categories
return bool(set(ts_cat) & set(ts_cats))


Expand Down Expand Up @@ -549,11 +550,22 @@ def format_list_column(col: pd.Series) -> pd.Series:
return col.apply(lambda x: ", ".join(str(i) for i in x))


def export_tools(
def export_tools_to_json(tools: List[Dict], output_fp: str) -> None:
"""
Export tool metadata to JSON output file

:param tools: list of dictionaries with tool metadata
:param output_fp: path to the JSON output file (opened in write mode)
"""
with Path(output_fp).open("w") as f:
# default=list: any value json cannot serialize natively is converted with list() first
json.dump(tools, f, default=list, indent=4)


def export_tools_to_tsv(
tools: List[Dict], output_fp: str, format_list_col: bool = False, add_usage_stats: bool = False
) -> None:
"""
Export tool metadata to tsv output file
Export tool metadata to TSV output file
:param tools: dictionary with tools
:param output_fp: path to output file
Expand Down Expand Up @@ -646,7 +658,7 @@ def filter_tools(
"--tools",
"-i",
required=True,
help="Filepath to TSV with all extracted tools, generated by extractools command",
help="Filepath to JSON with all extracted tools, generated by extractools command",
)
filtertools.add_argument(
"--ts-filtered-tools",
Expand Down Expand Up @@ -696,14 +708,16 @@ def filter_tools(
f"Error while extracting tools from repo {r}: {e}",
file=sys.stderr,
)
export_tools(tools, args.all_tools, format_list_col=True, add_usage_stats=True)
export_tools_to_json(tools, args.all_tools_json)
export_tools_to_tsv(tools, args.all_tools, format_list_col=True, add_usage_stats=True)

elif args.command == "filtertools":
tools = pd.read_csv(args.tools, sep="\t", keep_default_na=False).to_dict("records")
with Path(args.tools).open() as f:
tools = json.load(f)
# get categories and tools to exclude
categories = read_file(args.categories)
status = pd.read_csv(args.status, sep="\t", index_col=0, header=None).to_dict("index")
# filter tool lists
ts_filtered_tools, filtered_tools = filter_tools(tools, categories, status)
export_tools(ts_filtered_tools, args.ts_filtered_tools)
export_tools(filtered_tools, args.filtered_tools)
export_tools_to_tsv(ts_filtered_tools, args.ts_filtered_tools, format_list_col=True)
export_tools_to_tsv(filtered_tools, args.filtered_tools, format_list_col=True)
2 changes: 1 addition & 1 deletion bin/get_community_tools.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ for com_data_fp in data/communities/* ; do

python bin/extract_galaxy_tools.py \
filtertools \
--tools "results/all_tools.tsv" \
--tools "results/all_tools.json" \
--ts-filtered-tools "results/$community/tools_filtered_by_ts_categories.tsv" \
--filtered-tools "results/$community/tools.tsv" \
--categories "data/communities/$community/categories" \
Expand Down

0 comments on commit 5646420

Please sign in to comment.