From 0d95bdc702ca3c75b0034d6a9b7066f4f2754532 Mon Sep 17 00:00:00 2001 From: paulzierep Date: Tue, 8 Oct 2024 14:02:26 +0200 Subject: [PATCH] add this to metadata (#187) --- communities/all/resources/test_tools.json | 11 +++++------ communities/all/resources/test_tools.tsv | 13 +++++++------ sources/bin/extract_galaxy_tools.py | 4 ++++ sources/bin/shared.py | 21 +++++++++++++++++++++ 4 files changed, 37 insertions(+), 12 deletions(-) diff --git a/communities/all/resources/test_tools.json b/communities/all/resources/test_tools.json index fd91f160..0bad9a6f 100644 --- a/communities/all/resources/test_tools.json +++ b/communities/all/resources/test_tools.json @@ -29,6 +29,7 @@ "Imaging" ], "ToolShed id": "2d_auto_threshold", + "Date of first commit of the suite": "2024-03-01", "Galaxy wrapper owner": "imgteam", "Galaxy wrapper source": "https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/", "Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold", @@ -122,6 +123,7 @@ "Sequence Analysis" ], "ToolShed id": "abritamr", + "Date of first commit of the suite": "2024-03-01", "Galaxy wrapper owner": "iuc", "Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr", "Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr", @@ -213,6 +215,7 @@ "Metagenomics" ], "ToolShed id": "aldex2", + "Date of first commit of the suite": "2024-03-01", "Galaxy wrapper owner": "iuc", "Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2", "Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2", @@ -310,6 +313,7 @@ "Sequence Analysis" ], "ToolShed id": "fastp", + "Date of first commit of the suite": "2024-03-11", "Galaxy wrapper owner": "iuc", "Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastp", "Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/fastp", @@ -398,12 +402,6 @@ "metaplasmidspades", "coronaspades", "rnaspades", - "metaspades", - "spades", - "rnaviralspades", - "metaviralspades", - "biosyntheticspades", - "plasmidspades" ], "biii": null, "bio.tool name": "SPAdes", @@ -422,6 +420,7 @@ "Metagenomics" ], "ToolShed id": "spades", + "Date of first commit of the suite": "2024-03-12", "Galaxy wrapper owner": "iuc", "Galaxy wrapper source": "https://github.com/galaxyproject/tools-iuc/tree/master/tools/spades", "Galaxy wrapper parsed folder": "https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/spades", diff --git a/communities/all/resources/test_tools.tsv b/communities/all/resources/test_tools.tsv index e9e65d29..f60a80e2 100644 --- a/communities/all/resources/test_tools.tsv +++ b/communities/all/resources/test_tools.tsv @@ -1,6 +1,7 @@ -Galaxy wrapper id Galaxy tool ids Description bio.tool id bio.tool ids biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper parsed folder Galaxy wrapper version Conda id Conda version EDAM operation (no superclasses) EDAM topic (no superclasses) Available on UseGalaxy.org (Main) Available on UseGalaxy.org.au Available on UseGalaxy.eu Available on UseGalaxy.fr Tools available on UseGalaxy.org (Main) Tools available on UseGalaxy.org.au Tools available on UseGalaxy.eu Tools available on UseGalaxy.fr Tools available on APOSTL Tools available on BF2I-MAP Tools available on BioBix Tools available on CIRM-CFBP Tools available on Center for Phage Technology (CPT) Tools available on ChemFlow Tools available on Coloc-stats Tools available on CoralSNP Tools available on CropGalaxy Tools available on Dintor Tools available on FreeBioinfo Tools available on GASLINI Tools available on Galaxy@AuBi Tools available on Galaxy@Pasteur Tools available on GalaxyTrakr Tools available on Genomic Hyperbrowser Tools available on GigaGalaxy Tools available on HyPhy HIV NGS Tools Tools available on IPK Galaxy Blast Suite Tools available on ImmPort Galaxy Tools available on InteractoMIX Tools available on MISSISSIPPI Tools available on Mandoiu Lab Tools available on MiModD NacreousMap Tools available on Oqtans Tools available on Palfinder Tools available on PepSimili Tools available on PhagePromotor Tools available on UseGalaxy.be Tools available on UseGalaxy.cz Tools available on UseGalaxy.no Tools available on Viral Variant Visualizer (VVV) No. of tool users (5 years) (usegalaxy.eu) No. of tool users (all time) (usegalaxy.eu) Tool usage (5 years) (usegalaxy.eu) Tool usage (all time) (usegalaxy.eu) No. of tool users (5 years) (usegalaxy.org) No. of tool users (all time) (usegalaxy.org) Tool usage (5 years) (usegalaxy.org) Tool usage (all time) (usegalaxy.org) No. of tool users (5 years) (usegalaxy.org.au) No. of tool users (all time) (usegalaxy.org.au) Tool usage (5 years) (usegalaxy.org.au) Tool usage (all time) (usegalaxy.org.au) No. of tool users (5 years) - all main servers No. of tool users (all time) - all main servers Tool usage (5 years) - all main servers Tool usage (all time) - all main servers -2d_auto_threshold ip_threshold Automatic thresholding scikit-image scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold 0.0.6-2 scikit-image Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1434 6746 120 122 0 0 0 0 305 305 11 11 1739 7051 131 133 -abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes To update https://zenodo.org/record/7370628 Sequence Analysis abritamr iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr 1.0.14 abritamr 1.0.19 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1139 1139 109 109 0 0 0 0 0 0 0 0 1139 1139 109 109 -aldex2 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 Statistical inference Gene expression, Statistics and probability 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 262 262 36 36 0 0 0 0 0 0 0 0 262 262 36 36 -fastp fastp Fast all-in-one preprocessing for FASTQ files fastp fastp fastp A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. Sequencing quality control, Sequence contamination filtering Sequence analysis, Probes and primers To update https://github.com/OpenGene/fastp Sequence Analysis fastp iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastp https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/fastp fastp 0.23.4 Sequence contamination filtering Probes and primers 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1126086 1127111 6847 6909 422259 422259 10722 10722 76462 76462 2242 2242 1624807 1625832 19811 19873 -spades spades_biosyntheticspades, spades_coronaspades, spades_metaplasmidspades, metaspades, spades_metaviralspades, spades_plasmidspades, rnaspades, spades_rnaviralspades, spades SPAdes is an assembly toolkit containing various assembly pipelines. It implements the following 4 stages: assembly graph construction, k-bimer adjustment, construction of paired assembly graph and contig construction. spades metaplasmidspades, coronaspades, rnaspades, metaspades, spades, rnaviralspades, metaviralspades, biosyntheticspades, plasmidspades SPAdes St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. SPAdes 3.9 works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. Additional contigs can be provided and can be used as long reads. Genome assembly Sequence assembly To update https://github.com/ablab/spades Assembly, RNA, Metagenomics spades iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/spades https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/spades 3.15.5 spades 4.0.0 Genome assembly Sequence assembly 9 9 9 0 9 9 9 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 8 0 0 2 0 0 0 3 0 0 0 0 0 0 3 9 3 0 82716 87113 8209 8526 120471 120475 14787 14790 54067 61541 5817 6653 257254 269129 28813 29969 +Galaxy wrapper id Galaxy tool ids Description bio.tool id bio.tool ids biii bio.tool name bio.tool description EDAM operation EDAM topic Status Source ToolShed categories ToolShed id Date of first commit of the suite Galaxy wrapper owner Galaxy wrapper source Galaxy wrapper parsed folder Galaxy wrapper version Conda id Conda version EDAM operation (no superclasses) EDAM topic (no superclasses) Available on UseGalaxy.org (Main) Available on UseGalaxy.org.au Available on UseGalaxy.eu Available on UseGalaxy.fr Tools available on UseGalaxy.org (Main) Tools available on UseGalaxy.org.au Tools available on UseGalaxy.eu Tools available on UseGalaxy.fr Tools available on APOSTL Tools available on BF2I-MAP Tools available on BioBix Tools available on CIRM-CFBP Tools available on Center for Phage Technology (CPT) Tools available on ChemFlow Tools available on Coloc-stats Tools available on CoralSNP Tools available on CropGalaxy Tools available on Dintor Tools available on FreeBioinfo Tools available on GASLINI Tools available on Galaxy@AuBi Tools available on Galaxy@Pasteur Tools available on GalaxyTrakr Tools available on Genomic Hyperbrowser Tools available on GigaGalaxy Tools available on HyPhy HIV NGS Tools Tools available on IPK Galaxy Blast Suite Tools available on ImmPort Galaxy Tools available on InteractoMIX Tools available on MISSISSIPPI Tools available on Mandoiu Lab Tools available on MiModD NacreousMap Tools available on Oqtans Tools available on Palfinder Tools available on PepSimili Tools available on PhagePromotor Tools available on UseGalaxy.be Tools available on UseGalaxy.cz Tools available on UseGalaxy.no Tools available on Viral Variant Visualizer (VVV) No. of tool users (5 years) (usegalaxy.eu) No. of tool users (all time) (usegalaxy.eu) Tool usage (5 years) (usegalaxy.eu) Tool usage (all time) (usegalaxy.eu) No. of tool users (5 years) (usegalaxy.org) No. of tool users (all time) (usegalaxy.org) Tool usage (5 years) (usegalaxy.org) Tool usage (all time) (usegalaxy.org) No. of tool users (5 years) (usegalaxy.org.au) No. of tool users (all time) (usegalaxy.org.au) Tool usage (5 years) (usegalaxy.org.au) Tool usage (all time) (usegalaxy.org.au) +2d_auto_threshold ip_threshold Automatic thresholding scikit-image scikit-image scikit-image scikit-image Scikit-image contains image processing algorithms for SciPy, including IO, morphology, filtering, warping, color manipulation, object detection, etc. Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language To update https://github.com/bmcv Imaging 2d_auto_threshold 2024-03-01 imgteam https://github.com/BMCV/galaxy-image-analysis/tree/master/tools/2d_auto_threshold/ https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/2d_auto_threshold 0.0.6-2 scikit-image Image analysis, Image annotation, Visualisation, Data handling Imaging, Software engineering, Literature and language 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 1434 6746 120 122 0 0 0 0 305 305 11 11 +abritamr abritamr A pipeline for running AMRfinderPlus and collating results into functional classes To update https://zenodo.org/record/7370628 Sequence Analysis abritamr 2024-03-01 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/abritamr https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/abritamr 1.0.14 abritamr 1.0.19 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1139 1139 109 109 0 0 0 0 0 0 0 0 +aldex2 aldex2 Performs analysis Of differential abundance taking sample variation into account aldex2 aldex2 ALDEx2 A differential abundance analysis for the comparison of two or more conditions. It uses a Dirichlet-multinomial model to infer abundance from counts, that has been optimized for three or more experimental replicates. Infers sampling variation and calculates the expected FDR given the biological and sampling variation using the Wilcox rank test and Welches t-test, or the glm and Kruskal Wallis tests. Reports both P and fdr values calculated by the Benjamini Hochberg correction. Statistical inference Gene expression, Statistics and probability To update https://github.com/ggloor/ALDEx_bioc Metagenomics aldex2 2024-03-01 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/aldex2 https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/aldex2 1.26.0 bioconductor-aldex2 1.34.0 Statistical inference Gene expression, Statistics and probability 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 262 262 36 36 0 0 0 0 0 0 0 0 +fastp fastp Fast all-in-one preprocessing for FASTQ files fastp fastp fastp A tool designed to provide fast all-in-one preprocessing for FastQ files. This tool is developed in C++ with multithreading supported to afford high performance. Sequencing quality control, Sequence contamination filtering Sequence analysis, Probes and primers To update https://github.com/OpenGene/fastp Sequence Analysis fastp 2024-03-11 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/fastp https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/fastp fastp 0.23.4 Sequence contamination filtering Probes and primers 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 1126086 1127111 6847 6909 422259 422259 10722 10722 76462 76462 2242 2242 +spades spades_biosyntheticspades, spades_coronaspades, spades_metaplasmidspades, metaspades, spades_metaviralspades, spades_plasmidspades, rnaspades, spades_rnaviralspades, spades SPAdes is an assembly toolkit containing various assembly pipelines. It implements the following 4 stages: assembly graph construction, k-bimer adjustment, construction of paired assembly graph and contig construction. spades metaplasmidspades, rnaspades, metaviralspades, spades, rnaviralspades, plasmidspades, coronaspades, biosyntheticspades, metaspades SPAdes St. Petersburg genome assembler – is intended for both standard isolates and single-cell MDA bacteria assemblies. SPAdes 3.9 works with Illumina or IonTorrent reads and is capable of providing hybrid assemblies using PacBio, Oxford Nanopore and Sanger reads. Additional contigs can be provided and can be used as long reads. Genome assembly Sequence assembly To update https://github.com/ablab/spades Assembly, RNA, Metagenomics spades 2024-03-12 iuc https://github.com/galaxyproject/tools-iuc/tree/master/tools/spades https://github.com/paulzierep/Galaxy-Tool-Metadata-Extractor-Test-Wrapper/tree/main/tools/spades 3.15.5 spades 4.0.0 Genome assembly Sequence assembly 9 9 9 0 9 9 9 0 0 0 0 0 0 0 0 0 0 0 0 0 1 3 8 0 0 2 0 0 0 3 0 0 0 0 0 0 3 9 3 0 82716 87113 8209 8526 120471 120475 14787 14790 54067 61541 5817 6653 + diff --git a/sources/bin/extract_galaxy_tools.py b/sources/bin/extract_galaxy_tools.py index 859fbc60..1ba62496 100644 --- a/sources/bin/extract_galaxy_tools.py +++ b/sources/bin/extract_galaxy_tools.py @@ -288,6 +288,7 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, "Source": None, "ToolShed categories": [], "ToolShed id": None, + "Date of first commit of the suite": None, "Galaxy wrapper owner": None, "Galaxy wrapper source": None, # this is what it written in the .shed.yml "Galaxy wrapper parsed folder": None, # this is the actual parsed file @@ -325,6 +326,9 @@ def get_tool_metadata(tool: ContentFile, repo: Repository) -> Optional[Dict[str, # store the github location where the folder was parsed metadata["Galaxy wrapper parsed folder"] = tool.html_url + # get the first commit date + metadata["Date of first commit of the suite"] = shared.get_first_commit_for_folder(tool, repo) + # find and parse macro file for file in file_list: if "macro" in file.name and file.name.endswith("xml"): diff --git a/sources/bin/shared.py b/sources/bin/shared.py index f0aefc46..eb8a6b10 100644 --- a/sources/bin/shared.py +++ b/sources/bin/shared.py @@ -11,6 +11,27 @@ import pandas as pd import requests +from github.ContentFile import ContentFile +from github.Repository import Repository + + +def get_first_commit_for_folder(tool: ContentFile, repo: Repository) -> str: + """ + Get the date of the first commit in the tool folder + + :param commit_date: date of the first commit + """ + + # Get commits related to the specific folder + commits = repo.get_commits(path=tool.path) + + # Get the last commit in the history (which is the first commit made to the folder) + first_commit = commits.reversed[0] + + # Extract relevant information about the first commit + commit_date = first_commit.commit.author.date.date() + + return str(commit_date) def format_list_column(col: pd.Series) -> pd.Series: