From 232f88ec8a27001977dca9e22111b5bdc3be4f84 Mon Sep 17 00:00:00 2001 From: Brandon Duane Walker Date: Mon, 11 Mar 2024 17:00:20 -0400 Subject: [PATCH] first commit --- .../{{cookiecutter.package_name}}.py | 1 + .../torchdrug_download-tool/.bumpversion.cfg | 29 ++++ .../torchdrug_download-tool/.dockerignore | 4 + .../torchdrug_download-tool/.gitignore | 1 + .../torchdrug_download-tool/CHANGELOG.md | 5 + .../torchdrug_download-tool/Dockerfile | 24 +++ .../torchdrug_download-tool/README.md | 28 ++++ .../torchdrug_download-tool/VERSION | 1 + .../torchdrug_download-tool/ict.yml | 21 +++ .../torchdrug_download-tool/pyproject.toml | 46 ++++++ .../mm/utils/torchdrug_download/__init__.py | 7 + .../mm/utils/torchdrug_download/__main__.py | 152 ++++++++++++++++++ .../torchdrug_download/torchdrug_download.py | 19 +++ .../torchdrug_download-tool/tests/__init__.py | 1 + .../tests/test_torchdrug.py | 14 ++ .../torchdrug_download-tool/torch_drug.cwl | 42 +++++ 16 files changed, 395 insertions(+) create mode 100644 utils/mm-python-template/{{cookiecutter.container_name}}/src/{{cookiecutter.package_folders}}/{{cookiecutter.package_name}}.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/.bumpversion.cfg create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/.dockerignore create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/.gitignore create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/CHANGELOG.md create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/Dockerfile create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/README.md create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/VERSION create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/ict.yml create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/pyproject.toml create mode 100644 
utils/pre-process/data-download/torchdrug_download-tool/src/polus/mm/utils/torchdrug_download/__init__.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/src/polus/mm/utils/torchdrug_download/__main__.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/src/polus/mm/utils/torchdrug_download/torchdrug_download.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/tests/__init__.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/tests/test_torchdrug.py create mode 100644 utils/pre-process/data-download/torchdrug_download-tool/torch_drug.cwl diff --git a/utils/mm-python-template/{{cookiecutter.container_name}}/src/{{cookiecutter.package_folders}}/{{cookiecutter.package_name}}.py b/utils/mm-python-template/{{cookiecutter.container_name}}/src/{{cookiecutter.package_folders}}/{{cookiecutter.package_name}}.py new file mode 100644 index 00000000..2ed6c179 --- /dev/null +++ b/utils/mm-python-template/{{cookiecutter.container_name}}/src/{{cookiecutter.package_folders}}/{{cookiecutter.package_name}}.py @@ -0,0 +1 @@ +"""{{ cookiecutter.plugin_name }}.""" diff --git a/utils/pre-process/data-download/torchdrug_download-tool/.bumpversion.cfg b/utils/pre-process/data-download/torchdrug_download-tool/.bumpversion.cfg new file mode 100644 index 00000000..f3363762 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = False +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? 
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/torchdrug_download/__init__.py] diff --git a/utils/pre-process/data-download/torchdrug_download-tool/.dockerignore b/utils/pre-process/data-download/torchdrug_download-tool/.dockerignore new file mode 100644 index 00000000..7c603f81 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/pre-process/data-download/torchdrug_download-tool/.gitignore b/utils/pre-process/data-download/torchdrug_download-tool/.gitignore new file mode 100644 index 00000000..c04bc49f --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/.gitignore @@ -0,0 +1 @@ +poetry.lock diff --git a/utils/pre-process/data-download/torchdrug_download-tool/CHANGELOG.md b/utils/pre-process/data-download/torchdrug_download-tool/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. diff --git a/utils/pre-process/data-download/torchdrug_download-tool/Dockerfile b/utils/pre-process/data-download/torchdrug_download-tool/Dockerfile new file mode 100644 index 00000000..4f2ed038 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/Dockerfile @@ -0,0 +1,24 @@ +# docker build -f Dockerfile -t mrbrandonwalker/torch_drug . 
+ +FROM condaforge/mambaforge + +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +RUN mkdir -p ${EXEC_DIR} + +# Work directory defined in the base container +WORKDIR ${EXEC_DIR} + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} +# need this here because poetry install needs the src directory +COPY src ${EXEC_DIR}/src + +# Install needed packages here +RUN pip3 install --upgrade pip +RUN pip3 install poetry +RUN poetry install + +RUN pip3 install ${EXEC_DIR} --no-cache-dir diff --git a/utils/pre-process/data-download/torchdrug_download-tool/README.md b/utils/pre-process/data-download/torchdrug_download-tool/README.md new file mode 100644 index 00000000..a7d54d6d --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/README.md @@ -0,0 +1,28 @@ +# torchdrug (0.1.0) + +Access datasets and models from TorchDrug + +## Reading inputs/outputs from .cwl files +This adds inputs/outputs from .cwl files into cookiecutter.json +`python read_cwl_inputs_outputs.py path_to_cwl_file.cwl` + +## Modifying template files +To dynamically add inputs/outputs from cookiecutter.json to README.MD, __main__.py and plugin_package function +`python modify_base_template.py` + +## Building + +To build the Docker image for the conversion plugin, run `./build-docker.sh`. + +## Install WIPP Plugin + +If WIPP is running, navigate to the plugins page and add a new plugin. Paste the +contents of `plugin.json` into the pop-up window and submit. +## Options + +This plugin takes 1 input argument and 1 output argument: + +| Name | Description | I/O | Type | Default | +|---------------|-------------------------|--------|--------|---------| +| dataset | Input dataset to extract | Input | string | string | +| outdir | Output collection. 
| Output | collection | collection | diff --git a/utils/pre-process/data-download/torchdrug_download-tool/VERSION b/utils/pre-process/data-download/torchdrug_download-tool/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/utils/pre-process/data-download/torchdrug_download-tool/ict.yml b/utils/pre-process/data-download/torchdrug_download-tool/ict.yml new file mode 100644 index 00000000..bf403dc0 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/ict.yml @@ -0,0 +1,21 @@ +specVersion: 0.1.0 +name: labshare/torchdrug-download +version: 0.1.0 +container: polusai/torchdrug-tool:0.1.0 +entrypoint: "" +title: torchdrug_download +description: Access datasets and models from TorchDrug +author: Brandon Walker (brandon.walker@axleinfo.com) +repository: https://github.com/labshare/mmtools +documentation: https://ncats.nih.gov/preclinical/core/informatics +citation: "" +inputs: +- name: dataset + required: true + description: Input dataset to extract + type: string +outputs: +- name: outdir + required: false + description: Output collection. 
+ type: collection diff --git a/utils/pre-process/data-download/torchdrug_download-tool/pyproject.toml b/utils/pre-process/data-download/torchdrug_download-tool/pyproject.toml new file mode 100644 index 00000000..0b0ff624 --- /dev/null +++ b/utils/pre-process/data-download/torchdrug_download-tool/pyproject.toml @@ -0,0 +1,46 @@ +[tool.poetry] +name = "polus-mm-utils-torchdrug_download" +version = "0.1.0" +description = "Access datasets and models from TorchDrug" +authors = ["Data Scientist "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.8,<3.11" +torch = { version = "1.12.1", source="torch"} +torchaudio = { version = "0.12.1", source="torch"} +torchvision = { version = "0.13.1", source="torch"} +torch-cluster = { version = "1.6.0", source="pyg"} +torch-scatter = { version = "2.0.9", source="pyg"} +torchdrug = "0.2.1" +rdkit = "2023.9.5" +typer = "^0.7.0" + +[[tool.poetry.source]] +name = "torch" +url = "https://download.pytorch.org/whl/cu116" +secondary = true + +[[tool.poetry.source]] +name = "pyg" +url = "https://data.pyg.org/whl/torch-1.12.1+cu116.html" +secondary = true + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." 
class DatabaseEnum(str, Enum):
    """Dataset names accepted by the ``--dataset`` command-line option.

    Each member's value is the same string as its name. The class also
    derives from ``str``, so a member compares equal to that plain string
    and can be used directly as a key into a mapping keyed by dataset name.
    """

    # NOTE: declaration order is the iteration order of the enum, so keep new
    # entries appended rather than re-sorting the list.
    ClinTox = "ClinTox"
    PDBBind = "PDBBind"
    FB15k = "FB15k"
    FB15k237 = "FB15k237"
    WN18 = "WN18"
    WN18RR = "WN18RR"
    Hetionet = "Hetionet"
    BACE = "BACE"
    BBBP = "BBBP"
    CEP = "CEP"
    ChEMBLFiltered = "ChEMBLFiltered"
    Delaney = "Delaney"
    FreeSolv = "FreeSolv"
    HIV = "HIV"
    Lipophilicity = "Lipophilicity"
    MUV = "MUV"
    Malaria = "Malaria"
    OPV = "OPV"
    QM8 = "QM8"
    QM9 = "QM9"
    SIDER = "SIDER"
    Tox21 = "Tox21"
    ToxCast = "ToxCast"
    ZINC250k = "ZINC250k"
    ZINC2m = "ZINC2m"
    MOSES = "MOSES"
    PCQM4M = "PCQM4M"
    BetaLactamase = "BetaLactamase"
    Fluorescence = "Fluorescence"
    Stability = "Stability"
    Solubility = "Solubility"
    BinaryLocalization = "BinaryLocalization"
    SubcellularLocalization = "SubcellularLocalization"
    EnzymeCommission = "EnzymeCommission"
    GeneOntology = "GeneOntology"
    AlphaFoldDB = "AlphaFoldDB"
    Fold = "Fold"
    SecondaryStructure = "SecondaryStructure"
    ProteinNet = "ProteinNet"
    HumanPPI = "HumanPPI"
    YeastPPI = "YeastPPI"
    PPIAffinity = "PPIAffinity"
    BindingDB = "BindingDB"
    USPTO50k = "USPTO50k"
    Cora = "Cora"
    PubMed = "PubMed"
def _accepts_lazy(dataset_class: type) -> bool:
    """Return True when ``dataset_class`` can be called with ``lazy=...``."""
    import inspect  # local import: this module only imports pathlib at file level

    try:
        parameters = inspect.signature(dataset_class).parameters.values()
    except (TypeError, ValueError):
        # Signature cannot be introspected: keep the historical behaviour of
        # always passing ``lazy``.
        return True

    by_keyword = (
        inspect.Parameter.POSITIONAL_OR_KEYWORD,
        inspect.Parameter.KEYWORD_ONLY,
    )
    return any(
        param.kind is inspect.Parameter.VAR_KEYWORD
        or (param.name == "lazy" and param.kind in by_keyword)
        for param in parameters
    )


def torchdrug_download(dataset: str, outdir: Path, dataset_mapping: dict) -> None:
    """Instantiate the dataset class registered under ``dataset``.

    The class is looked up in ``dataset_mapping`` and called with ``outdir``
    as its only positional argument. The resulting instance is discarded: the
    call is made purely for the constructor's side effects.
    NOTE(review): presumably the TorchDrug constructors fetch their data files
    into ``outdir`` (the accompanying test expects ``tox21.csv`` to appear for
    ``Tox21``) -- confirm against the TorchDrug documentation.

    Args:
        dataset: Name of the dataset; must be a key of ``dataset_mapping``.
        outdir: Directory handed to the dataset constructor.
        dataset_mapping: Mapping of dataset name to dataset class.

    Returns:
        None

    Raises:
        KeyError: If ``dataset`` is not a key of ``dataset_mapping``.
    """
    try:
        selected_dataset_class = dataset_mapping[dataset]
    except KeyError:
        supported = ", ".join(sorted(str(name) for name in dataset_mapping))
        msg = f"Unsupported dataset: {dataset}. Supported datasets: {supported}"
        # Same exception type as the bare lookup raised, with a usable message.
        raise KeyError(msg) from None

    options = {}
    if _accepts_lazy(selected_dataset_class):
        # lazy = False causes issues with PDBBind dataset such as invalid sequence
        options["lazy"] = True

    # Constructors that take neither ``lazy`` nor ``**kwargs`` are called with
    # the path alone instead of failing with a TypeError.
    selected_dataset_class(outdir, **options)
dockerPull: mrbrandonwalker/torch_drug + +inputs: + + dataset: + label: Input dataset to extract + doc: |- + Input dataset to extract + type: string + inputBinding: + prefix: --dataset + +outputs: + + output_csv: + label: Path to the output CSV file + doc: |- + Path to the output CSV file + type: Directory + outputBinding: + glob: "*.csv" + + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl