Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first commit #133

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""{{ cookiecutter.plugin_name }}."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[bumpversion]
current_version = 0.1.0
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
serialize =
{major}.{minor}.{patch}-{release}{dev}
{major}.{minor}.{patch}

[bumpversion:part:release]
optional_value = _
first_value = dev
values =
dev
_

[bumpversion:part:dev]

[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/torchdrug_download/__init__.py]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.venv
out
tests
__pycache__
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
poetry.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
49 changes: 49 additions & 0 deletions utils/pre-process/data-download/torchdrug_download-tool/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# docker build -f Dockerfile -t polusai/torchdrug-tool:0.1.0 .

FROM condaforge/mambaforge

ENV EXEC_DIR="/opt/executables"
ENV POLUS_LOG="INFO"
RUN mkdir -p ${EXEC_DIR}

# Install g++ and other essential packages
# needed to install torch
RUN apt-get update && apt-get install -y \
g++ \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*

# Work directory (EXEC_DIR is defined above in this Dockerfile)
WORKDIR ${EXEC_DIR}

COPY pyproject.toml ${EXEC_DIR}
COPY VERSION ${EXEC_DIR}
COPY README.md ${EXEC_DIR}
COPY CHANGELOG.md ${EXEC_DIR}
# need this here because poetry install needs the src directory
COPY src ${EXEC_DIR}/src

# Install needed packages here
RUN pip3 install --upgrade pip
RUN pip3 install poetry
RUN poetry install
# torch must be installed before the final `pip3 install` below; otherwise building torch-cluster fails with:
#0 2.132 Collecting torch-cluster==1.6.0 (from polus-mm-utils-torchdrug_download==0.1.0)
#0 2.144 Downloading torch_cluster-1.6.0.tar.gz (43 kB)
#0 2.150 ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 43.4/43.4 kB 9.0 MB/s eta 0:00:00
#0 2.164 Preparing metadata (setup.py): started
#0 2.329 Preparing metadata (setup.py): finished with status 'error'
#0 2.333 error: subprocess-exited-with-error
#0 2.333
#0 2.333 × python setup.py egg_info did not run successfully.
#0 2.333 │ exit code: 1
#0 2.333 ╰─> [6 lines of output]
#0 2.333 Traceback (most recent call last):
#0 2.333 File "<string>", line 2, in <module>
#0 2.333 File "<pip-setuptools-caller>", line 34, in <module>
#0 2.333 File "/tmp/pip-install-fdght_zq/torch-cluster_10a1a1bbf63e4e1ca1e035b9639f5253/setup.py", line 8, in <module>
#0 2.333 import torch
#0 2.333 ModuleNotFoundError: No module named 'torch'
RUN pip3 install torch

RUN pip3 install ${EXEC_DIR} --no-cache-dir
28 changes: 28 additions & 0 deletions utils/pre-process/data-download/torchdrug_download-tool/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# torchdrug (0.1.0)

Access datasets and models from TorchDrug

## Reading inputs/outputs from .cwl files
This adds inputs/outputs from .cwl files into cookiecutter.json
`python read_cwl_inputs_outputs.py path_to_cwl_file.cwl`

## Modifying template files
To dynamically add the inputs/outputs from `cookiecutter.json` to `README.md`, `__main__.py` and the plugin package function, run:
`python modify_base_template.py`

## Building

To build the Docker image for the torchdrug_download plugin, run `./build-docker.sh`.

## Install WIPP Plugin

If WIPP is running, navigate to the plugins page and add a new plugin. Paste the
contents of `plugin.json` into the pop-up window and submit.
## Options

This plugin takes 1 input argument and 1 output argument:

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| dataset | Input dataset to extract | Input | string | string |
| outdir | Output collection. | Output | collection | collection |
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
21 changes: 21 additions & 0 deletions utils/pre-process/data-download/torchdrug_download-tool/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
specVersion: 0.1.0
name: labshare/torchdrug-download
version: 0.1.0
container: polusai/torchdrug-tool:0.1.0
entrypoint: ""
title: torchdrug_download
description: Access datasets and models from TorchDrug
author: Brandon Walker ([email protected])
repository: https://github.com/labshare/mmtools
documentation: https://ncats.nih.gov/preclinical/core/informatics
citation: ""
inputs:
- name: dataset
required: true
description: Input dataset to extract
type: string
outputs:
- name: outdir
required: false
description: Output collection.
type: collection
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
[tool.poetry]
name = "polus-mm-utils-torchdrug_download"
version = "0.1.0"
description = "Access datasets and models from TorchDrug"
authors = ["Data Scientist <[email protected]>"]
readme = "README.md"
packages = [{include = "polus", from = "src"}]

[tool.poetry.dependencies]
python = ">=3.8,<3.11"
torch = { version = "1.12.1", source="torch"}
torchaudio = { version = "0.12.1", source="torch"}
torchvision = { version = "0.13.1", source="torch"}
torch-cluster = { version = "1.6.0", source="pyg"}
torch-scatter = { version = "2.0.9", source="pyg"}
torchdrug = "0.2.1"
rdkit = "2023.9.5"
typer = "^0.7.0"

[[tool.poetry.source]]
name = "torch"
url = "https://download.pytorch.org/whl/cu116"
secondary = true

[[tool.poetry.source]]
name = "pyg"
url = "https://data.pyg.org/whl/torch-1.12.1+cu116.html"
secondary = true

[tool.poetry.group.dev.dependencies]
bump2version = "^1.0.1"
pytest = "^7.4"
pytest-sugar = "^0.9.6"
pre-commit = "^3.2.1"
black = "^23.3.0"
mypy = "^1.1.1"
ruff = "^0.0.270"

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
pythonpath = [
"."
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""torchdrug."""

__version__ = "0.1.0"

from polus.mm.utils.torchdrug_download.torchdrug_download import ( # noqa # pylint: disable=unused-import
torchdrug_download,
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
"""Package entrypoint for the torchdrug package."""

# Base packages
import logging
from enum import Enum
from os import environ
from pathlib import Path

import typer
from polus.mm.utils.torchdrug_download.torchdrug_download import torchdrug_download
from torchdrug import datasets

logging.basicConfig(
    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
# Resolve the log level from the POLUS_LOG environment variable. The name is
# upper-cased so that e.g. "debug" works, and anything that does not resolve
# to a numeric logging level falls back to INFO instead of crashing at import.
POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO").upper(), None)
if not isinstance(POLUS_LOG, int):
    POLUS_LOG = logging.INFO
# No trailing dot: a name ending in "." would create an empty-named child
# logger underneath the package logger.
logger = logging.getLogger("polus.mm.utils.torchdrug_download")
logger.setLevel(POLUS_LOG)

# Typer application object; `main` below is registered on it via `@app.command()`.
app = typer.Typer(help="torchdrug_download.")


class DatabaseEnum(str, Enum):
    """Dataset names accepted by the ``--dataset`` command-line option.

    Every member's value is identical to its name. ``main`` uses that value
    as the key when it looks the dataset class up in its dataset mapping, so
    a name added here needs a matching entry there.
    """

    ClinTox = "ClinTox"
    PDBBind = "PDBBind"
    FB15k = "FB15k"
    FB15k237 = "FB15k237"
    WN18 = "WN18"
    WN18RR = "WN18RR"
    Hetionet = "Hetionet"
    BACE = "BACE"
    BBBP = "BBBP"
    CEP = "CEP"
    ChEMBLFiltered = "ChEMBLFiltered"
    Delaney = "Delaney"
    FreeSolv = "FreeSolv"
    HIV = "HIV"
    Lipophilicity = "Lipophilicity"
    MUV = "MUV"
    Malaria = "Malaria"
    OPV = "OPV"
    QM8 = "QM8"
    QM9 = "QM9"
    SIDER = "SIDER"
    Tox21 = "Tox21"
    ToxCast = "ToxCast"
    ZINC250k = "ZINC250k"
    ZINC2m = "ZINC2m"
    MOSES = "MOSES"
    PCQM4M = "PCQM4M"
    BetaLactamase = "BetaLactamase"
    Fluorescence = "Fluorescence"
    Stability = "Stability"
    Solubility = "Solubility"
    BinaryLocalization = "BinaryLocalization"
    SubcellularLocalization = "SubcellularLocalization"
    EnzymeCommission = "EnzymeCommission"
    GeneOntology = "GeneOntology"
    AlphaFoldDB = "AlphaFoldDB"
    Fold = "Fold"
    SecondaryStructure = "SecondaryStructure"
    ProteinNet = "ProteinNet"
    HumanPPI = "HumanPPI"
    YeastPPI = "YeastPPI"
    PPIAffinity = "PPIAffinity"
    BindingDB = "BindingDB"
    USPTO50k = "USPTO50k"
    Cora = "Cora"
    PubMed = "PubMed"


@app.command()
def main(
    dataset: DatabaseEnum = typer.Option(
        ...,
        "--dataset",
        help="Input database to be processed.",
    ),
    out_dir: Path = typer.Option(
        ...,
        "--outdir",
        help="Output directory.",
        exists=True,
        writable=True,
        file_okay=False,
        resolve_path=True,
    ),
) -> None:
    """Download the selected TorchDrug dataset into ``out_dir``.

    Args:
        dataset: Dataset chosen with ``--dataset``; a ``DatabaseEnum`` member
            (a plain string holding a member value is accepted as well).
        out_dir: Existing, writable directory given with ``--outdir``.

    Raises:
        ValueError: If ``dataset`` does not name a supported dataset.
    """
    # Normalise to the plain dataset name. The helper is annotated to take a
    # ``str``, and formatting a str-mixin enum member directly differs between
    # Python versions, so only the value is used from here on.
    try:
        dataset_name = DatabaseEnum(dataset).value
    except ValueError as err:
        msg = f"Unsupported dataset: {dataset}"
        raise ValueError(msg) from err

    # Build the name -> class mapping from the enum itself so the two can
    # never drift apart: each member value is the attribute name of the
    # corresponding class in ``torchdrug.datasets``.
    dataset_mapping = {
        member.value: getattr(datasets, member.value) for member in DatabaseEnum
    }

    logger.info("database: %s", dataset_name)
    logger.info("outdir: %s", out_dir)
    torchdrug_download(dataset_name, out_dir, dataset_mapping)


# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
"""torchdrug_download."""
from pathlib import Path


def torchdrug_download(dataset: str, outdir: Path, dataset_mapping: dict) -> None:
    """Instantiate the selected dataset class with ``outdir`` as its path.

    Constructing the dataset class is the whole operation; the resulting
    dataset object is not kept or returned.

    Args:
        dataset: Key of the dataset to fetch in ``dataset_mapping``.
        outdir: Output directory, passed to the dataset class as its first
            positional argument.
        dataset_mapping: Mapping of dataset name to dataset class.

    Raises:
        KeyError: If ``dataset`` is not a key of ``dataset_mapping``.
    """
    import inspect

    selected_dataset_class = dataset_mapping[dataset]

    # Not every dataset constructor takes a ``lazy`` argument; passing it
    # blindly raises TypeError for those that do not. Only forward it when the
    # constructor names it explicitly or accepts arbitrary keyword arguments.
    try:
        parameters = inspect.signature(selected_dataset_class).parameters.values()
    except (TypeError, ValueError):
        # Signature not introspectable: keep the historical behaviour.
        accepts_lazy = True
    else:
        accepts_lazy = any(
            param.name == "lazy" or param.kind is inspect.Parameter.VAR_KEYWORD
            for param in parameters
        )

    if accepts_lazy:
        # lazy = False causes issues with PDBBind dataset such as invalid sequence
        selected_dataset_class(outdir, lazy=True)
    else:
        selected_dataset_class(outdir)
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Tests for torchdrug_download."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
"""Tests for torchdrug_download."""
from pathlib import Path

from polus.mm.utils.torchdrug_download.torchdrug_download import torchdrug_download
from torchdrug import datasets


def test_torchdrug_download_check() -> None:
    """Check that fetching Tox21 leaves ``tox21.csv`` in the output directory.

    The download goes into a fresh temporary directory so the test neither
    pollutes the working directory nor passes because of a stale
    ``tox21.csv`` left behind by an earlier run.
    """
    import tempfile

    dataset = "Tox21"
    dataset_mapping = {"Tox21": datasets.Tox21}
    with tempfile.TemporaryDirectory() as tmp_dir:
        outdir = Path(tmp_dir)
        torchdrug_download(dataset, outdir, dataset_mapping)
        assert (outdir / "tox21.csv").exists()
Loading
Loading