diff --git a/utils/docking/diffdock/diffdock-tool/.bumpversion.cfg b/utils/docking/diffdock/diffdock-tool/.bumpversion.cfg
new file mode 100644
index 00000000..80ed4022
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize =
+    {major}.{minor}.{patch}-{release}{dev}
+    {major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values =
+    dev
+    _
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/diffdock/__init__.py]
diff --git a/utils/docking/diffdock/diffdock-tool/.dockerignore b/utils/docking/diffdock/diffdock-tool/.dockerignore
new file mode 100644
index 00000000..7c603f81
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/docking/diffdock/diffdock-tool/.gitattributes b/utils/docking/diffdock/diffdock-tool/.gitattributes
new file mode 100644
index 00000000..07fedc1e
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/.gitattributes
@@ -0,0 +1,5 @@
+*.pdb filter=lfs diff=lfs merge=lfs -text
+*.pdbqt filter=lfs diff=lfs merge=lfs -text
+*.mol2 filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.sdf filter=lfs diff=lfs merge=lfs -text
diff --git a/utils/docking/diffdock/diffdock-tool/.gitignore b/utils/docking/diffdock/diffdock-tool/.gitignore
new file mode 100644
index 00000000..c04bc49f
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/docking/diffdock/diffdock-tool/CHANGELOG.md b/utils/docking/diffdock/diffdock-tool/CHANGELOG.md
new file mode 100644
index 00000000..b67793f7
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/docking/diffdock/diffdock-tool/Dockerfile_diffdock_gpu b/utils/docking/diffdock/diffdock-tool/Dockerfile_diffdock_gpu
new file mode 100644
index 00000000..9d873d27
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/Dockerfile_diffdock_gpu
@@ -0,0 +1,58 @@
+# docker build -f Dockerfile_diffdock_gpu -t mrbrandonwalker/diffdock_gpu .
+
+FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 AS devel
+
+# Install conda / mamba (Miniforge3 ships mamba; the separate Mambaforge installer has been retired upstream)
+RUN apt-get update && apt-get install -y wget git build-essential
+
+RUN CONDA="Miniforge3-Linux-x86_64.sh" && \
+    wget --quiet https://github.com/conda-forge/miniforge/releases/latest/download/$CONDA && \
+    chmod +x $CONDA && \
+    ./$CONDA -b -p /mambaforge && \
+    rm -f $CONDA
+ENV PATH=/mambaforge/bin:$PATH
+
+RUN conda install -y pytorch==1.13.0 pytorch-cuda=11.7 pytorch-cluster -c pytorch -c nvidia -c pyg
+
+# Website suggests torch 1.12, but torch 1.12 does not exist for cuda 11.7, so torch 1.13.0 is used https://github.com/gcorso/DiffDock https://data.pyg.org/whl/torch-1.13.0+cu117.html
+# Need to install pytorch first, before the other torch packages
+# Cannot use conda for these packages, otherwise it will install them with a default CUDA (not the same as cuda 11.7),
+# so need to specify which package versions (CUDA and Torch) to use
+# If pytorch-cluster is installed below with the rest of the torch packages, you will get the error "Not compiled with CUDA support"
+
+RUN pip install torch-scatter torch-sparse torch-spline-conv torch-geometric==2.0.4 -f https://data.pyg.org/whl/torch-1.13.0+cu117.html
+
+RUN conda install -y PyYAML scipy "networkx[default]" biopython rdkit e3nn spyrmsd pandas biopandas
+
+# See install for GPU https://github.com/gcorso/DiffDock, some packages are only in pip
+
+RUN pip install 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307' 'dllogger @ git+https://github.com/NVIDIA/dllogger.git' "fair-esm[esmfold]"
+
+RUN git clone https://github.com/gcorso/DiffDock.git
+
+WORKDIR /DiffDock
+
+RUN conda init bash
+
+# generate the pre-computed cached files for speeding up the inference
+# See https://github.com/gcorso/DiffDock#running-diffdock-on-your-own-complexes
+# Note that the first time you run DiffDock on a device the program will precompute and store in cache look-up tables for SO(2) and SO(3) distributions (typically takes a couple of minutes), this won't be repeated in following runs.
+# output pre-computed cached files are of the format .*.npy such as .so3_omegas_array2.npy, .so3_cdf_vals2.npy, .so3_score_norms2.npy, .so3_exp_score_norms2.npy
+RUN python -m inference --protein_ligand_csv data/protein_ligand_example_csv.csv --out_dir results/user_predictions_small --inference_steps 1 --samples_per_complex 1 --batch_size 1 --actual_steps 1
+
+# Delete output results so not in same output folder as future runs
+RUN rm -r results/user_predictions_small
+
+# Clean up temp files
+RUN mamba clean --all --yes
+
+# Now copy everything into a minimal cuda runtime base image.
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04 AS runtime
+
+COPY --from=devel DiffDock/ DiffDock/
+COPY --from=devel mambaforge/ mambaforge/
+
+# shell file to copy cached files, run diffdock and remove large cached files after execution
+COPY diffdock_cmds.sh /DiffDock/
+
+ENV PATH=/mambaforge/bin:$PATH
diff --git a/utils/docking/diffdock/diffdock-tool/README.md b/utils/docking/diffdock/diffdock-tool/README.md
new file mode 100644
index 00000000..0658c289
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/README.md
@@ -0,0 +1,25 @@
+# diffdock (0.1.0)
+
+DiffDock Diffusion based protein ligand docking
+
+## Options
+
+This plugin takes 11 input arguments and 4 output arguments:
+
+| Name | Description | I/O | Type | Default |
+|---------------|-------------------------|--------|--------|---------|
+| protein_path | Protein input file | Input | File | required |
+| ligand_path | Ligand input file | Input | File | required |
+| inference_steps | Number of inference steps for diffusion | Input | int | 20 |
+| samples_per_complex | Number of pose samples to generate per complex | Input | int | 40 |
+| batch_size | Batch size | Input | int | 1 |
+| out_dir | Output directory to save poses | Input | string | results/ |
+| model_dir | Input model directory to use | Input | string | /DiffDock/workdir/paper_score_model/ |
+| confidence_model_dir | Input confidence model directory | Input | string | /DiffDock/workdir/paper_confidence_model |
+| complex_name | Name of complex | Input | string | outputs |
+| output_files | Optional string input declared in diffdock.cwl without a command-line binding | Input | string | none |
+| max_confident_pose | Optional string input declared in diffdock.cwl without a command-line binding | Input | string | none |
+| max_confident_pose | Highest confident pose | Output | File | N/A |
+| output_files | The output poses | Output | File[] | N/A |
+| stderr | Captured standard error of the run | Output | File | N/A |
+| execution_time | Time to run DiffDock | Output | float | N/A |
diff --git a/utils/docking/diffdock/diffdock-tool/VERSION b/utils/docking/diffdock/diffdock-tool/VERSION
new file mode 100644
index 00000000..6e8bf73a
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/docking/diffdock/diffdock-tool/diffdock.cwl b/utils/docking/diffdock/diffdock-tool/diffdock.cwl
new file mode 100644
index 00000000..e83d1ce6
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/diffdock.cwl
@@ -0,0 +1,138 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: DiffDock Diffusion based protein ligand docking
+
+doc: |-
+  DiffDock Diffusion based protein ligand docking
+
+baseCommand: ["bash", "/DiffDock/diffdock_cmds.sh"]
+
+hints:
+  cwltool:CUDARequirement:
+    cudaVersionMin: "11.7"
+    cudaComputeCapability: "3.0"
+    cudaDeviceCountMin: 1
+    cudaDeviceCountMax: 1
+  DockerRequirement:
+    dockerPull: mrbrandonwalker/diffdock_gpu
+
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+
+  protein_path:
+    type: File
+    format: edam:format_1476
+    inputBinding:
+      prefix: --protein_path
+
+  ligand_path:
+    type: File
+    format: edam:format_3814
+    inputBinding:
+      prefix: --ligand_description
+
+  inference_steps:
+    label: number of reverse diffusion steps
+    type: int?
+    inputBinding:
+      prefix: --inference_steps
+    default: 20
+
+  samples_per_complex:
+    label: Number of sample poses to generate per complex
+    type: int?
+    inputBinding:
+      prefix: --samples_per_complex
+    default: 40
+
+  batch_size:
+    label: input batch size for neural net
+    type: int?
+    inputBinding:
+      prefix: --batch_size
+    default: 1
+
+  out_dir:
+    label: where output from diffdock is saved
+    type: string?
+    inputBinding:
+      prefix: --out_dir
+    default: results/
+
+  model_dir:
+    label: directory of DiffDock score model from paper
+    type: string?
+    inputBinding:
+      prefix: --model_dir
+    default: /DiffDock/workdir/paper_score_model/
+
+  confidence_model_dir:
+    label: directory of DiffDock confidence model from paper
+    type: string?
+    inputBinding:
+      prefix: --confidence_model_dir
+    default: /DiffDock/workdir/paper_confidence_model
+
+  complex_name:
+    label: name of folder with pose outputs that will be saved under out_dir folder
+    type: string?
+    inputBinding:
+      prefix: --complex_name
+    default: outputs
+
+  output_files:
+    type: string?
+
+  max_confident_pose:
+    type: string?
+
+outputs:
+
+  max_confident_pose:
+    type: File
+    outputBinding:
+      # the diffdock developers copy only the top ranked pose to a new file rank1.sdf
+      glob: $(inputs.out_dir)/$(inputs.complex_name)/rank1.sdf
+    format: edam:format_3814
+
+  output_files:
+    type: File[]
+    outputBinding:
+      # all other output files besides rank1.sdf have confidence information in them rank*_confidence*.sdf
+      glob: $(inputs.out_dir)/$(inputs.complex_name)/rank*_confidence*.sdf
+    format: edam:format_3814
+
+  stderr:
+    type: File
+    outputBinding:
+      glob: stderr
+
+  execution_time:
+    label: Time to run DiffDock
+    doc: |-
+      Time to run DiffDock
+    type: float
+    outputBinding:
+      glob: stderr
+      loadContents: true
+      outputEval: |
+        ${
+          // the time command outputs to stderr and not to stdout
+          // example output below, parse the float value of seconds (first item in line)
+          // 1it [00:41, 41.03s/it]
+          // 78.909
+          return self[0].contents.split("\n").map(str => parseFloat(str)).reverse().find(num => !isNaN(num));
+        }
+
+stderr: stderr
+
+$namespaces:
+  edam: https://edamontology.org/
+  cwltool: http://commonwl.org/cwltool#
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/docking/diffdock/diffdock-tool/diffdock_cmds.sh b/utils/docking/diffdock/diffdock-tool/diffdock_cmds.sh
new file mode 100644
index 00000000..a2c3b4ce
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/diffdock_cmds.sh
@@ -0,0 +1,11 @@
+#!/bin/bash -e
+# diffdock.cwl launches this file as "bash /DiffDock/diffdock_cmds.sh", so the -e in the shebang is ignored; enable it explicitly
+set -e
+# always remove the large cached files on exit, even when inference fails; the exit status of the failing command is kept
+trap 'rm -rf .*.npy .cache/' EXIT
+# copy cached files to current directory
+cp /DiffDock/.*.npy .
+TIMEFORMAT=%R && time python /DiffDock/inference.py "$@"
+# need to remove large files otherwise cachedir folder will be 3GB each!!
+rm .*.npy
+rm -r .cache/
diff --git a/utils/docking/diffdock/diffdock-tool/ict.yml b/utils/docking/diffdock/diffdock-tool/ict.yml
new file mode 100644
index 00000000..d2371fba
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/ict.yml
@@ -0,0 +1,135 @@
+specVersion: "0.1.0"
+name: diffdock
+version: 0.1.0
+container: diffdock-tool
+entrypoint:
+title: diffdock
+description: DiffDock Diffusion based protein ligand docking
+author: Brandon Walker
+contact: brandon.walker@axleinfo.com
+repository:
+documentation:
+citation:
+
+inputs:
+  - name: protein_path
+    required: true
+    description: Protein input file
+    type: File
+    format:
+      uri: edam:format_1476
+  - name: ligand_path
+    required: true
+    description: Ligand input file
+    type: File
+    format:
+      uri: edam:format_3814
+  - name: inference_steps
+    required: true
+    description: number of reverse diffusion steps
+    type: int
+    defaultValue: 20
+  - name: samples_per_complex
+    required: true
+    description: Number of sample poses to generate per complex
+    type: int
+    defaultValue: 40
+  - name: batch_size
+    required: true
+    description: input batch size for neural net
+    type: int
+    defaultValue: 1
+  - name: out_dir
+    required: true
+    description: where output from diffdock is saved
+    type: string
+    defaultValue: results/
+  - name: model_dir
+    required: true
+    description: directory of DiffDock score model from paper
+    type: string
+    defaultValue: /DiffDock/workdir/paper_score_model/
+  - name: confidence_model_dir
+    required: true
+    description: directory of DiffDock confidence model from paper
+    type: string
+    defaultValue: /DiffDock/workdir/paper_confidence_model
+  - name: complex_name
+    required: true
+    description: name of folder with pose outputs that will be saved under out_dir folder
+    type: string
+    defaultValue: outputs
+  - name: output_files
+    required: true
+    description: string input declared in diffdock.cwl without a command-line binding
+    type: string
+  - name: max_confident_pose
+    required: true
+    description: string input declared in diffdock.cwl without a command-line binding
+    type: string
+outputs:
+  - name: max_confident_pose
+    required: true
+    description: Highest confident pose (rank1.sdf)
+    type: File
+    format:
+      uri: edam:format_3814
+  - name: output_files
+    required: true
+    description: The output poses (rank*_confidence*.sdf)
+    type: File[]
+    format:
+      uri: edam:format_3814
+  - name: stderr
+    required: true
+    description: Captured standard error of the run
+    type: File
+  - name: execution_time
+    required: true
+    description: Time to run DiffDock
+    type: float
+ui:
+  - key: inputs.protein_path
+    title: "protein_path: "
+    description: ""
+    type: File
+  - key: inputs.ligand_path
+    title: "ligand_path: "
+    description: ""
+    type: File
+  - key: inputs.inference_steps
+    title: "inference_steps: "
+    description: ""
+    type: int
+  - key: inputs.samples_per_complex
+    title: "samples_per_complex: "
+    description: ""
+    type: int
+  - key: inputs.batch_size
+    title: "batch_size: "
+    description: ""
+    type: int
+  - key: inputs.out_dir
+    title: "out_dir: "
+    description: ""
+    type: string
+  - key: inputs.model_dir
+    title: "model_dir: "
+    description: ""
+    type: string
+  - key: inputs.confidence_model_dir
+    title: "confidence_model_dir: "
+    description: ""
+    type: string
+  - key: inputs.complex_name
+    title: "complex_name: "
+    description: ""
+    type: string
+  - key: inputs.output_files
+    title: "output_files: "
+    description: ""
+    type: string
+  - key: inputs.max_confident_pose
+    title: "max_confident_pose: "
+    description: ""
+    type: string
diff --git a/utils/docking/diffdock/diffdock-tool/pyproject.toml b/utils/docking/diffdock/diffdock-tool/pyproject.toml
new file mode 100644
index 00000000..7609b230
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/pyproject.toml
@@ -0,0 +1,30 @@
+[tool.poetry]
+name = "polus-mm-utils-diffdock"
+version = "0.1.0"
+description = "DiffDock Diffusion based protein ligand docking"
+authors = ["Data Scientist "]
+readme = "README.md"
+package-mode = false
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+cwl-utils = "0.33"
+cwltool = "3.1.20240404144621"
+
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pytest = "^7.4"
+pytest-sugar = "^0.9.6"
+pre-commit = "^3.2.1"
+black = "^23.3.0"
+mypy = "^1.1.1"
+ruff = "^0.0.270"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+pythonpath = [
+  "."
+]
diff --git a/utils/docking/diffdock/diffdock-tool/tests/5umx_ligand.sdf b/utils/docking/diffdock/diffdock-tool/tests/5umx_ligand.sdf
new file mode 100644
index 00000000..a4faf09d
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/tests/5umx_ligand.sdf
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b6c5502c25f2b4d271d1a00812373c094097470cdb8451170f30aa24122a94ad
+size 3315
diff --git a/utils/docking/diffdock/diffdock-tool/tests/5umx_protein.pdb b/utils/docking/diffdock/diffdock-tool/tests/5umx_protein.pdb
new file mode 100644
index 00000000..0c753aa6
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/tests/5umx_protein.pdb
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3121f590648faea13099e3053900626faf8e7b891f4214af35a557b125f11196
+size 197779
diff --git a/utils/docking/diffdock/diffdock-tool/tests/__init__.py b/utils/docking/diffdock/diffdock-tool/tests/__init__.py
new file mode 100644
index 00000000..d93c7a1e
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for diffdock."""
diff --git a/utils/docking/diffdock/diffdock-tool/tests/test_diffdock.py b/utils/docking/diffdock/diffdock-tool/tests/test_diffdock.py
new file mode 100644
index 00000000..5bd51f7d
--- /dev/null
+++ b/utils/docking/diffdock/diffdock-tool/tests/test_diffdock.py
@@ -0,0 +1,28 @@
+"""Tests for diffdock."""
+import sys
+from pathlib import Path
+
+current_dir = Path(__file__).resolve().parent
+target_dir = current_dir.parent.parent.parent / "cwl_utils"
+sys.path.append(str(target_dir))
+
+from cwl_utilities import call_cwltool  # noqa: E402
+from cwl_utilities import create_input_yaml  # noqa: E402
+from cwl_utilities import parse_cwl_arguments  # noqa: E402
+
+
+def test_diffdock() -> None:
+    """Run diffdock.cwl on the 5umx example and check that rank1.sdf exists."""
+    cwl_file = current_dir.parent / "diffdock.cwl"
+    input_to_props = parse_cwl_arguments(cwl_file)
+    # Use the example protein and ligand files stored next to this test.
+    input_to_props["protein_path"]["path"] = str(current_dir / "5umx_protein.pdb")
+    input_to_props["ligand_path"]["path"] = str(current_dir / "5umx_ligand.sdf")
+
+    input_yaml_path = Path("diffdock.yml")
+    create_input_yaml(input_to_props, input_yaml_path)
+
+    # stdout/stderr are not inspected; the output file is the success check.
+    call_cwltool(cwl_file, input_yaml_path)
+    # The pose file is looked up relative to the current working directory.
+    assert Path("rank1.sdf").exists()