diffdock

PolusAI · May 30, 2024 · ee6db7a · ee6db7a
1 parent 6d5007e
commit ee6db7a
Show file tree

Hide file tree

Showing 16 changed files with 472 additions and 0 deletions.
diff --git a/utils/docking/diffdock/diffdock-tool/.bumpversion.cfg b/utils/docking/diffdock/diffdock-tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/diffdock/__init__.py]
diff --git a/utils/docking/diffdock/diffdock-tool/.dockerignore b/utils/docking/diffdock/diffdock-tool/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/docking/diffdock/diffdock-tool/.gitattributes b/utils/docking/diffdock/diffdock-tool/.gitattributes
@@ -0,0 +1,5 @@
+*.pdb filter=lfs diff=lfs merge=lfs -text
+*.pdbqt filter=lfs diff=lfs merge=lfs -text
+*.mol2 filter=lfs diff=lfs merge=lfs -text
+*.xlsx filter=lfs diff=lfs merge=lfs -text
+*.sdf filter=lfs diff=lfs merge=lfs -text
diff --git a/utils/docking/diffdock/diffdock-tool/.gitignore b/utils/docking/diffdock/diffdock-tool/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/docking/diffdock/diffdock-tool/CHANGELOG.md b/utils/docking/diffdock/diffdock-tool/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/docking/diffdock/diffdock-tool/Dockerfile_diffdock_gpu b/utils/docking/diffdock/diffdock-tool/Dockerfile_diffdock_gpu
@@ -0,0 +1,58 @@
+# GPU image for DiffDock; build with: docker build -f Dockerfile_diffdock_gpu -t mrbrandonwalker/diffdock_gpu .
+
+FROM nvidia/cuda:11.7.1-cudnn8-devel-ubuntu20.04 AS devel
+
+# Tools used below to download (wget, git) and compile (build-essential) the dependencies
+RUN apt-get update && apt-get install -y wget git build-essential
+
+# Install conda / mamba. NOTE(review): the "latest" URL is unpinned and Mambaforge is reportedly deprecated in favour of Miniforge3 - verify the download still resolves, or pin a release
+RUN CONDA="Mambaforge-Linux-x86_64.sh" && \
+    wget --quiet https://github.com/conda-forge/miniforge/releases/latest/download/$CONDA && \
+    chmod +x $CONDA && \
+    ./$CONDA -b -p /mambaforge && \
+    rm -f $CONDA
+ENV PATH=/mambaforge/bin:$PATH
+
+RUN conda install -y pytorch==1.13.0 pytorch-cuda=11.7 pytorch-cluster -c pytorch -c nvidia -c pyg
+
+# The DiffDock README suggests torch 1.12, but no torch 1.12 build exists for CUDA 11.7, so torch 1.13.0 is used; see https://github.com/gcorso/DiffDock https://data.pyg.org/whl/torch-1.13.0+cu117.html
+# pytorch must be installed first, before the other torch packages.
+# Do not use conda for the packages below: it would install builds for a default CUDA (not 11.7), so pip is pointed at the wheel index matching both the CUDA and torch versions.
+# pytorch-cluster is installed with conda above because installing it below with the rest of the torch packages gives the error "Not compiled with CUDA support".
+
+RUN pip install torch-scatter torch-sparse torch-spline-conv torch-geometric==2.0.4 -f https://data.pyg.org/whl/torch-1.13.0+cu117.html
+
+RUN conda install -y PyYAML scipy "networkx[default]" biopython rdkit e3nn spyrmsd pandas biopandas
+
+# See install for GPU https://github.com/gcorso/DiffDock, some packages are only in pip
+
+RUN pip install 'openfold @ git+https://github.com/aqlaboratory/openfold.git@4b41059694619831a7db195b7e0988fc4ff3a307' 'dllogger @ git+https://github.com/NVIDIA/dllogger.git' "fair-esm[esmfold]"
+
+# NOTE(review): unpinned clone - the image contents depend on the DiffDock revision at build time; consider checking out a fixed commit
+RUN git clone https://github.com/gcorso/DiffDock.git
+
+WORKDIR /DiffDock
+
+RUN conda init bash
+
+# Generate the pre-computed cached files to speed up inference, see https://github.com/gcorso/DiffDock#running-diffdock-on-your-own-complexes
+# The first time DiffDock runs on a device it precomputes and caches look-up tables for SO(2) and SO(3) distributions (typically takes a couple of minutes); this is not repeated in following runs.
+# The cached files are named .*.npy, such as .so3_omegas_array2.npy, .so3_cdf_vals2.npy, .so3_score_norms2.npy, .so3_exp_score_norms2.npy
+RUN python -m inference --protein_ligand_csv data/protein_ligand_example_csv.csv --out_dir results/user_predictions_small --inference_steps 1 --samples_per_complex 1 --batch_size 1 --actual_steps 1
+
+# Delete output results so not in same output folder as future runs
+RUN rm -r results/user_predictions_small
+
+# Clean up temp files
+RUN mamba clean --all --yes
+
+# Now copy everything into a minimal cuda runtime base image (absolute paths, so the result does not depend on the base image's WORKDIR).
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04 AS runtime
+
+COPY --from=devel /DiffDock/ /DiffDock/
+COPY --from=devel /mambaforge/ /mambaforge/
+
+# shell file to copy cached files, run diffdock and remove large cached files after execution
+COPY diffdock_cmds.sh /DiffDock/
+
+ENV PATH=/mambaforge/bin:$PATH
diff --git a/utils/docking/diffdock/diffdock-tool/README.md b/utils/docking/diffdock/diffdock-tool/README.md
@@ -0,0 +1,22 @@
+# diffdock (0.1.0)
+
+DiffDock Diffusion based protein ligand docking
+
+## Options
+
+This plugin takes 11 input arguments and 4 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| protein_path | Protein input file | Input | File | - |
+| ligand_path | Ligand input file | Input | File | - |
+| inference_steps | Number of inference steps for diffusion | Input | int | 20 |
+| samples_per_complex | Number of pose samples to generate per complex | Input | int | 40 |
+| batch_size | Batch size | Input | int | 1 |
+| out_dir | Output directory to save poses | Input | string | results/ |
+| model_dir | Input model directory to use | Input | string | /DiffDock/workdir/paper_score_model/ |
+| confidence_model_dir | Input confidence model directory | Input | string | /DiffDock/workdir/paper_confidence_model |
+| complex_name | Name of complex | Input | string | outputs |
+| max_confident_pose | Highest confident pose | Output | File | - |
+| output_files | The output poses | Output | File[] | - |
+| execution_time | Time to run DiffDock | Output | float | - |
diff --git a/utils/docking/diffdock/diffdock-tool/VERSION b/utils/docking/diffdock/diffdock-tool/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/docking/diffdock/diffdock-tool/diffdock.cwl b/utils/docking/diffdock/diffdock-tool/diffdock.cwl
@@ -0,0 +1,138 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: DiffDock Diffusion based protein ligand docking
+
+doc: DiffDock Diffusion based protein ligand docking
+
+baseCommand: [bash, /DiffDock/diffdock_cmds.sh]  # bound inputs are appended as arguments to this script
+
+# Exactly one CUDA device is requested through the cwltool CUDARequirement extension
+hints:
+  cwltool:CUDARequirement:
+    cudaVersionMin: '11.7'
+    cudaComputeCapability: '3.0'
+    cudaDeviceCountMin: 1
+    cudaDeviceCountMax: 1
+  DockerRequirement:
+    dockerPull: mrbrandonwalker/diffdock_gpu
+
+requirements:
+  InlineJavascriptRequirement: {}
+inputs:
+
+  # Protein structure file, passed as --protein_path
+  protein_path:
+    type: File
+    format: edam:format_1476
+    inputBinding: {prefix: --protein_path}
+
+  # Ligand file; note that the flag name (--ligand_description) differs from the input name
+  ligand_path:
+    type: File
+    format: edam:format_3814
+    inputBinding: {prefix: --ligand_description}
+
+  # Optional; 20 is used when the caller does not provide a value
+  inference_steps:
+    label: number of reverse diffusion steps
+    type: int?
+    default: 20
+    inputBinding: {prefix: --inference_steps}
+
+  # Optional; 40 is used when the caller does not provide a value
+  samples_per_complex:
+    label: Number of sample poses to generate per complex
+    type: int?
+    default: 40
+    inputBinding: {prefix: --samples_per_complex}
+
+  # Optional; 1 is used when the caller does not provide a value
+  batch_size:
+    label: input batch size for neural net
+    type: int?
+    default: 1
+    inputBinding: {prefix: --batch_size}
+
+  # Also used by the output globs to locate the generated poses
+  out_dir:
+    label: where output from diffdock is saved
+    type: string?
+    default: 'results/'
+    inputBinding: {prefix: --out_dir}
+
+  # Defaults to a path under /DiffDock/workdir
+  model_dir:
+    label: directory of DiffDock score model from paper
+    type: string?
+    default: '/DiffDock/workdir/paper_score_model/'
+    inputBinding: {prefix: --model_dir}
+
+  # Defaults to a path under /DiffDock/workdir
+  confidence_model_dir:
+    label: directory of DiffDock confidence model from paper
+    type: string?
+    default: '/DiffDock/workdir/paper_confidence_model'
+    inputBinding: {prefix: --confidence_model_dir}
+
+  # Also used by the output globs, as the sub-folder of out_dir
+  complex_name:
+    label: name of folder with pose outputs that will be saved under out_dir folder
+    type: string?
+    default: 'outputs'
+    inputBinding: {prefix: --complex_name}
+
+  output_files:  # no inputBinding: never placed on the command line
+    type: string?
+
+  max_confident_pose:  # no inputBinding: never placed on the command line
+    type: string?
+
+outputs:
+
+  max_confident_pose:
+    type: File
+    outputBinding:
+      # the diffdock developers copy only the top ranked pose to a new file rank1.sdf
+      glob: $(inputs.out_dir)/$(inputs.complex_name)/rank1.sdf
+    format: edam:format_3814
+
+  output_files:
+    type: File[]
+    outputBinding:
+      # all other output files besides rank1.sdf have confidence information in them rank*_confidence*.sdf
+      glob: $(inputs.out_dir)/$(inputs.complex_name)/rank*_confidence*.sdf
+    format: edam:format_3814
+
+  stderr:
+    type: File
+    outputBinding:
+      glob: stderr
+
+  execution_time:
+    label: Time to run DiffDock
+    doc: Time to run DiffDock
+    type: float
+    outputBinding:
+      glob: stderr
+      loadContents: true
+      outputEval: |
+        ${
+          // the time command outputs to stderr and not to stdout; example output below, with the elapsed seconds on a line of their own
+          // 1it [00:41, 41.03s/it]
+          // 78.909
+          // keep the last line that is entirely a number (parseFloat alone would read the progress line above as 1); null if there is none
+          // plain ES5.1 (no arrow functions or Array.prototype.find), the dialect the CWL spec defines for expressions
+          return self[0].contents.split("\n").reduce(function (last, line) { return /^\s*\d+(\.\d+)?\s*$/.test(line) ? parseFloat(line) : last; }, null);
+        }
+
+stderr: 'stderr'  # the command's standard error is written to a file named stderr
+
+$namespaces:
+  edam: 'https://edamontology.org/'
+  cwltool: 'http://commonwl.org/cwltool#'
+
+$schemas:
+- 'https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl'
diff --git a/utils/docking/diffdock/diffdock-tool/diffdock_cmds.sh b/utils/docking/diffdock/diffdock-tool/diffdock_cmds.sh
@@ -0,0 +1,7 @@
+#!/bin/bash -e
+# copy cached files to current directory
+cp /DiffDock/.*.npy .
+TIMEFORMAT=%R && time python /DiffDock/inference.py "$@"
+# need to remove large files otherwise cachedir folder will be 3GB each!!
+rm .*.npy
+rm -r .cache/