extract pdbbind refined

PolusAI · May 30, 2024 · f1bf40f · f1bf40f
1 parent 6d5007e
commit f1bf40f
Show file tree

Hide file tree

Showing 17 changed files with 729 additions and 0 deletions.
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/pdbbind_refined_v2020/__init__.py]
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile
@@ -0,0 +1,32 @@
+# docker build -f Dockerfile -t mrbrandonwalker/pdbbind_refined_v2020_tool .
+# Image for the pdbbind_refined_v2020 tool: bundles the extracted PDBbind v2020 refined archive and the Python package.
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+RUN apt-get update && apt-get install -y wget && rm -rf /var/lib/apt/lists/*
+# Download the PDBbind dataset at build time so that it ships inside the image
+# RUN wget --no-clobber http://www.pdbbind.org.cn/download/PDBbind_v2020_refined.tar.gz
+## Updated to the new download URL from the PDBbind website (around 10 times faster).
+RUN wget --no-clobber https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_refined.tar.gz
+RUN tar -xvf PDBbind_v2020_refined.tar.gz
+
+# Work directory defined in the base container
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install needed packages here (-y: docker build has no TTY, so never wait for a confirmation prompt)
+RUN mamba install -y -c conda-forge pandas
+
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+# NOTE(review): no ENTRYPOINT is defined, so this CMD only works when the caller supplies one; confirm this is intended.
+CMD ["--help"]
+
+WORKDIR /outdir
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md
@@ -0,0 +1,16 @@
+# pdbbind_refined_v2020 (0.1.0)
+
+Extract pdbbind_refined_v2020 data
+
+## Options
+
+This plugin takes 7 input arguments
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| index_file_name | The index file name | Input | string | string |
+| base_dir | The base_dir path | Input | string | string |
+| query | query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki, Type: string, File type: input, Accepted formats: txt | Input | string | string |
+| min_row | The row min index, Type: int | Input | int | int |
+| max_row | The row max index, Type: int | Input | int | int |
+| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | boolean |
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py
@@ -0,0 +1 @@
+"""pdbbind_refined_v2020_plugin package."""
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+set -euo pipefail  # stop if ./VERSION cannot be read instead of building with an empty tag
+version=$(<VERSION)
+docker build . -t "polusai/pdbbind-refined-v2020-tool:${version}"
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl
@@ -0,0 +1,246 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+# CWL wrapper around the polus.mm.utils.pdbbind_refined_v2020 command-line module.
+class: CommandLineTool
+
+label: Download the PDBbind refined database
+
+doc: |-
+  Download the PDBbind refined database
+
+baseCommand: ["python", "-m", "polus.mm.utils.pdbbind_refined_v2020"]
+# NOTE(review): this image name differs from the container in ict.yml (polusai/pdbbind-refined-v2020-tool:0.1.0) — confirm which one is published.
+hints:
+  DockerRequirement:
+    dockerPull: mrbrandonwalker/pdbbind_refined_v2020_tool
+
+# Required because the outputs section uses ${...} JavaScript outputEval expressions.
+requirements:
+  InlineJavascriptRequirement: {}
+
+inputs:
+
+  index_file_name:
+    label: The index file name
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --index_file_name
+      position: 1
+    default: INDEX_refined_data.2020
+
+  query:
+    label: query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
+    doc: |-
+      query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
+      Type: string
+      File type: input
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --query
+      position: 2
+
+  output_txt_path:
+    label: Path to the text dataset file
+    doc: |-
+      Path to the text dataset file
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --output_txt_path
+      position: 3
+    default: system.log
+
+  min_row:
+    label: The row min index
+    doc: |-
+      The row min index
+      Type: int
+    type: int?
+    format:
+    - edam:format_2330
+    inputBinding:
+      position: 4
+      prefix: --min_row
+
+  max_row:
+    label: The row max index
+    doc: |-
+      The row max index
+      Type: int
+    type: int?
+    format:
+    - edam:format_2330
+    inputBinding:
+      position: 5
+      prefix: --max_row
+
+  convert_Kd_dG:
+    label: If this is set to true, dG will be calculated
+    doc: If this is set to true, dG will be calculated
+    type: boolean
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --convert_Kd_dG
+      position: 6
+    default: False
+
+  experimental_dGs:
+    label: Experimental Free Energies of Binding
+    doc: |-
+      Experimental Free Energies of Binding
+    type: string?
+    format:
+    - edam:format_2330
+
+  pdb_ids:
+    label: The PDBID of proteins
+    doc: |-
+      The PDBID of proteins
+    type: string?
+    format:
+    - edam:format_2330
+
+outputs:
+
+  output_txt_path:
+    label: Path to the txt file
+    doc: |-
+      Path to the txt file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+    format: edam:format_2330
+
+  output_pdb_paths:
+    label: Path to the input file
+    doc: |-
+      Path to the input file
+      Type: string
+      File type: input
+      Accepted formats: pdb
+    type: File[]
+    outputBinding:
+      # NOTE: Do NOT just use glob: ./*.pdb !!! This will return an array sorted by filenames.
+      # We want the order of output_pdb_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          // One protein PDB File per non-blank line of the text file, kept in file order.
+          var lines = self[0].contents.split("\n");
+          var pdbs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var pdbid = lines[i].trim().split(" ")[0];
+            // A blank line (for example after a trailing newline) has no ID and must not become a File entry.
+            if (pdbid === "") { continue; }
+            pdbs.push({"class": "File", "path": pdbid + "_protein.pdb"});
+            }
+          return pdbs;
+        }
+    format: edam:format_1476
+
+  output_sdf_paths:
+    label: Path to the input file
+    doc: |-
+      Path to the input file
+      Type: string
+      File type: input
+      Accepted formats: sdf
+    type: File[]
+    outputBinding:
+      # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames.
+      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          // One ligand SDF File per non-blank line of the text file, kept in file order.
+          var lines = self[0].contents.split("\n");
+          var sdfs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var pdbid = lines[i].trim().split(" ")[0];
+            // A blank line (for example after a trailing newline) has no ID and must not become a File entry.
+            if (pdbid === "") { continue; }
+            sdfs.push({"class": "File", "path": pdbid + "_ligand.sdf"});
+            }
+          return sdfs;
+        }
+    format: edam:format_3814
+
+  experimental_dGs:
+    label: Experimental Free Energies of Binding
+    doc: |-
+      Experimental Free Energies of Binding
+    type:
+    - "null"
+    - type: array
+      items: float
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          // Third space-separated field of every row that has one, parsed as a float; null when no row qualifies.
+          var dGs = [];
+          self[0].contents.split("\n").forEach(function (row) {
+            var fields = row.split(" ");
+            if (fields.length > 2) {
+              dGs.push(parseFloat(fields[2]));
+            }
+          });
+          if (dGs.length != 0) {
+            return dGs;
+          }
+          return null;
+        }
+
+  pdb_ids:
+    label: The PDBID of proteins
+    doc: |-
+      The PDBID of proteins
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var pdbids = [];
+          for (var i = 0; i < lines.length; i++) {
+            var pdbid = lines[i].trim().split(" ")[0];
+            // Keep the first token of each non-blank line; split always returns at least one element, so without this filter the emptiness check below could never fire.
+            if (pdbid !== "") { pdbids.push(pdbid); }
+            }
+          if (pdbids.length == 0) {
+            throw new Error("Error! pdbids are empty!");
+          } else {
+            return pdbids;
+          }
+        }
+
+  stdout:
+    type: File
+    outputBinding:
+      glob: stdout
+
+stdout: stdout
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml
@@ -0,0 +1,42 @@
+specVersion: 0.1.0  # tool manifest: container image plus the inputs the tool accepts
+name: labshare/pdbbind-refined-v2020
+version: 0.1.0
+container: polusai/pdbbind-refined-v2020-tool:0.1.0  # same image name and tag that build-docker.sh builds from VERSION
+entrypoint: ""  # the Dockerfile defines no ENTRYPOINT either
+title: pdbbind_refined_v2020
+description: Extracts data from the PDBBind refined dataset
+author: Brandon Walker ([email protected]), Nazanin Donyapour ([email protected])
+repository: https://github.com/labshare/mmtools
+documentation: https://ncats.nih.gov/preclinical/core/informatics
+citation: ""
+inputs:
+- name: output_txt_path
+  required: false
+  description: Path to the text dataset file
+  type: string
+  default: /outdir/system.log  # /outdir is the WORKDIR set in the Dockerfile
+- name: index_file_name
+  required: false
+  description: The index file name
+  type: string
+  default: INDEX_refined_data.2020  # same default as the CWL tool's index_file_name input
+- name: base_dir  # NOTE(review): the CWL tool declares no base_dir input — confirm how it is supplied there
+  required: true
+  description: The base_dir path
+  type: string
+- name: query
+  required: false
+  description: "query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki"
+  type: string
+- name: min_row
+  required: false
+  description: The row min index
+  type: int
+- name: max_row
+  required: false
+  description: The row max index
+  type: int
+- name: convert_Kd_dG  # the CWL tool defaults this input to False
+  required: false
+  description: If this is set to true, dG will be calculated
+  type: boolean