PolusAI · ndonyapour · Jun 6, 2024
diff --git a/utils/extract-pdbids-drugbank-plugin/.bumpversion.cfg b/utils/extract-pdbids-drugbank-plugin/.bumpversion.cfg
@@ -0,0 +1,29 @@
+# bump2version configuration for the extract-pdbids-drugbank plugin.
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+# Files whose embedded version string is rewritten on every bump.
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+# The manifest shipped with this plugin is ict.yml (no plugin.json is present).
+# The search pattern targets the unquoted `version:` key so that the quoted
+# `specVersion` value in the same file is left untouched.
+[bumpversion:file:ict.yml]
+search = version: {current_version}
+replace = version: {new_version}
+
+[bumpversion:file:src/polus/mm/utils/extract_pdbids_drugbank/__init__.py]
diff --git a/utils/extract-pdbids-drugbank-plugin/.dockerignore b/utils/extract-pdbids-drugbank-plugin/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/extract-pdbids-drugbank-plugin/.gitignore b/utils/extract-pdbids-drugbank-plugin/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/extract-pdbids-drugbank-plugin/CHANGELOG.md b/utils/extract-pdbids-drugbank-plugin/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/extract-pdbids-drugbank-plugin/Dockerfile b/utils/extract-pdbids-drugbank-plugin/Dockerfile
@@ -0,0 +1,21 @@
+# Image for the extract-pdbids-drugbank plugin: copies the project into
+# EXEC_DIR and pip-installs it.
+# NOTE(review): the base image is untagged, so rebuilds may pick up a different
+# base over time - consider pinning a release tag.
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+
+# Work directory defined in the base container
+# WORKDIR ${EXEC_DIR}
+
+# Project metadata files referenced by the build (pyproject.toml names README.md as readme)
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install needed packages here
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+# No ENTRYPOINT is defined, so CMD has to be a complete command: a bare
+# ["--help"] would make `docker run <image>` try to execute `--help` itself.
+# Callers that pass their own command (e.g. the CWL baseCommand) override this.
+CMD ["python", "-m", "polus.mm.utils.extract_pdbids_drugbank", "--help"]
diff --git a/utils/extract-pdbids-drugbank-plugin/README.md b/utils/extract-pdbids-drugbank-plugin/README.md
@@ -0,0 +1,19 @@
+# extract_pdbids_drugbank (0.1.0)
+
+Filter the Drugbank database
+
+## Options
+
+This plugin takes 5 input arguments and 4 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| drugbank_xml_file_path | Path to the Drugbank xml file | Input | File | File |
+| smiles | List of input SMILES, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | [] |
+| inchi | List of input InChI strings, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | [] |
+| inchi_keys | List of input InChIKeys, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | [] |
+| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | system.log |
+| output_txt_path | Path to the txt file | Output | File | File |
+| output_smiles | The Smiles of small molecules | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
+| output_pdbids_1D | The PDB IDs of target structures in 1D array  | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
+| output_pdbids_2D | The PDB IDs of target structures in 2D array  | Output | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} |
diff --git a/utils/extract-pdbids-drugbank-plugin/VERSION b/utils/extract-pdbids-drugbank-plugin/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/extract-pdbids-drugbank-plugin/build-docker.sh b/utils/extract-pdbids-drugbank-plugin/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/extract-pdbids-drugbank-tool:${version}
diff --git a/utils/extract-pdbids-drugbank-plugin/extract_pdbids_drugbank_0@[email protected] b/utils/extract-pdbids-drugbank-plugin/extract_pdbids_drugbank_0@[email protected]
@@ -0,0 +1,185 @@
+#!/usr/bin/env cwl-runner
+# CWL description of the extract_pdbids_drugbank command-line tool.
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: Filter the Drugbank database
+
+doc: |-
+  Filter the Drugbank database
+
+# Runs the plugin's Python module; inputs with an inputBinding add their own --flag.
+baseCommand: ["python", "-m", "polus.mm.utils.extract_pdbids_drugbank"]
+
+# Container image pinned by digest; declared as a hint rather than a requirement.
+hints:
+  DockerRequirement:
+    dockerPull: polusai/extract-pdbids-drugbank-tool@sha256:60097d534aef1ced5e08bc896d541a5364d8452a16d883b845e29552a99027b4
+
+requirements:
+  # Needed for the ${...} JavaScript bodies used by the outputEval fields.
+  InlineJavascriptRequirement: {}
+  # Enabling InitialWorkDirRequirement will stage the input Drugbank xml file
+  InitialWorkDirRequirement:
+    listing:
+    - $(inputs.drugbank_xml_file_path)
+
+inputs:
+  # Mandatory File input (no null alternative, no default); passed on the
+  # command line as `--drugbank_xml_file_path <path>`.
+  drugbank_xml_file_path:
+    label: Path to the Drugbank xml file
+    doc: |-
+      Path to the Drugbank xml file
+    type: File
+    format: edam:format_2332
+    inputBinding:
+      prefix: --drugbank_xml_file_path
+
+  # Optional list of SMILES strings (nullable); defaults to an empty list.
+  # NOTE(review): CWL v1.0 documents `format` as applying to File inputs only,
+  # while this input is a string array - confirm the field is needed here.
+  smiles:
+    label: List of input SMILES
+    doc: |-
+      List of input SMILES
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format: edam:format_2330
+    inputBinding:
+      prefix: --smiles
+    default: []
+
+  # Optional list of InChI strings (nullable); defaults to an empty list and is
+  # passed to the tool through the `--inchi` flag.
+  inchi:
+    label: List of input InChI strings
+    doc: |-
+      List of input InChI strings
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --inchi
+    default: []
+
+  # Optional list of InChIKeys (nullable); defaults to an empty list and is
+  # passed to the tool through the `--inchi_keys` flag.
+  inchi_keys:
+    label: List of input InChIKeys
+    doc: |-
+      List of input InChIKeys
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --inchi_keys
+    default: []
+
+  # Name of the text file handed to the tool via `--output_txt_path`; the
+  # outputs of this tool glob for this same value. Falls back to `system.log`.
+  output_txt_path:
+    label: Path to the text dataset file
+    doc: |-
+      Path to the text dataset file
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --output_txt_path
+    default: system.log
+
+outputs:
+  # The text file itself, located by the name given in the
+  # `output_txt_path` input.
+  output_txt_path:
+    label: Path to the txt file
+    doc: |-
+      Path to the txt file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+    format: edam:format_2330
+
+  # Flat list of SMILES strings, one entry per PDB ID found in the text file.
+  output_smiles:
+    label: The Smiles of small molecules
+    doc: |-
+      The Smiles of small molecules
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      # Exposes the file text as self[0].contents (CWL loads at most the first 64 KiB).
+      loadContents: true
+      outputEval: |
+        ${
+            // Every non-blank line has the form <SMILES>,<pdbid>,<pdbid>,...
+            // e.g. NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+            // The SMILES is repeated once per PDB ID so that this array lines up
+            // index-by-index with the flat PDB ID array.
+            var repeated = [];
+            self[0].contents.split("\n").forEach(function(row) {
+                if (row.trim() === '') {
+                    return;  // skip blank lines
+                }
+                var fields = row.split(",").map(function(field) {return field.trim();});
+                fields.slice(1).forEach(function() {
+                    repeated.push(fields[0]);
+                });
+            });
+            return repeated;
+        }
+
+  # Flat list of every PDB ID in the text file, in file order.
+  output_pdbids_1D:
+    label: The PDB IDs of target structures in 1D array
+    doc: |-
+      The PDB IDs of target structures in 1D array
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      # Exposes the file text as self[0].contents (CWL loads at most the first 64 KiB).
+      loadContents: true
+      outputEval: |
+        ${
+            // Every non-blank line has the form <SMILES>,<pdbid>,<pdbid>,...
+            // e.g. NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+            // Everything after the leading SMILES field is a target-structure PDB ID.
+            var flat = [];
+            self[0].contents.split("\n").forEach(function(row) {
+                if (row.trim() === '') {
+                    return;  // skip blank lines
+                }
+                var fields = row.split(",").map(function(field) {return field.trim();});
+                Array.prototype.push.apply(flat, fields.slice(1));
+            });
+            return flat;
+        }
+
+  # PDB IDs grouped per line of the text file: one inner array per molecule.
+  output_pdbids_2D:
+    label: The PDB IDs of target structures in 2D array
+    doc: |-
+      The PDB IDs of target structures in 2D array
+    type:
+      type: array
+      items:
+        type: array
+        items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      # Exposes the file text as self[0].contents (CWL loads at most the first 64 KiB).
+      loadContents: true
+      outputEval: |
+        ${
+            // Every non-blank line has the form <SMILES>,<pdbid>,<pdbid>,...
+            // e.g. NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+            // Each line contributes one inner array holding its PDB IDs
+            // (the leading SMILES field is dropped).
+            return self[0].contents.split("\n")
+                .filter(function(row) {return row.trim() !== '';})
+                .map(function(row) {
+                    var fields = row.split(",").map(function(field) {return field.trim();});
+                    return fields.slice(1);
+                });
+        }
+
+# Prefix behind the `edam:format_*` identifiers used by the inputs and outputs.
+$namespaces:
+  edam: https://edamontology.org/
+
+# Ontology document listed as the schema for those identifiers.
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/extract-pdbids-drugbank-plugin/ict.yml b/utils/extract-pdbids-drugbank-plugin/ict.yml
@@ -0,0 +1,86 @@
+# ICT manifest header for the extract_pdbids_drugbank plugin.
+specVersion: "0.1.0"
+name: extract_pdbids_drugbank
+version: 0.1.0
+# NOTE(review): build-docker.sh tags the image `polusai/extract-pdbids-drugbank-tool`;
+# confirm this container name is the intended one.
+container: extract-pdbids-drugbank-plugin
+# Left empty (parsed as null).
+entrypoint:
+title: extract_pdbids_drugbank
+description: Filter the Drugbank database
+author: Brandon Walker, Nazanin Donyapour
+contact: [email protected], [email protected]
+# The remaining fields are left empty (parsed as null).
+repository:
+documentation:
+citation:
+
+# Inputs accepted by the plugin.
+inputs:
+  # The Drugbank XML database file.
+  - name: drugbank_xml_file_path
+    required: true
+    description: Path to the Drugbank xml file
+    type: File
+    defaultValue: {'class': 'File', 'location': '../../../fda_drug_dataset/drugbank/drugbank_5.1.10.xml'}
+    format:
+      uri: edam:format_2332
+  # The three filter lists below have nullable types, so they are not mandatory.
+  - name: smiles
+    required: false
+    description: List of input SMILES, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  - name: inchi
+    required: false
+    description: List of input InChI strings, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  - name: inchi_keys
+    required: false
+    description: List of input InChIKeys, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  # Name of the text file the tool writes.
+  - name: output_txt_path
+    required: true
+    description: Path to the text dataset file, Type string, File type output, Accepted formats txt
+    type: string
+    defaultValue: system.log
+    format:
+      uri: edam:format_2330
+# Outputs declared for the plugin: one text file plus three string arrays.
+outputs:
+  - name: output_txt_path
+    required: true
+    description: Path to the txt file
+    type: File
+    format:
+      uri: edam:format_2330
+  # Flat string arrays.
+  - name: output_smiles
+    required: true
+    description: The Smiles of small molecules
+    type: {'type': 'array', 'items': 'string'}
+  - name: output_pdbids_1D
+    required: true
+    description: The PDB IDs of target structures in 1D array
+    type: {'type': 'array', 'items': 'string'}
+  # Array of string arrays.
+  - name: output_pdbids_2D
+    required: true
+    description: The PDB IDs of target structures in 2D array
+    type: {'type': 'array', 'items': {'type': 'array', 'items': 'string'}}
+# UI entries, one per input; each `key` refers to an input by name.
+ui:
+  - key: inputs.drugbank_xml_file_path
+    title: "drugbank_xml_file_path: "
+    description: "Path to the Drugbank xml file"
+    type: File
+  - key: inputs.smiles
+    title: "smiles: "
+    description: "List of input SMILES, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.inchi
+    title: "inchi: "
+    description: "List of input InChI strings, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.inchi_keys
+    title: "inchi_keys: "
+    description: "List of input InChIKeys, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.output_txt_path
+    title: "output_txt_path: "
+    description: "Path to the text dataset file, Type string, File type output, Accepted formats txt"
+    type: string
diff --git a/utils/extract-pdbids-drugbank-plugin/pyproject.toml b/utils/extract-pdbids-drugbank-plugin/pyproject.toml
@@ -0,0 +1,33 @@
+# Poetry project definition for the extract-pdbids-drugbank plugin.
+[tool.poetry]
+name = "polus-mm-utils-extract-pdbids-drugbank"
+version = "0.1.0"
+description = "Filter the Drugbank database"
+authors = ["Nazanin Donyapour <[email protected]>", "Brandon Walker <[email protected]>"]
+readme = "README.md"
+# The `polus` package is taken from the src/ directory.
+packages = [{include = "polus", from = "src"}]
+
+# Runtime dependencies; all but typer are pinned to an exact version.
+[tool.poetry.dependencies]
+python = ">=3.9,<3.13"
+typer = "^0.7.0"
+sophios = "0.1.4"
+pandas = "2.2.2"
+rdkit = "2024.3.5"
+defusedxml = "0.7.1"
+
+# Development-only tooling (version bumping, tests, formatting, linting, typing).
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pytest = "^7.4"
+pytest-sugar = "^0.9.6"
+pre-commit = "^3.2.1"
+black = "^23.3.0"
+mypy = "^1.1.1"
+ruff = "^0.0.270"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+# Adds the project root to sys.path when pytest runs.
+[tool.pytest.ini_options]
+pythonpath = [
+  "."
+]