extract_pdbids_drugbank

PolusAI · Jul 17, 2024 · 38a1e30 · 38a1e30
1 parent 23fb84d
commit 38a1e30
Show file tree

Hide file tree

Showing 18 changed files with 1,202 additions and 0 deletions.
diff --git a/utils/extract-pdbids-drugbank-plugin/.bumpversion.cfg b/utils/extract-pdbids-drugbank-plugin/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/extract_pdbids_drugbank/__init__.py]
diff --git a/utils/extract-pdbids-drugbank-plugin/.dockerignore b/utils/extract-pdbids-drugbank-plugin/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/extract-pdbids-drugbank-plugin/.gitignore b/utils/extract-pdbids-drugbank-plugin/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/extract-pdbids-drugbank-plugin/CHANGELOG.md b/utils/extract-pdbids-drugbank-plugin/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/extract-pdbids-drugbank-plugin/Dockerfile b/utils/extract-pdbids-drugbank-plugin/Dockerfile
@@ -0,0 +1,27 @@
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+
+# Work directory defined in the base container
+# WORKDIR ${EXEC_DIR}
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install dependencies with conda (rdkit fails to install through poetry).
+# The env name `project_env` is set in environment.yml; PATH below must match it.
+COPY environment.yml ${EXEC_DIR}
+RUN mamba env create -f ${EXEC_DIR}/environment.yml
+RUN echo "source activate project_env" > ~/.bashrc
+ENV PATH="/opt/conda/envs/project_env/bin:$PATH"
+
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+CMD ["--help"]
diff --git a/utils/extract-pdbids-drugbank-plugin/README.md b/utils/extract-pdbids-drugbank-plugin/README.md
@@ -0,0 +1,19 @@
+# extract_pdbids_drugbank (0.1.0)
+
+Filter the Drugbank database
+
+## Options
+
+This plugin takes 5 input arguments and 4 output arguments:
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| drugbank_xml_file_path | Path to the Drugbank xml file | Input | File | File |
+| smiles | List of input SMILES, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
+| inchi | List of input InChI strings, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
+| inchi_keys | List of input InChI keys, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
+| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | string |
+| output_txt_path | Path to the txt file | Output | File | File |
+| output_smiles | The Smiles of small molecules | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
+| output_pdbids_1D | The PDB IDs of target structures in 1D array  | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
+| output_pdbids_2D | The PDB IDs of target structures in 2D array  | Output | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} |
diff --git a/utils/extract-pdbids-drugbank-plugin/VERSION b/utils/extract-pdbids-drugbank-plugin/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/extract-pdbids-drugbank-plugin/build-docker.sh b/utils/extract-pdbids-drugbank-plugin/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/extract-pdbids-drugbank-tool:${version}
diff --git a/utils/extract-pdbids-drugbank-plugin/environment.yml b/utils/extract-pdbids-drugbank-plugin/environment.yml
@@ -0,0 +1,10 @@
+name: project_env
+channels:
+  - conda-forge
+dependencies:
+  - python==3.10
+  - rdkit==2024.03.1
+  - defusedxml==0.7.1
+  - pytest==8.1.1
+  - cwltool==3.1.20240404144621
+  - cwl-utils==0.33
diff --git a/utils/extract-pdbids-drugbank-plugin/extract_pdbids_drugbank.cwl b/utils/extract-pdbids-drugbank-plugin/extract_pdbids_drugbank.cwl
@@ -0,0 +1,188 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: Filter the Drugbank database
+
+doc: |-
+  Filter the Drugbank database
+
+baseCommand: ["conda", "run", "-n", "project_env", "python", "-m", "polus.mm.utils.extract_pdbids_drugbank"]
+
+hints:
+  DockerRequirement:
+    dockerPull: ndonyapour/extract_pdbids_drugbank
+
+requirements:
+  InlineJavascriptRequirement: {}
+  # Enabling InitialWorkDirRequirement will stage the input Drugbank xml file
+  InitialWorkDirRequirement:
+    listing:
+    - $(inputs.drugbank_xml_file_path)
+
+inputs:
+  drugbank_xml_file_path:
+    label: Path to the Drugbank xml file
+    doc: |-
+      Path to the Drugbank xml file
+    type: File
+    format: edam:format_2332
+    inputBinding:
+      prefix: --drugbank_xml_file_path
+    default:
+      class: File
+      location: ../../../fda_drug_dataset/drugbank/drugbank_5.1.10.xml
+
+  smiles:
+    label: List of input SMILES  # type:
+    doc: |-
+      List of input SMILES
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format: edam:format_2330
+    inputBinding:
+      prefix: --smiles
+    default: []
+
+  inchi:
+    label: List of input InChI strings
+    doc: |-
+      List of input InChI strings
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --inchi
+    default: []
+
+  inchi_keys:
+    label: List of input InChI keys
+    doc: |-
+      List of input InChI keys
+      Type: string[]
+      File type: input
+      Accepted formats: list[string]
+    type: ["null", {"type": "array", "items": "string"}]
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --inchi_keys
+    default: []
+
+  output_txt_path:
+    label: Path to the text dataset file
+    doc: |-
+      Path to the text dataset file
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --output_txt_path
+    default: system.log
+
+outputs:
+  output_txt_path:
+    label: Path to the txt file
+    doc: |-
+      Path to the txt file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+    format: edam:format_2330
+
+  output_smiles:
+    label: The Smiles of small molecules
+    doc: |-
+      The Smiles of small molecules
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+            var lines = self[0].contents.split("\n");
+            // remove blank lines
+            lines = lines.filter(function(line) {return line.trim() !== '';});
+            var smiles = [];
+            for (var i = 0; i < lines.length; i++) {
+              // The format of the lines is as follows: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+              // The first item is the SMILES notation. We need to duplicate it, so each SMILES string
+              // corresponds to a PDB ID in the PDB IDs array.
+                var words = lines[i].split(",").map(function(item) {return item.trim();});
+                for (var j = 1; j < words.length; j++) {
+                      smiles.push(words[0]);
+                }
+              }
+            return smiles;
+        }
+
+  output_pdbids_1D:
+    label: The PDB IDs of target structures in 1D array
+    doc: |-
+      The PDB IDs of target structures in 1D array
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+            var lines = self[0].contents.split("\n");
+            // remove blank lines
+            lines = lines.filter(function(line) {return line.trim() !== '';});
+            var pdbids = [];
+            for (var i = 0; i < lines.length; i++) {
+              // The format of the lines is as follows: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+              // The first item is the SMILES notation and the rest are the target structure PDB IDs.
+                var words = lines[i].split(",").map(function(item) {return item.trim();});
+                for (var j = 1; j < words.length; j++) {
+                      pdbids.push(words[j]);
+                }
+              }
+            return pdbids;
+        }
+
+  output_pdbids_2D:
+    label: The PDB IDs of target structures in 2D array
+    doc: |-
+      The PDB IDs of target structures in 2D array
+    type: {"type": "array", "items": {"type": "array", "items": "string"}}
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+            var lines = self[0].contents.split("\n");
+            // remove blank lines
+            lines = lines.filter(function(line) {return line.trim() !== '';});
+            var pdbids_2d = [];
+            for (var i = 0; i < lines.length; i++) {
+              // The format of the lines is as follows: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
+              // The first item is the SMILES notation and the rest are the target structure PDB IDs.
+                var words = lines[i].split(",").map(function(item) {return item.trim();});
+                var pdbids = [];
+                for (var j = 1; j < words.length; j++) {
+                      pdbids.push(words[j]);
+                }
+                pdbids_2d.push(pdbids);
+              }
+            return pdbids_2d;
+        }
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/extract-pdbids-drugbank-plugin/ict.yml b/utils/extract-pdbids-drugbank-plugin/ict.yml
@@ -0,0 +1,86 @@
+specVersion: "0.1.0"
+name: extract_pdbids_drugbank
+version: 0.1.0
+container: extract-pdbids-drugbank-plugin
+entrypoint:
+title: extract_pdbids_drugbank
+description: Filter the Drugbank database
+author: Brandon Walker, Nazanin Donyapour
+contact: [email protected], [email protected]
+repository:
+documentation:
+citation:
+
+inputs:
+  - name: drugbank_xml_file_path
+    required: true
+    description: Path to the Drugbank xml file
+    type: File
+    defaultValue: {'class': 'File', 'location': '../../../fda_drug_dataset/drugbank/drugbank_5.1.10.xml'}
+    format:
+      uri: edam:format_2332
+  - name: smiles
+    required: true
+    description: List of input SMILES, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  - name: inchi
+    required: true
+    description: List of input InChI strings, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  - name: inchi_keys
+    required: true
+    description: List of input InChI keys, Type string[], File type input, Accepted formats list[string]
+    type: ['null', {'type': 'array', 'items': 'string'}]
+    format:
+      uri: edam:format_2330
+  - name: output_txt_path
+    required: true
+    description: Path to the text dataset file, Type string, File type output, Accepted formats txt
+    type: string
+    defaultValue: system.log
+    format:
+      uri: edam:format_2330
+outputs:
+  - name: output_txt_path
+    required: true
+    description: Path to the txt file
+    type: File
+    format:
+      uri: edam:format_2330
+  - name: output_smiles
+    required: true
+    description: The Smiles of small molecules
+    type: {'type': 'array', 'items': 'string'}
+  - name: output_pdbids_1D
+    required: true
+    description: The PDB IDs of target structures in 1D array
+    type: {'type': 'array', 'items': 'string'}
+  - name: output_pdbids_2D
+    required: true
+    description: The PDB IDs of target structures in 2D array
+    type: {'type': 'array', 'items': {'type': 'array', 'items': 'string'}}
+ui:
+  - key: inputs.drugbank_xml_file_path
+    title: "drugbank_xml_file_path: "
+    description: "Path to the Drugbank xml file"
+    type: File
+  - key: inputs.smiles
+    title: "smiles: "
+    description: "List of input SMILES, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.inchi
+    title: "inchi: "
+    description: "List of input InChI strings, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.inchi_keys
+    title: "inchi_keys: "
+    description: "List of input InChI keys, Type string[], File type input, Accepted formats list[string]"
+    type: ['null', {'type': 'array', 'items': 'string'}]
+  - key: inputs.output_txt_path
+    title: "output_txt_path: "
+    description: "Path to the text dataset file, Type string, File type output, Accepted formats txt"
+    type: string