Skip to content

Commit

Permalink
extract_pdbids_drugbank
Browse files Browse the repository at this point in the history
  • Loading branch information
ndonyapour committed Jul 17, 2024
1 parent 23fb84d commit 38a1e30
Show file tree
Hide file tree
Showing 18 changed files with 1,202 additions and 0 deletions.
29 changes: 29 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/.bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# bumpversion configuration for the extract-pdbids-drugbank plugin.
[bumpversion]
current_version = 0.1.0
# A version bump neither commits nor tags.
commit = False
tag = False
# Version grammar: major.minor.patch, optionally followed by "-<release><dev>".
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
# Two output formats: one with the release/dev suffix, one without.
serialize =
{major}.{minor}.{patch}-{release}{dev}
{major}.{minor}.{patch}

# The "release" part takes the values "dev" and "_"; it starts at "dev" and
# "_" is declared as its optional value.
[bumpversion:part:release]
optional_value = _
first_value = dev
values =
dev
_

# The "dev" part has no overrides.
[bumpversion:part:dev]

# Files listed for version rewriting. Only pyproject.toml declares an explicit
# search/replace pattern; the others declare none.
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/extract_pdbids_drugbank/__init__.py]
4 changes: 4 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.venv
out
tests
__pycache__
1 change: 1 addition & 0 deletions utils/extract-pdbids-drugbank-plugin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
poetry.lock
5 changes: 5 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
27 changes: 27 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Image for the extract_pdbids_drugbank plugin, built on the mambaforge base.
FROM condaforge/mambaforge

# EXEC_DIR receives the package metadata and sources copied below.
ENV EXEC_DIR="/opt/executables"
ENV POLUS_LOG="INFO"
RUN mkdir -p ${EXEC_DIR}


# Work directory defined in the base container
# WORKDIR ${EXEC_DIR}

COPY pyproject.toml ${EXEC_DIR}
COPY VERSION ${EXEC_DIR}
COPY README.md ${EXEC_DIR}
COPY CHANGELOG.md ${EXEC_DIR}

# Install needed packages here
# errors installing rdkit from poetry so using conda
COPY environment.yml ${EXEC_DIR}
RUN mamba env create -f ${EXEC_DIR}/environment.yml
RUN echo "source activate project_env" > ~/.bashrc
# The environment is named project_env (see the activate line above), so its
# bin directory has to lead PATH. Otherwise the pip3 call below installs the
# package outside project_env and it cannot be imported from that environment.
ENV PATH="/opt/conda/envs/project_env/bin:${PATH}"

COPY src ${EXEC_DIR}/src

# Install the plugin package itself from the copied sources.
RUN pip3 install ${EXEC_DIR} --no-cache-dir

# NOTE(review): this file sets no ENTRYPOINT, so "--help" is handed to whatever
# entrypoint the base image defines - confirm this default command is intended.
CMD ["--help"]
19 changes: 19 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# extract_pdbids_drugbank (0.1.0)

Filter the Drugbank database

## Options

This plugin takes 5 input arguments and 4 output arguments:

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| drugbank_xml_file_path | Path to the Drugbank xml file | Input | File | File |
| smiles | List of input SMILES, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
| inchi | List of input InChI strings, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
| inchi_keys | List of input InChIKeys, Type: string[], File type: input, Accepted formats: list[string] | Input | ['null', {'type': 'array', 'items': 'string'}] | ['null', {'type': 'array', 'items': 'string'}] |
| output_txt_path | Path to the text dataset file, Type: string, File type: output, Accepted formats: txt | Input | string | string |
| output_txt_path | Path to the txt file | Output | File | File |
| output_smiles | The Smiles of small molecules | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
| output_pdbids_1D | The PDB IDs of target structures in 1D array | Output | {'type': 'array', 'items': 'string'} | {'type': 'array', 'items': 'string'} |
| output_pdbids_2D | The PDB IDs of target structures in 2D array | Output | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} | {'type': 'array', 'items': {'type': 'array', 'items': 'string'}} |
1 change: 1 addition & 0 deletions utils/extract-pdbids-drugbank-plugin/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
4 changes: 4 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/build-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Build the plugin Docker image, tagged with the version stored in VERSION.

# Stop on any error, on unset variables and on failed pipeline stages, so a
# missing VERSION file aborts the build instead of producing an empty tag.
set -euo pipefail

# Work from the directory that holds this script so that VERSION and the
# build context resolve no matter where the script is invoked from.
cd "$(dirname "${BASH_SOURCE[0]}")"

version=$(<VERSION)
docker build . -t "polusai/extract-pdbids-drugbank-tool:${version}"
10 changes: 10 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/environment.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
---
# Conda environment for the plugin; the name is referenced by the Dockerfile
# and by the CWL baseCommand.
name: project_env
channels:
  - conda-forge
dependencies:
  # NOTE(review): "==" is the exact-match operator in conda specs (a single
  # "=" is the fuzzy form) - confirm an exact 3.10 pin is what is wanted.
  - python==3.10
  # Libraries the Dockerfile comment says could not be installed via poetry.
  - rdkit==2024.03.1
  - defusedxml==0.7.1
  # Test runner and CWL tooling.
  - pytest==8.1.1
  - cwltool==3.1.20240404144621
  - cwl-utils==0.33
188 changes: 188 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/extract_pdbids_drugbank.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#!/usr/bin/env cwl-runner
cwlVersion: v1.0

class: CommandLineTool

label: Filter the Drugbank database

doc: |-
  Filter the Drugbank database

# Runs the plugin module with the python of the project_env conda environment.
baseCommand:
  - conda
  - run
  - "-n"
  - project_env
  - python
  - "-m"
  - polus.mm.utils.extract_pdbids_drugbank

hints:
  DockerRequirement:
    dockerPull: ndonyapour/extract_pdbids_drugbank

requirements:
  # Needed for the ${...} outputEval expressions in the outputs section.
  InlineJavascriptRequirement: {}
  # Enabling InitialWorkDirRequirement will stage the input Drugbank xml file
  InitialWorkDirRequirement:
    listing:
      - $(inputs.drugbank_xml_file_path)

# Command-line inputs. Each entry is passed to the tool with the prefix named
# in its inputBinding.
inputs:
  # Drugbank XML database to filter; defaults to a path relative to this file.
  drugbank_xml_file_path:
    label: Path to the Drugbank xml file
    doc: |-
      Path to the Drugbank xml file
    type: File
    format: edam:format_2332
    inputBinding:
      prefix: --drugbank_xml_file_path
    default:
      class: File
      location: ../../../fda_drug_dataset/drugbank/drugbank_5.1.10.xml

  # Optional list of SMILES strings; defaults to an empty list.
  smiles:
    label: List of input SMILES
    doc: |-
      List of input SMILES
      Type: string[]
      File type: input
      Accepted formats: list[string]
    type: ["null", {"type": "array", "items": "string"}]
    format: edam:format_2330
    inputBinding:
      prefix: --smiles
    default: []

  # Optional list of InChI strings; defaults to an empty list.
  # The label and doc previously repeated the SMILES wording.
  inchi:
    label: List of input InChI strings
    doc: |-
      List of input InChI strings
      Type: string[]
      File type: input
      Accepted formats: list[string]
    type: ["null", {"type": "array", "items": "string"}]
    format:
      - edam:format_2330
    inputBinding:
      prefix: --inchi
    default: []

  # Optional list of InChIKeys; defaults to an empty list.
  # The label and doc previously repeated the SMILES wording.
  inchi_keys:
    label: List of input InChIKeys
    doc: |-
      List of input InChIKeys
      Type: string[]
      File type: input
      Accepted formats: list[string]
    type: ["null", {"type": "array", "items": "string"}]
    format:
      - edam:format_2330
    inputBinding:
      prefix: --inchi_keys
    default: []

  # Name of the text file the tool writes; the outputs glob on this value.
  output_txt_path:
    label: Path to the text dataset file
    doc: |-
      Path to the text dataset file
      Type: string
      File type: output
      Accepted formats: txt
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --output_txt_path
    default: system.log

# All four outputs are derived from the single text file named by the
# output_txt_path input. The three array outputs parse that file with inline
# JavaScript.
# NOTE(review): CWL v1.0 loadContents reads at most the first 64 KiB of the
# file - confirm the text output stays below that limit.
outputs:
  output_txt_path:
    label: Path to the txt file
    doc: |-
      Path to the txt file
    type: File
    outputBinding:
      glob: $(inputs.output_txt_path)
    format: edam:format_2330

  output_smiles:
    label: The Smiles of small molecules
    doc: |-
      The Smiles of small molecules
    type:
      type: array
      items: string
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          // Keep only the rows that hold something other than whitespace.
          var rows = self[0].contents.split("\n").filter(function(row) {
            return row.trim() !== '';
          });
          var smiles = [];
          rows.forEach(function(row) {
            // Row format: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
            // Field 0 is the SMILES string. It is repeated once per PDB ID so
            // that this array lines up with the flat PDB ID array.
            var fields = row.split(",").map(function(field) {return field.trim();});
            fields.slice(1).forEach(function() {
              smiles.push(fields[0]);
            });
          });
          return smiles;
        }

  output_pdbids_1D:
    label: The PDB IDs of target structures in 1D array
    doc: |-
      The PDB IDs of target structures in 1D array
    type:
      type: array
      items: string
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          // Keep only the rows that hold something other than whitespace.
          var rows = self[0].contents.split("\n").filter(function(row) {
            return row.trim() !== '';
          });
          var pdbids = [];
          rows.forEach(function(row) {
            // Row format: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
            // Field 0 is the SMILES string and every later field is a PDB ID.
            var fields = row.split(",").map(function(field) {return field.trim();});
            pdbids = pdbids.concat(fields.slice(1));
          });
          return pdbids;
        }

  output_pdbids_2D:
    label: The PDB IDs of target structures in 2D array
    doc: |-
      The PDB IDs of target structures in 2D array
    type:
      type: array
      items:
        type: array
        items: string
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          // Keep only the rows that hold something other than whitespace.
          var rows = self[0].contents.split("\n").filter(function(row) {
            return row.trim() !== '';
          });
          // One inner array per row, holding the PDB IDs of that row.
          return rows.map(function(row) {
            // Row format: NC1=NC=NN2C1=CC=C2[C@@]1(O[C@H](CO)[C@@H](O)[C@H]1O)C#N,7bf6,7qg7
            // Field 0 is the SMILES string and every later field is a PDB ID.
            var fields = row.split(",").map(function(field) {return field.trim();});
            return fields.slice(1);
          });
        }

# Prefix used by the edam:format_* identifiers in inputs and outputs.
$namespaces:
  edam: https://edamontology.org/

# Ontology document that defines those identifiers.
$schemas:
  - https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
86 changes: 86 additions & 0 deletions utils/extract-pdbids-drugbank-plugin/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Plugin manifest (ict.yml) for extract_pdbids_drugbank: identity and
# authorship metadata.
specVersion: "0.1.0"
name: extract_pdbids_drugbank
version: 0.1.0
# NOTE(review): this name differs from the image tag used by build-docker.sh
# and from the dockerPull value in the CWL file - confirm which one is intended.
container: extract-pdbids-drugbank-plugin
# No value is given for entrypoint.
entrypoint:
title: extract_pdbids_drugbank
description: Filter the Drugbank database
author: Brandon Walker, Nazanin Donyapour
# NOTE(review): the addresses appear redacted in this view; a plain value that
# starts with "[" is read as a YAML flow sequence - confirm the real file
# holds valid, parseable contact text.
contact: [email protected], [email protected]
# The remaining metadata fields are left empty.
repository:
documentation:
citation:

# Plugin inputs, mirroring the inputs of the CWL tool description.
# NOTE(review): smiles, inchi and inchi_keys are marked required although
# their type admits null - confirm the intended meaning of "required" here.
inputs:
  - name: drugbank_xml_file_path
    required: true
    description: Path to the Drugbank xml file
    type: File
    defaultValue: {'class': 'File', 'location': '../../../fda_drug_dataset/drugbank/drugbank_5.1.10.xml'}
    format:
      uri: edam:format_2332
  - name: smiles
    required: true
    description: List of input SMILES, Type string[], File type input, Accepted formats list[string]
    type: ['null', {'type': 'array', 'items': 'string'}]
    format:
      uri: edam:format_2330
  # The description previously repeated the SMILES wording.
  - name: inchi
    required: true
    description: List of input InChI strings, Type string[], File type input, Accepted formats list[string]
    type: ['null', {'type': 'array', 'items': 'string'}]
    format:
      uri: edam:format_2330
  # The description previously repeated the SMILES wording.
  - name: inchi_keys
    required: true
    description: List of input InChIKeys, Type string[], File type input, Accepted formats list[string]
    type: ['null', {'type': 'array', 'items': 'string'}]
    format:
      uri: edam:format_2330
  - name: output_txt_path
    required: true
    description: Path to the text dataset file, Type string, File type output, Accepted formats txt
    type: string
    defaultValue: system.log
    format:
      uri: edam:format_2330
# Plugin outputs: the text file itself plus three arrays derived from it.
outputs:
  - name: output_txt_path
    required: true
    description: Path to the txt file
    type: File
    format:
      uri: edam:format_2330
  # Flat list of SMILES strings.
  - name: output_smiles
    required: true
    description: The Smiles of small molecules
    type:
      type: array
      items: string
  # Flat list of PDB IDs.
  - name: output_pdbids_1D
    required: true
    description: The PDB IDs of target structures in 1D array
    type:
      type: array
      items: string
  # Nested list of PDB IDs (array of string arrays).
  - name: output_pdbids_2D
    required: true
    description: The PDB IDs of target structures in 2D array
    type:
      type: array
      items:
        type: array
        items: string
# UI field definitions, one per plugin input (keyed as inputs.<name>).
ui:
  - key: inputs.drugbank_xml_file_path
    title: "drugbank_xml_file_path: "
    description: "Path to the Drugbank xml file"
    type: File
  - key: inputs.smiles
    title: "smiles: "
    description: "List of input SMILES, Type string[], File type input, Accepted formats list[string]"
    type: ['null', {'type': 'array', 'items': 'string'}]
  # The description previously repeated the SMILES wording.
  - key: inputs.inchi
    title: "inchi: "
    description: "List of input InChI strings, Type string[], File type input, Accepted formats list[string]"
    type: ['null', {'type': 'array', 'items': 'string'}]
  # The description previously repeated the SMILES wording.
  - key: inputs.inchi_keys
    title: "inchi_keys: "
    description: "List of input InChIKeys, Type string[], File type input, Accepted formats list[string]"
    type: ['null', {'type': 'array', 'items': 'string'}]
  - key: inputs.output_txt_path
    title: "output_txt_path: "
    description: "Path to the text dataset file, Type string, File type output, Accepted formats txt"
    type: string
Loading

0 comments on commit 38a1e30

Please sign in to comment.