Skip to content

Commit

Permalink
extract pdbbind refined
Browse files Browse the repository at this point in the history
  • Loading branch information
Brandon Duane Walker authored and Brandon Duane Walker committed May 30, 2024
1 parent 6d5007e commit f1bf40f
Show file tree
Hide file tree
Showing 17 changed files with 729 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# bump2version configuration for this tool.
# Versions look like MAJOR.MINOR.PATCH with an optional "-<release><dev>"
# suffix, e.g. 0.1.0 or 0.1.0-dev3.
[bumpversion]
current_version = 0.1.0
commit = False
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
# Multi-line value: every continuation line must be indented, otherwise the
# INI parser treats it as a (malformed) new option.
serialize =
    {major}.{minor}.{patch}-{release}{dev}
    {major}.{minor}.{patch}

# The release part cycles dev -> _ ; "_" is the optional value, i.e. the one
# that is left out when the short serialization format is used.
[bumpversion:part:release]
optional_value = _
first_value = dev
values =
    dev
    _

[bumpversion:part:dev]

# pyproject.toml uses an explicit pattern so only the project version line
# is rewritten.
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

# No explicit search/replace pattern is given for the remaining files.
[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/pdbbind_refined_v2020/__init__.py]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.venv
out
tests
__pycache__
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
poetry.lock
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# docker build -f Dockerfile -t mrbrandonwalker/pdbbind_refined_v2020_tool .

FROM condaforge/mambaforge

# EXEC_DIR holds the package sources that are pip-installed below.
ENV EXEC_DIR="/opt/executables"
ENV POLUS_LOG="INFO"
RUN mkdir -p ${EXEC_DIR}

# wget is used to fetch the dataset; the apt package lists are removed in the
# same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y wget \
    && rm -rf /var/lib/apt/lists/*

# Download the PDBbind dataset
# RUN wget --no-clobber http://www.pdbbind.org.cn/download/PDBbind_v2020_refined.tar.gz
## update to the new download URL (around 10 times faster) from PDBbind website.
RUN wget --no-clobber https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_refined.tar.gz
RUN tar -xvf PDBbind_v2020_refined.tar.gz

# Work directory defined in the base container

COPY pyproject.toml ${EXEC_DIR}
COPY VERSION ${EXEC_DIR}
COPY README.md ${EXEC_DIR}
COPY CHANGELOG.md ${EXEC_DIR}

# Install needed packages here.
# -y keeps the install non-interactive (a confirmation prompt cannot be
# answered during `docker build`); the caches are dropped in the same layer.
RUN mamba install -y -c conda-forge pandas \
    && mamba clean --all --yes

COPY src ${EXEC_DIR}/src

RUN pip3 install ${EXEC_DIR} --no-cache-dir

# NOTE(review): no ENTRYPOINT is declared in this file, so "--help" is handed
# to whatever entrypoint the base image defines — confirm this is intended.
CMD ["--help"]

WORKDIR /outdir
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# pdbbind_refined_v2020 (0.1.0)

Extract pdbbind_refined_v2020 data

## Options

This plugin takes 7 input arguments

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| output_txt_path | Path to the text dataset file | Input | string | string |
| index_file_name | The index file name | Input | string | string |
| base_dir | The base_dir path | Input | string | string |
| query | query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki, Type: string, File type: input, Accepted formats: txt | Input | string | string |
| min_row | The row min index, Type: int | Input | int | int |
| max_row | The row max index, Type: int | Input | int | int |
| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | boolean |
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""pdbbind_refined_v2020_plugin package."""
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Build the Docker image for this tool, tagged with the version read from
# the VERSION file in the current directory.

# Abort on the first error, on unset variables and on pipeline failures, so a
# missing or unreadable VERSION file cannot lead to a build with an empty tag.
set -euo pipefail

version=$(<VERSION)
# Quote the tag so the version string is passed as a single argument.
docker build . -t "polusai/pdbbind-refined-v2020-tool:${version}"
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#!/usr/bin/env cwl-runner
# CWL description of a command-line tool that runs the
# polus.mm.utils.pdbbind_refined_v2020 Python module.
cwlVersion: v1.0

class: CommandLineTool

label: Download the PDBbind refined database

doc: |-
  Download the PDBbind refined database

# The module is started with `python -m`; input arguments are appended
# according to each input's inputBinding.
baseCommand:
  - "python"
  - "-m"
  - "polus.mm.utils.pdbbind_refined_v2020"

# Container image used to run the tool (a hint, not a hard requirement).
hints:
  DockerRequirement:
    dockerPull: mrbrandonwalker/pdbbind_refined_v2020_tool


# The output bindings of this tool use ${...} JavaScript expressions.
requirements:
  InlineJavascriptRequirement: {}

# Tool inputs. Inputs with an inputBinding are passed on the command line as
# `<prefix> <value>` in the order given by `position`.
inputs:

  index_file_name:
    label: The index file name
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --index_file_name
      position: 1
    default: INDEX_refined_data.2020

  query:
    label: query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
    doc: |-
      query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
      Type: string
      File type: input
      Accepted formats: txt
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --query
      position: 2

  # Name of the text file the tool writes; the output bindings of this tool
  # glob for this same name.
  output_txt_path:
    label: Path to the text dataset file
    doc: |-
      Path to the text dataset file
      Type: string
      File type: output
      Accepted formats: txt
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --output_txt_path
      position: 3
    default: system.log

  min_row:
    label: The row min index
    doc: |-
      The row min index
      Type: int
    type: int?
    format:
      - edam:format_2330
    inputBinding:
      position: 4
      prefix: --min_row

  max_row:
    label: The row max index
    doc: |-
      The row max index
      Type: int
    type: int?
    format:
      - edam:format_2330
    inputBinding:
      position: 5
      prefix: --max_row

  convert_Kd_dG:
    label: If this is set to true, dG will be calculated
    doc: If this is set to true, dG will be calculated
    type: boolean
    format:
      - edam:format_2330
    inputBinding:
      prefix: --convert_Kd_dG
      position: 6
    default: false

  # The two inputs below have no inputBinding, so they are never placed on
  # the command line.
  experimental_dGs:
    label: Experimental Free Energies of Binding
    doc: |-
      Experimental Free Energies of Binding
    type: string?
    format:
      - edam:format_2330

  pdb_ids:
    label: The PDBID of proteins
    doc: |-
      The PDBID of proteins
    type: string?
    format:
      - edam:format_2330

# Tool outputs. Everything except `stdout` is derived from the text file
# named by inputs.output_txt_path: each non-blank line is split on single
# spaces, word 0 is used as the PDB id and word 2 (when present) as the
# experimental dG. Blank lines (e.g. after a trailing newline) are skipped in
# every expression so that all derived arrays stay aligned.
outputs:

  output_txt_path:
    label: Path to the txt file
    doc: |-
      Path to the txt file
    type: File
    outputBinding:
      glob: $(inputs.output_txt_path)
    format: edam:format_2330

  output_pdb_paths:
    label: Path to the input file
    doc: |-
      Path to the input file
      Type: string
      File type: input
      Accepted formats: pdb
    type: File[]
    outputBinding:
      # NOTE: Do NOT just use glob: ./*.pdb !!! This will return an array sorted by filenames.
      # We want the order of output_pdb_paths to match the order of experimental_dGs, etc
      # Because we need to compare experimental ΔGs with predicted values.
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var pdbs = [];
          for (var i = 0; i < lines.length; i++) {
            // A blank line would otherwise yield the bogus file "_protein.pdb".
            var line = lines[i].trim();
            if (line === "") {
              continue;
            }
            var words = line.split(" ");
            var pdbid = words[0];
            var pdbfile = {"class": "File", "path": pdbid + "_protein.pdb"};
            pdbs.push(pdbfile);
          }

          return pdbs;
        }
    format: edam:format_1476

  output_sdf_paths:
    label: Path to the input file
    doc: |-
      Path to the input file
      Type: string
      File type: input
      Accepted formats: sdf
    type: File[]
    outputBinding:
      # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames.
      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
      # Because we need to compare experimental ΔGs with predicted values.
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var sdfs = [];
          for (var i = 0; i < lines.length; i++) {
            // A blank line would otherwise yield the bogus file "_ligand.sdf".
            var line = lines[i].trim();
            if (line === "") {
              continue;
            }
            var words = line.split(" ");
            var pdbid = words[0];
            var sdffile = {"class": "File", "path": pdbid + "_ligand.sdf"};
            sdfs.push(sdffile);
          }

          return sdfs;
        }
    format: edam:format_3814

  experimental_dGs:
    label: Experimental Free Energies of Binding
    doc: |-
      Experimental Free Energies of Binding
    # Either null (no line carried a third word) or an array of floats.
    type:
      - "null"
      - type: array
        items: float
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var experimental_dGs = [];
          for (var i = 0; i < lines.length; i++) {
            var line = lines[i].trim();
            if (line === "") {
              continue;
            }
            var words = line.split(" ");
            // Only lines with at least three words carry a dG value.
            if (words.length > 2) {
              var experimental_dG = parseFloat(words[2]);
              experimental_dGs.push(experimental_dG);
            }
          }

          if (experimental_dGs.length == 0) {
            return null;
          } else {
            return experimental_dGs;
          }
        }

  pdb_ids:
    label: The PDBID of proteins
    doc: |-
      The PDBID of proteins
    type:
      type: array
      items: string
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var pdbids = [];
          for (var i = 0; i < lines.length; i++) {
            // Without this filter split() always yields at least one (possibly
            // empty) element, so the emptiness check below could never fire.
            var line = lines[i].trim();
            if (line === "") {
              continue;
            }
            var words = line.split(" ");
            pdbids.push(words[0]);
          }

          if (pdbids.length == 0) {
            throw new Error("Error! pdbids are empty!");
          } else {
            return pdbids;
          }
        }

  # The file captured from the tool's standard output.
  stdout:
    type: File
    outputBinding:
      glob: stdout

# Redirect the tool's standard output into a file literally named "stdout".
stdout: stdout

# Prefix used by the edam:format_* identifiers in this document.
$namespaces:
  edam: "https://edamontology.org/"

# Ontology that defines those identifiers.
$schemas:
  - "https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl"
42 changes: 42 additions & 0 deletions utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# ICT manifest for the pdbbind_refined_v2020 tool.
# Version-like values are quoted so they are always read as strings.
specVersion: "0.1.0"
name: labshare/pdbbind-refined-v2020
version: "0.1.0"
container: "polusai/pdbbind-refined-v2020-tool:0.1.0"
entrypoint: ""
title: pdbbind_refined_v2020
description: Extracts data from the PDBBind refined dataset
author: Brandon Walker ([email protected]), Nazanin Donyapour ([email protected])
repository: "https://github.com/labshare/mmtools"
documentation: "https://ncats.nih.gov/preclinical/core/informatics"
citation: ""
# Input arguments; only base_dir is required.
inputs:
  - name: output_txt_path
    required: false
    description: Path to the text dataset file
    type: string
    default: /outdir/system.log
  - name: index_file_name
    required: false
    description: The index file name
    type: string
    default: INDEX_refined_data.2020
  - name: base_dir
    required: true
    description: The base_dir path
    type: string
  - name: query
    required: false
    description: "query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki"
    type: string
  - name: min_row
    required: false
    description: The row min index
    type: int
  - name: max_row
    required: false
    description: The row max index
    type: int
  - name: convert_Kd_dG
    required: false
    description: If this is set to true, dG will be calculated
    type: boolean
Loading

0 comments on commit f1bf40f

Please sign in to comment.