Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
extract pdbbind refined
Browse files Browse the repository at this point in the history
Brandon Duane Walker authored and Brandon Duane Walker committed May 2, 2024
1 parent 6d5007e commit 5647a5d
Showing 17 changed files with 729 additions and 0 deletions.
29 changes: 29 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/.bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# bump2version settings for the pdbbind_refined_v2020 plugin.
[bumpversion]
current_version = 0.1.0
# Bumping only rewrites the files listed below; no git commit or tag is made.
commit = False
tag = False
# Accepts MAJOR.MINOR.PATCH with an optional "-<release><dev>" suffix,
# e.g. 0.1.0 or 0.1.0-dev1.
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
serialize =
{major}.{minor}.{patch}-{release}{dev}
{major}.{minor}.{patch}

# Release stage: starts at "dev"; "_" is the optional value, i.e. the stage
# that is left out when the short MAJOR.MINOR.PATCH form is serialized.
[bumpversion:part:release]
optional_value = _
first_value = dev
values =
dev
_

[bumpversion:part:dev]

# pyproject.toml is matched on the full `version = "..."` line so that other
# occurrences of the same number in that file are left alone.
[bumpversion:file:pyproject.toml]
search = version = "{current_version}"
replace = version = "{new_version}"

# The remaining files use the default search (the bare version string).
[bumpversion:file:VERSION]

[bumpversion:file:README.md]

[bumpversion:file:plugin.json]

[bumpversion:file:src/polus/mm/utils/pdbbind_refined_v2020/__init__.py]
# NOTE(review): ict.yml also carries the version (`version:` and the container
# tag) but is not listed here, so it will go stale on a bump. A bare entry
# would also rewrite `specVersion` while it equals the current version, so it
# needs an explicit search/replace pattern -- confirm the intended handling.
4 changes: 4 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/.dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Paths excluded from the Docker build context.
.venv
out
tests
__pycache__
1 change: 1 addition & 0 deletions utils/pdbbind_refined_v2020_plugin/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Keep the Poetry lock file out of version control.
poetry.lock
5 changes: 5 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# CHANGELOG

## 0.1.0

Initial release.
32 changes: 32 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Image for the pdbbind_refined_v2020 plugin: bundles the PDBbind v2020
# refined dataset together with the polus.mm.utils.pdbbind_refined_v2020
# package.
#
# Build (from this directory):
#   docker build -f Dockerfile -t mrbrandonwalker/pdbbind_refined_v2020_tool .

# NOTE(review): the base image tag is not pinned, so rebuilds may pick up a
# different toolchain -- consider pinning a specific tag.
FROM condaforge/mambaforge

ENV EXEC_DIR="/opt/executables"
ENV POLUS_LOG="INFO"
RUN mkdir -p ${EXEC_DIR}

# wget is needed to fetch the dataset. The apt package lists are removed in
# the same layer so they do not end up in the image.
RUN apt-get update \
    && apt-get install -y wget \
    && rm -rf /var/lib/apt/lists/*

# Download the PDBbind dataset.
# Previous location (slow):
#   http://www.pdbbind.org.cn/download/PDBbind_v2020_refined.tar.gz
# The URL below is the newer mirror advertised on the PDBbind website
# (around 10 times faster).
# Download, extract and delete the archive in a single layer; otherwise the
# tarball stays in the image even though only the extracted files are used.
# The archive is extracted into the base image's working directory, as before.
RUN wget --no-clobber https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_refined.tar.gz \
    && tar -xf PDBbind_v2020_refined.tar.gz \
    && rm PDBbind_v2020_refined.tar.gz

# Package metadata required by `pip install` below.
COPY pyproject.toml ${EXEC_DIR}
COPY VERSION ${EXEC_DIR}
COPY README.md ${EXEC_DIR}
COPY CHANGELOG.md ${EXEC_DIR}

# Install needed packages here.
# -y makes the install explicitly non-interactive; the package cache is
# cleaned in the same layer to keep the image small.
RUN mamba install -y -c conda-forge pandas \
    && mamba clean --all --yes

COPY src ${EXEC_DIR}/src

RUN pip3 install ${EXEC_DIR} --no-cache-dir

# Directory the tool runs in; relative output paths are written here.
WORKDIR /outdir

# No ENTRYPOINT is defined (callers such as the CWL tool pass the complete
# command), so the default command must itself be a full command line. A bare
# ["--help"] would make `docker run <image>` try to execute "--help".
CMD ["python", "-m", "polus.mm.utils.pdbbind_refined_v2020", "--help"]
16 changes: 16 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# pdbbind_refined_v2020 (0.1.0)

Extract pdbbind_refined_v2020 data

## Options

This plugin takes 7 input arguments

| Name | Description | I/O | Type | Default |
|---------------|-------------------------|--------|--------|---------|
| output_txt_path | Path to the text dataset file | Input | string | /outdir/system.log |
| index_file_name | The index file name | Input | string | INDEX_refined_data.2020 |
| base_dir | The base_dir path | Input | string | |
| query | Query string used to search the dataset. Pandas query doesn't support a slash (/) in column names; please use Kd_Ki instead of Kd/Ki. Type: string, File type: input, Accepted formats: txt | Input | string | |
| min_row | The row min index, Type: int | Input | int | |
| max_row | The row max index, Type: int | Input | int | |
| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | false |
1 change: 1 addition & 0 deletions utils/pdbbind_refined_v2020_plugin/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.1.0
1 change: 1 addition & 0 deletions utils/pdbbind_refined_v2020_plugin/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""pdbbind_refined_v2020_plugin package."""
4 changes: 4 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/build-docker.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Build the plugin's Docker image, tagged with the version stored in VERSION.

# Stop on the first error (e.g. a missing VERSION file) instead of building an
# image with an empty tag.
set -euo pipefail

# VERSION and the build context live next to this script; change into that
# directory so the script also works when invoked from somewhere else.
cd "$(dirname "${BASH_SOURCE[0]}")"

version=$(<VERSION)
docker build . -t "polusai/pdbbind-refined-v2020-plugin:${version}"
246 changes: 246 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/extract_pdbbind_refined.cwl
Original file line number Diff line number Diff line change
@@ -0,0 +1,246 @@
#!/usr/bin/env cwl-runner
# CWL wrapper around the polus.mm.utils.pdbbind_refined_v2020 Python module.
cwlVersion: v1.0

class: CommandLineTool

label: Download the PDBbind refined database

doc: |-
  Download the PDBbind refined database

# The tool is started as a Python module inside the container.
baseCommand:
  - 'python'
  - '-m'
  - 'polus.mm.utils.pdbbind_refined_v2020'

hints:
  DockerRequirement:
    dockerPull: mrbrandonwalker/pdbbind_refined_v2020_tool


# Needed for the ${...} outputEval expressions used by the outputs.
requirements:
  InlineJavascriptRequirement: {}

# Command-line inputs of the tool. `position` fixes the argument order and
# `prefix` is the flag placed in front of each value.
# NOTE(review): the CWL specification only defines `format` for File-typed
# parameters; it is kept on the scalar inputs below because other tooling may
# rely on it -- confirm before removing.
# NOTE(review): ict.yml declares a required `base_dir` input that has no
# counterpart here -- confirm whether the module needs it.
inputs:

  index_file_name:
    label: The index file name
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --index_file_name
      position: 1
    default: INDEX_refined_data.2020

  query:
    label: query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
    doc: |-
      query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
      Type: string
      File type: input
      Accepted formats: txt
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --query
      position: 2

  # Name of the text file the tool writes; every output of this tool is
  # derived from that file (see the globs in `outputs`).
  output_txt_path:
    label: Path to the text dataset file
    doc: |-
      Path to the text dataset file
      Type: string
      File type: output
      Accepted formats: txt
    type: string
    format:
      - edam:format_2330
    inputBinding:
      prefix: --output_txt_path
      position: 3
    default: system.log

  min_row:
    label: The row min index
    doc: |-
      The row min index
      Type: int
    type: int?
    format:
      - edam:format_2330
    inputBinding:
      position: 4
      prefix: --min_row

  max_row:
    label: The row max index
    doc: |-
      The row max index
      Type: int
    type: int?
    format:
      - edam:format_2330
    inputBinding:
      position: 5
      prefix: --max_row

  convert_Kd_dG:
    label: If this is set to true, dG will be calculated
    doc: If this is set to true, dG will be calculated
    type: boolean
    format:
      - edam:format_2330
    inputBinding:
      prefix: --convert_Kd_dG
      position: 6
    default: false

  # The two inputs below have no inputBinding, so they are never placed on
  # the command line.
  # NOTE(review): presumably placeholders that mirror the outputs of the same
  # name for workflow tooling -- confirm.
  experimental_dGs:
    label: Experimental Free Energies of Binding
    doc: |-
      Experimental Free Energies of Binding
    type: string?
    format:
      - edam:format_2330

  pdb_ids:
    label: The PDBID of proteins
    doc: |-
      The PDBID of proteins
    type: string?
    format:
      - edam:format_2330

# All outputs except `stdout` are derived from the text file named by
# inputs.output_txt_path. Each non-blank line is split on single spaces:
# word 0 is the PDB id and word 2 (when present) is the experimental dG.
# Blank lines (for example the empty string left by a trailing newline) are
# skipped; otherwise they would yield an empty PDB id and references to the
# non-existent files "_protein.pdb" / "_ligand.sdf".
# NOTE: loadContents reads at most the first 64 KiB of the file, so a very
# large table would be truncated.
outputs:

  output_txt_path:
    label: Path to the txt file
    doc: |-
      Path to the txt file
    type: File
    outputBinding:
      glob: $(inputs.output_txt_path)
    format: edam:format_2330

  output_pdb_paths:
    label: Paths to the protein PDB files
    doc: |-
      Paths to the protein PDB files (<pdbid>_protein.pdb), in the order in
      which the PDB ids appear in the txt file
      Type: File[]
      File type: output
      Accepted formats: pdb
    type: File[]
    outputBinding:
      # NOTE: Do NOT just use glob: ./*.pdb !!! This will return an array sorted by filenames.
      # We want the order of output_pdb_paths to match the order of experimental_dGs, etc
      # Because we need to compare experimental ΔGs with predicted values.
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var pdbs = [];
          for (var i = 0; i < lines.length; i++) {
            var line = lines[i].trim();
            if (line.length === 0) {
              continue; // blank line, e.g. after a trailing newline
            }
            var words = line.split(" ");
            var pdbid = words[0];
            var pdbfile = {"class": "File", "path": pdbid + "_protein.pdb"};
            pdbs.push(pdbfile);
          }

          return pdbs;
        }
    format: edam:format_1476

  output_sdf_paths:
    label: Paths to the ligand SDF files
    doc: |-
      Paths to the ligand SDF files (<pdbid>_ligand.sdf), in the order in
      which the PDB ids appear in the txt file
      Type: File[]
      File type: output
      Accepted formats: sdf
    type: File[]
    outputBinding:
      # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames.
      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
      # Because we need to compare experimental ΔGs with predicted values.
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var sdfs = [];
          for (var i = 0; i < lines.length; i++) {
            var line = lines[i].trim();
            if (line.length === 0) {
              continue; // blank line, e.g. after a trailing newline
            }
            var words = line.split(" ");
            var pdbid = words[0];
            var sdffile = {"class": "File", "path": pdbid + "_ligand.sdf"};
            sdfs.push(sdffile);
          }

          return sdfs;
        }
    format: edam:format_3814

  experimental_dGs:
    label: Experimental Free Energies of Binding
    doc: |-
      Experimental Free Energies of Binding
    # null is returned when no line carries a third column.
    type:
      - "null"
      - type: array
        items: float
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var experimental_dGs = [];
          for (var i = 0; i < lines.length; i++) {
            var words = lines[i].trim().split(" ");
            if (words.length > 2) {
              var experimental_dG = parseFloat(words[2]);
              experimental_dGs.push(experimental_dG);
            }
          }

          if (experimental_dGs.length == 0) {
            return null;
          } else {
            return experimental_dGs;
          }
        }

  pdb_ids:
    label: The PDBID of proteins
    doc: |-
      The PDBID of proteins
    type:
      type: array
      items: string
    outputBinding:
      glob: $(inputs.output_txt_path)
      loadContents: true
      outputEval: |
        ${
          var lines = self[0].contents.split("\n");
          var pdbids = [];
          for (var i = 0; i < lines.length; i++) {
            var line = lines[i].trim();
            if (line.length === 0) {
              continue; // blank line, e.g. after a trailing newline
            }
            var words = line.split(" ");
            pdbids.push(words[0]);
          }

          // Reachable now that blank lines are skipped: an empty file no
          // longer produces a single empty id.
          if (pdbids.length == 0) {
            throw new Error("Error! pdbids are empty!");
          } else {
            return pdbids;
          }
        }

  stdout:
    type: File
    outputBinding:
      glob: stdout

# Standard output of the tool is redirected into a file named "stdout".
stdout: 'stdout'

# Prefix used by the edam:format_* identifiers in this document.
$namespaces:
  edam: 'https://edamontology.org/'

$schemas:
  - 'https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl'
42 changes: 42 additions & 0 deletions utils/pdbbind_refined_v2020_plugin/ict.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# ICT description of the pdbbind_refined_v2020 plugin.
# Version-like values are quoted so they are always read as strings.
specVersion: '0.1.0'
name: labshare/pdbbind-refined-v2020
version: '0.1.0'
container: 'polusai/pdbbind-refined-v2020-plugin:0.1.0'
entrypoint: ''
title: pdbbind_refined_v2020
description: Extracts data from the PDBBind refined dataset
author: Data Scientist ([email protected])
repository: https://github.com/labshare/mmtools
documentation: https://ncats.nih.gov/preclinical/core/informatics
citation: ''
inputs:
  - name: output_txt_path
    required: false
    description: Path to the text dataset file
    type: string
    default: /outdir/system.log
  - name: index_file_name
    required: false
    description: The index file name
    type: string
    default: INDEX_refined_data.2020
  - name: base_dir
    required: true
    description: The base_dir path
    type: string
  - name: query
    required: false
    # Folded scalar: the two lines below are joined by a single space.
    description: >-
      query str to search the dataset, Pandas query doesn't support slash(/)
      in column names please use Kd_Ki instead of Kd/Ki
    type: string
  - name: min_row
    required: false
    description: The row min index
    type: int
  - name: max_row
    required: false
    description: The row max index
    type: int
  - name: convert_Kd_dG
    required: false
    description: If this is set to true, dG will be calculated
    type: boolean
Loading

0 comments on commit 5647a5d

Please sign in to comment.