From f1bf40f8bf96577be7e11ef44c82277e82514193 Mon Sep 17 00:00:00 2001
From: Brandon Duane Walker <walkerbd@UPDATEME.HOSTNAME.COM>
Date: Tue, 12 Mar 2024 15:33:35 -0400
Subject: [PATCH] extract pdbbind refined

---
 .../.bumpversion.cfg                          |  29 +++
 .../pdbbind_refined_v2020_tool/.dockerignore  |   4 +
 .../pdbbind_refined_v2020_tool/.gitignore     |   1 +
 .../pdbbind_refined_v2020_tool/CHANGELOG.md   |   5 +
 .../pdbbind_refined_v2020_tool/Dockerfile     |  32 +++
 .../pdbbind_refined_v2020_tool/README.md      |  16 ++
 .../pdbbind_refined_v2020_tool/VERSION        |   1 +
 .../pdbbind_refined_v2020_tool/__init__.py    |   1 +
 .../build-docker.sh                           |   4 +
 .../extract_pdbbind_refined.cwl               | 246 ++++++++++++++++++
 .../pdbbind_refined_v2020_tool/ict.yml        |  42 +++
 .../pdbbind_refined_v2020_tool/pyproject.toml |  32 +++
 .../utils/pdbbind_refined_v2020/__init__.py   |   7 +
 .../utils/pdbbind_refined_v2020/__main__.py   |  78 ++++++
 .../pdbbind_refined_v2020.py                  | 174 +++++++++++++
 .../tests/__init__.py                         |   1 +
 .../tests/test_pdbbind_refined_v2020.py       |  56 ++++
 17 files changed, 729 insertions(+)
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py
 create mode 100755 utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/pdbbind_refined_v2020.py
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/__init__.py
 create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/test_pdbbind_refined_v2020.py
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg
new file mode 100644
index 00000000..5d6e5cf5
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg
@@ -0,0 +1,29 @@
+[bumpversion]
+current_version = 0.1.0
+commit = False
+tag = False
+parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))?
+serialize = 
+	{major}.{minor}.{patch}-{release}{dev}
+	{major}.{minor}.{patch}
+
+[bumpversion:part:release]
+optional_value = _
+first_value = dev
+values = 
+	dev
+	_
+
+[bumpversion:part:dev]
+
+[bumpversion:file:pyproject.toml]
+search = version = "{current_version}"
+replace = version = "{new_version}"
+
+[bumpversion:file:VERSION]
+
+[bumpversion:file:README.md]
+
+[bumpversion:file:plugin.json]
+
+[bumpversion:file:src/polus/mm/utils/pdbbind_refined_v2020/__init__.py]
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore
new file mode 100644
index 00000000..7c603f81
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore
@@ -0,0 +1,4 @@
+.venv
+out
+tests
+__pycache__
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore
new file mode 100644
index 00000000..c04bc49f
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore
@@ -0,0 +1 @@
+poetry.lock
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md
new file mode 100644
index 00000000..b67793f7
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md
@@ -0,0 +1,5 @@
+# CHANGELOG
+
+## 0.1.0
+
+Initial release.
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile
new file mode 100644
index 00000000..82d9b958
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile
@@ -0,0 +1,32 @@
+# docker build -f Dockerfile -t mrbrandonwalker/pdbbind_refined_v2020_tool .
+
+FROM condaforge/mambaforge
+
+ENV EXEC_DIR="/opt/executables"
+ENV POLUS_LOG="INFO"
+RUN mkdir -p ${EXEC_DIR}
+
+RUN apt-get update && apt-get install -y wget
+# Download the PDBbind dataset
+# RUN wget --no-clobber http://www.pdbbind.org.cn/download/PDBbind_v2020_refined.tar.gz
+## update to the new download URL (around 10 times faster) from PDBbind website.
+RUN wget --no-clobber https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_refined.tar.gz
+RUN tar -xvf PDBbind_v2020_refined.tar.gz
+
+# Work directory defined in the base container
+
+COPY pyproject.toml ${EXEC_DIR}
+COPY VERSION ${EXEC_DIR}
+COPY README.md ${EXEC_DIR}
+COPY CHANGELOG.md ${EXEC_DIR}
+
+# Install needed packages here
+RUN mamba install -c conda-forge pandas
+
+COPY src ${EXEC_DIR}/src
+
+RUN pip3 install ${EXEC_DIR} --no-cache-dir
+
+CMD ["--help"]
+
+WORKDIR /outdir
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md
new file mode 100644
index 00000000..51e39081
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md
@@ -0,0 +1,16 @@
+# pdbbind_refined_v2020 (0.1.0)
+
+Extract pdbbind_refined_v2020 data
+
+## Options
+
+This plugin takes 6 input arguments
+
+| Name          | Description             | I/O    | Type   | Default |
+|---------------|-------------------------|--------|--------|---------|
+| index_file_name |  | Input | string | string |
+| base_dir |  | Input | string | string |
+| query | query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki, Type: string, File type: input, Accepted formats: txt | Input | string | string |
+| min_row | The row min index, Type: int | Input | int | int |
+| max_row | The row max index, Type: int | Input | int | int |
+| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | boolean |
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION
new file mode 100644
index 00000000..6e8bf73a
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION
@@ -0,0 +1 @@
+0.1.0
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py
new file mode 100644
index 00000000..187d0cd9
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py
@@ -0,0 +1 @@
+"""pdbbind_refined_v2020_plugin package."""
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh
new file mode 100755
index 00000000..cd23f2cb
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+version=$(<VERSION)
+docker build . -t polusai/pdbbind-refined-v2020-tool:${version}
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl
new file mode 100644
index 00000000..04639bd9
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl
@@ -0,0 +1,246 @@
+#!/usr/bin/env cwl-runner
+cwlVersion: v1.0
+
+class: CommandLineTool
+
+label: Download the PDBbind refined database
+
+doc: |-
+  Download the PDBbind refined database
+
+baseCommand: ["python", "-m", "polus.mm.utils.pdbbind_refined_v2020"]
+
+hints:
+  DockerRequirement:
+    dockerPull: mrbrandonwalker/pdbbind_refined_v2020_tool
+
+
+requirements:
+  InlineJavascriptRequirement: {}
+
+inputs:
+
+  index_file_name:
+    label: The index file name
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --index_file_name
+      position: 1
+    default: INDEX_refined_data.2020
+
+  query:
+    label: query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
+    doc: |-
+      query str to search the dataset. Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki
+      Type: string
+      File type: input
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --query
+      position: 2
+
+  output_txt_path:
+    label: Path to the text dataset file
+    doc: |-
+      Path to the text dataset file
+      Type: string
+      File type: output
+      Accepted formats: txt
+    type: string
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --output_txt_path
+      position: 3
+    default: system.log
+
+  min_row:
+    label: The row min index
+    doc: |-
+      The row min index
+      Type: int
+    type: int?
+    format:
+    - edam:format_2330
+    inputBinding:
+      position: 4
+      prefix: --min_row
+
+  max_row:
+    label: The row max index
+    doc: |-
+      The row max index
+      Type: int
+    type: int?
+    format:
+    - edam:format_2330
+    inputBinding:
+      position: 5
+      prefix: --max_row
+
+  convert_Kd_dG:
+    label: If this is set to true, dG will be calculated
+    doc: If this is set to true, dG will be calculated
+    type: boolean
+    format:
+    - edam:format_2330
+    inputBinding:
+      prefix: --convert_Kd_dG
+      position: 6
+    default: False
+
+  experimental_dGs:
+    label: Experimental Free Energies of Binding
+    doc: |-
+      Experimental Free Energies of Binding
+    type: string?
+    format:
+    - edam:format_2330
+
+  pdb_ids:
+    label: The PDBID of proteins
+    doc: |-
+      The PDBID of proteins
+    type: string?
+    format:
+    - edam:format_2330
+
+outputs:
+
+  output_txt_path:
+    label: Path to the txt file
+    doc: |-
+      Path to the txt file
+    type: File
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+    format: edam:format_2330
+
+  output_pdb_paths:
+    label: Paths to the extracted protein PDB files
+    doc: |-
+      Paths to the extracted protein PDB files
+      Type: File[]
+      File type: output
+      Accepted formats: pdb
+    type: File[]
+    outputBinding:
+      # NOTE: Do NOT just use glob: ./*.pdb !!! This will return an array sorted by filenames.
+      # We want the order of output_pdb_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var pdbs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var words = lines[i].split(" ");
+            var pdbid = words[0];
+            var pdbfile = {"class": "File", "path": pdbid + "_protein.pdb"};
+            pdbs.push(pdbfile);
+            }
+
+          return pdbs;
+        }
+    format: edam:format_1476
+
+  output_sdf_paths:
+    label: Paths to the extracted ligand SDF files
+    doc: |-
+      Paths to the extracted ligand SDF files
+      Type: File[]
+      File type: output
+      Accepted formats: sdf
+    type: File[]
+    outputBinding:
+      # NOTE: Do NOT just use glob: ./*.sdf !!! This will return an array sorted by filenames.
+      # We want the order of output_sdf_paths to match the order of experimental_dGs, etc
+      # Because we need to compare experimental ΔGs with predicted values.
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var sdfs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var words = lines[i].split(" ");
+            var pdbid = words[0];
+            var sdffile = {"class": "File", "path": pdbid + "_ligand.sdf"};
+            sdfs.push(sdffile);
+            }
+
+          return sdfs;
+        }
+    format: edam:format_3814
+
+  experimental_dGs:
+    label: Experimental Free Energies of Binding
+    doc: |-
+      Experimental Free Energies of Binding
+    type: ["null", {"type": "array", "items": "float"}]
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var experimental_dGs = [];
+          for (var i = 0; i < lines.length; i++) {
+            var words = lines[i].split(" ");
+            if (words.length > 3) {  // 4 words (pdb value dG type) only with --convert_Kd_dG
+              var experimental_dG = parseFloat(words[2]);
+              experimental_dGs.push(experimental_dG);
+            }
+          }
+
+          if (experimental_dGs.length == 0) {
+            return null;
+          } else {
+            return experimental_dGs;
+          }
+        }
+
+  pdb_ids:
+    label: The PDBID of proteins
+    doc: |-
+      The PDBID of proteins
+    type:
+      type: array
+      items: string
+    outputBinding:
+      glob: $(inputs.output_txt_path)
+      loadContents: true
+      outputEval: |
+        ${
+          var lines = self[0].contents.split("\n");
+          var pdbids = [];
+          for (var i = 0; i < lines.length; i++) {
+            var words = lines[i].split(" ");
+            pdbids.push(words[0]);
+            }
+
+          if (pdbids.length == 0) {
+            throw new Error("Error! pdbids are empty!");
+          } else {
+            return pdbids;
+          }
+        }
+
+  stdout:
+    type: File
+    outputBinding:
+      glob: stdout
+
+stdout: stdout
+
+$namespaces:
+  edam: https://edamontology.org/
+
+$schemas:
+- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml
new file mode 100644
index 00000000..feb05f30
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml
@@ -0,0 +1,42 @@
+specVersion: 0.1.0
+name: labshare/pdbbind-refined-v2020
+version: 0.1.0
+container: polusai/pdbbind-refined-v2020-tool:0.1.0
+entrypoint: ""
+title: pdbbind_refined_v2020
+description: Extracts data from the PDBBind refined dataset
+author: Brandon Walker (brandon.walker@axleinfo.com), Nazanin Donyapour (nazanin.donyapour@nih.gov)
+repository: https://github.com/labshare/mmtools
+documentation: https://ncats.nih.gov/preclinical/core/informatics
+citation: ""
+inputs:
+- name: output_txt_path
+  required: false
+  description: Path to the text dataset file
+  type: string
+  default: /outdir/system.log
+- name: index_file_name
+  required: false
+  description: The index file name
+  type: string
+  default: INDEX_refined_data.2020
+- name: base_dir
+  required: true
+  description: The base_dir path
+  type: string
+- name: query
+  required: false
+  description: "query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki"
+  type: string
+- name: min_row
+  required: false
+  description: The row min index
+  type: int
+- name: max_row
+  required: false
+  description: The row max index
+  type: int
+- name: convert_Kd_dG
+  required: false
+  description: If this is set to true, dG will be calculated
+  type: boolean
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml
new file mode 100644
index 00000000..c8732d8b
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml
@@ -0,0 +1,32 @@
+[tool.poetry]
+name = "polus-mm-utils-pdbbind-refined-v2020"
+version = "0.1.0"
+description = "An awesome function."
+authors = ["Data Scientist <data.scientist@labshare.org>"]
+readme = "README.md"
+packages = [{include = "polus", from = "src"}]
+
+[tool.poetry.dependencies]
+python = ">=3.9,<3.12"
+typer = "^0.7.0"
+pandas = "^1.3.3"
+cwl-utils = "0.33"
+cwltool = "3.1.20240404144621"
+
+[tool.poetry.group.dev.dependencies]
+bump2version = "^1.0.1"
+pytest = "^7.4"
+pytest-sugar = "^0.9.6"
+pre-commit = "^3.2.1"
+black = "^23.3.0"
+mypy = "^1.1.1"
+ruff = "^0.0.270"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+pythonpath = [
+  "."
+]
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py
new file mode 100644
index 00000000..d5992f46
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py
@@ -0,0 +1,7 @@
+"""pdbbind_refined_v2020."""
+
+__version__ = "0.1.0"
+
+from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import (  # noqa # pylint: disable=unused-import
+    pdbbind_refined_v2020,
+)
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py
new file mode 100644
index 00000000..5ff8d576
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py
@@ -0,0 +1,78 @@
+"""Package entrypoint for the pdbbind_refined_v2020 package."""
+
+# Base packages
+import logging
+from os import environ
+from pathlib import Path
+
+import typer
+from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import (
+    pdbbind_refined_v2020,
+)
+
+logging.basicConfig(
+    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
+    datefmt="%d-%b-%y %H:%M:%S",
+)
+POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))
+logger = logging.getLogger("polus.mm.utils.pdbbind_refined_v2020.")
+logger.setLevel(POLUS_LOG)
+
+app = typer.Typer(help="pdbbind_refined_v2020.")
+
+
+@app.command()
+def main(  # noqa: PLR0913
+    output_txt_path: str = typer.Option(
+        "system.log",
+        "--output_txt_path",
+        help="Path to the text dataset file",
+    ),
+    index_file_name: str = typer.Option(
+        "INDEX_refined_data.2020",
+        "--index_file_name",
+        help="",
+    ),
+    query: str = typer.Option(
+        ...,
+        "--query",
+        help="query str to search the dataset.",
+    ),
+    min_row: int = typer.Option(
+        1,
+        "--min_row",
+        help="The row min inex, Type: int",
+    ),
+    max_row: int = typer.Option(
+        -1,
+        "--max_row",
+        help="The row max inex, Type: int",
+    ),
+    convert_kd_dg: bool = typer.Option(
+        False,
+        "--convert_Kd_dG",
+        help="If this is set to true, dG will be calculated",
+    ),
+) -> None:
+    """pdbbind_refined_v2020."""
+    logger.info(f"output_txt_path: {output_txt_path}")
+    logger.info(f"index_file_name: {index_file_name}")
+    logger.info(f"query: {query}")
+    logger.info(f"min_row: {min_row}")
+    logger.info(f"max_row: {max_row}")
+    logger.info(f"convert_Kd_dG: {convert_kd_dg}")
+    # The dataset path is hard-coded; there is no CLI option for it.
+    base_dir = str(Path("/refined-set"))
+    pdbbind_refined_v2020(
+        output_txt_path,
+        index_file_name,
+        base_dir,
+        query,
+        min_row,
+        max_row,
+        convert_kd_dg,
+    )
+
+
+if __name__ == "__main__":
+    app()
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/pdbbind_refined_v2020.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/pdbbind_refined_v2020.py
new file mode 100644
index 00000000..cbcc59e4
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/pdbbind_refined_v2020.py
@@ -0,0 +1,174 @@
+"""extract pdbbind_refined_v2020."""
+import math
+import re
+import subprocess
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+import pandas as pd
+
+
+def pdbbind_refined_v2020(  # noqa: PLR0913
+    output_txt_path: str,
+    index_file_name: str,
+    base_dir: str,
+    query: str,
+    min_row: int,
+    max_row: int,
+    convert_kd_dg: bool,
+) -> None:
+    """Extract binding data and structure files from PDBbind refined v2020.
+
+    Args:
+        output_txt_path: Path to the text dataset file
+        index_file_name: The PDBbind index file name
+        base_dir: The base directory of the dataset
+        query: query str to search the dataset.
+        min_row: The row min index
+        max_row: The row max index
+        convert_kd_dg: If this is set to true, dG will be calculated
+    Returns:
+        None
+    """
+    load_data(
+        index_file_name=index_file_name,
+        base_dir=base_dir,
+        query=query,
+        output_txt_path=output_txt_path,
+        min_row=min_row,
+        max_row=max_row,
+        convert_kd_dg=convert_kd_dg,
+    )
+
+
+def calculate_dg(kd: float) -> float:
+    """Calculate the binding free energy (kcal/mol) from a dissociation constant.
+
+    Args:
+        kd (float): Dissociation constant of the complex in mol/L (must be > 0)
+
+    Returns:
+        float: dG = RT * ln(kd / c0) in kcal/mol, using T = 298 K and c0 = 1 mol/L
+    """
+    # Calculate the binding free energy from kd so we can make the correlation plots.
+    # See https://en.wikipedia.org/wiki/Binding_constant
+    ideal_gas_constant = 8.31446261815324  # J/(mol*K)
+    joules_per_kcal = 4184  # 1 kcal = 4184 J, so R / 4184 is in kcal/(mol*K)
+    # NOTE: Unfortunately, the temperature at which
+    # experimental kd binding data was taken
+    # is often not recorded. Thus, we are forced to guess. The two standard guesses are
+    # physiological body temperature (310K) or room temperature (298K).
+    temperature = 298
+    rt = (ideal_gas_constant / joules_per_kcal) * temperature
+    # NOTE: For performance, simulations are often done in a very small unit cell, and
+    # thus at a very high concentration. The size of the unit cell bounds the volume.
+    # For shorter simulations where the ligand has not explored the entire box, it may
+    # be less. See the Yank paper for a method of calculating the correct volumes.
+    standard_concentration = 1  # Units of mol / L, but see comment above.
+    return rt * math.log(kd / standard_concentration)
+
+
+def read_index_file(index_file_path: str) -> pd.DataFrame:
+    """Read the PDBbind index file and extract the Kd binding data.
+
+    Args:
+        index_file_path (str): The path to the index file
+
+    Returns:
+        pd.DataFrame: One row per "Kd=" entry, with "value" in micromolar
+    """
+    data: dict[str, Any] = defaultdict(list)
+    # The file format
+    # PDB code, resolution, release year, -logkd/Ki, kd/Ki, reference, ligand name
+    # Factors to micromolar; "fM" added so femtomolar entries do not raise KeyError.
+    unit_conv = {"uM": 1, "mM": 1000.0, "nM": 0.001, "pM": 0.000001, "fM": 1e-9}
+    number = r"[-+]?(?:\d*\.\d+|\d+)"
+    with Path(index_file_path).open(encoding="utf-8") as rfile:
+        for line in rfile:
+            if line[0] == "#" or "Kd=" not in line:
+                continue
+            words = line.split()
+            data["PDB_code"].append(words[0])
+            data["resolution"].append(words[1])
+            data["release_year"].append(words[2])
+            # words[4] looks like "Kd=10mM": split it into type, value and unit.
+            parts = re.split("=" + number, words[4])
+            data["Kd_Ki"].append(parts[0])
+            kd = float(re.findall(number, words[4])[0])
+            data["value"].append(kd * unit_conv[parts[1]])
+            data["ligand_name"].append(re.findall(r"\((.*?)\)", words[7])[0])
+
+    return pd.DataFrame.from_dict(data)
+
+
+# pylint: disable=too-many-arguments,too-many-locals
+
+
+def load_data(  # noqa: PLR0913
+    index_file_name: str,
+    base_dir: str,
+    query: str,
+    output_txt_path: str,
+    min_row: int = 1,
+    max_row: int = -1,
+    convert_kd_dg: bool = False,
+) -> None:
+    """Filter the Kd data with a query and export the matching complexes.
+
+    Args:
+        index_file_name (str): The PDBbind index file name
+        base_dir (str): The base directory of the dataset
+        query (str): The pandas query to perform
+        output_txt_path (str): The output text file
+        min_row (int, optional): 1-based first row to keep. Defaults to 1.
+        max_row (int, optional): Last row to keep (inclusive); -1 keeps all
+            remaining rows. Defaults to -1.
+        convert_kd_dg (bool, optional): If True, dG is also written. Defaults to False.
+    """
+    index_file_path = Path(base_dir).joinpath("index", index_file_name)
+    df = read_index_file(str(index_file_path))
+    # perform query
+    df = df.query(query)
+
+    # Perform row slicing (if any). Rows are 1-based and max_row is inclusive,
+    # so only the lower bound is shifted to a zero-based index.
+    if int(min_row) != 1 or int(max_row) != -1:
+        # max_row == -1 means "no upper bound"; a literal -1 slice stop
+        # would silently drop the last row.
+        stop = None if int(max_row) == -1 else int(max_row)
+        df = df.iloc[(int(min_row) - 1) : stop]
+
+    # Write one line per complex; the dG column is added only on request.
+    df = df[["PDB_code", "value", "Kd_Ki"]]
+    binding_data: list[str] = []
+    micromolar = 0.000001  # mol/L per uM ("value" is stored in micromolar)
+    for pdbcode, value, kd_ki in df.values:
+        # Convert to mol/L, the unit calculate_dg expects.
+        binding_datum = value * micromolar
+
+        if convert_kd_dg:
+            dg = calculate_dg(binding_datum)
+            binding_data.append(f"{pdbcode} {binding_datum} {dg} {kd_ki}")
+        else:
+            binding_data.append(f"{pdbcode} {binding_datum} {kd_ki}")
+
+    with Path(output_txt_path).open(mode="w", encoding="utf-8") as f:
+        f.write("\n".join(binding_data))
+
+    # copy pdb and sdf files
+    for _, row in df.iterrows():
+        pdbcode = row["PDB_code"]
+        source_pdb_path = Path(base_dir).joinpath(pdbcode, f"{pdbcode}_protein.pdb")
+        dist_pdb_path = f"{pdbcode}_protein.pdb"
+        subprocess.run(
+            ["cp", f"{source_pdb_path}", f"{dist_pdb_path}"],  # noqa: S603, S607
+            check=True,
+        )
+        source_sdf_path = Path(base_dir).joinpath(pdbcode, f"{pdbcode}_ligand.sdf")
+
+        dist_sdf_path = f"{pdbcode}_ligand.sdf"
+        subprocess.run(
+            ["cp", f"{source_sdf_path}", f"{dist_sdf_path}"],  # noqa: S603, S607
+            check=True,
+        )
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/__init__.py
new file mode 100644
index 00000000..e0368608
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/__init__.py
@@ -0,0 +1 @@
+"""Tests for pdbbind_refined_v2020."""
diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/test_pdbbind_refined_v2020.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/test_pdbbind_refined_v2020.py
new file mode 100644
index 00000000..b5d3b623
--- /dev/null
+++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/test_pdbbind_refined_v2020.py
@@ -0,0 +1,56 @@
+"""Tests for pdbbind_refined_v2020."""
+import sys
+from pathlib import Path
+
+sys.path.append("src")
+from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import (  # noqa: E402
+    pdbbind_refined_v2020,
+)
+
+current_dir = Path(__file__).resolve().parent
+target_dir = current_dir.parent.parent.parent.parent.parent / "cwl_utils"
+sys.path.append(str(target_dir))
+
+from cwl_utilities import call_cwltool  # noqa: E402
+from cwl_utilities import create_input_yaml  # noqa: E402
+from cwl_utilities import parse_cwl_arguments  # noqa: E402
+
+
+def test_pdbbind_refined_v2020() -> None:
+    """Test pdbbind_refined_v2020."""
+    output_txt_path = "output.txt"
+    index_file_name = "INDEX_refined_data.2020"
+    base_dir = str(Path.cwd() / "refined-set")
+    query = '(Kd_Ki == "Kd") and (value < 0.001)'
+    min_row = 1
+    max_row = 1
+    convert_kd_dg = True
+    pdbbind_refined_v2020(
+        output_txt_path,
+        index_file_name,
+        base_dir,
+        query,
+        min_row,
+        max_row,
+        convert_kd_dg,
+    )
+    current_directory = Path.cwd()  # Get the current directory
+    pdb_files = list(current_directory.glob("*.pdb"))  # List all *.pdb files
+    sdf_files = list(current_directory.glob("*.sdf"))  # List all *.sdf files
+
+    assert pdb_files, "No .pdb files found in the directory"
+    assert sdf_files, "No .sdf files found in the directory"
+
+
+def test_extract_pdbbind_refined() -> None:
+    """Test the extract_pdbbind_refined CWL tool through the cwl_utilities helpers."""
+    cwl_file = Path("extract_pdbbind_refined.cwl")
+    input_to_props = parse_cwl_arguments(cwl_file)
+    input_to_props["query"] = '(Kd_Ki == "Kd") and (value < 0.001)'
+    input_to_props["convert_Kd_dG"] = True
+    input_to_props["max_row"] = 1
+    input_yaml_path = Path("extract_pdbbind_refined.yml")
+    create_input_yaml(input_to_props, input_yaml_path)
+    call_cwltool(cwl_file, input_yaml_path)
+
+    assert Path("1e3g_protein.pdb").is_file()