From f1bf40f8bf96577be7e11ef44c82277e82514193 Mon Sep 17 00:00:00 2001 From: Brandon Duane Walker Date: Tue, 12 Mar 2024 15:33:35 -0400 Subject: [PATCH] extract pdbbind refined --- .../.bumpversion.cfg | 29 +++ .../pdbbind_refined_v2020_tool/.dockerignore | 4 + .../pdbbind_refined_v2020_tool/.gitignore | 1 + .../pdbbind_refined_v2020_tool/CHANGELOG.md | 5 + .../pdbbind_refined_v2020_tool/Dockerfile | 32 +++ .../pdbbind_refined_v2020_tool/README.md | 16 ++ .../pdbbind_refined_v2020_tool/VERSION | 1 + .../pdbbind_refined_v2020_tool/__init__.py | 1 + .../build-docker.sh | 4 + .../extract_pdbbind_refined.cwl | 246 ++++++++++++++++++ .../pdbbind_refined_v2020_tool/ict.yml | 42 +++ .../pdbbind_refined_v2020_tool/pyproject.toml | 32 +++ .../utils/pdbbind_refined_v2020/__init__.py | 7 + .../utils/pdbbind_refined_v2020/__main__.py | 78 ++++++ .../pdbbind_refined_v2020.py | 174 +++++++++++++ .../tests/__init__.py | 1 + .../tests/test_pdbbind_refined_v2020.py | 56 ++++ 17 files changed, 729 insertions(+) create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py create mode 100755 utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/extract_pdbbind_refined.cwl create mode 100644 
utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/pdbbind_refined_v2020.py create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/__init__.py create mode 100644 utils/pre-process/data-download/pdbbind_refined_v2020_tool/tests/test_pdbbind_refined_v2020.py diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg new file mode 100644 index 00000000..5d6e5cf5 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.bumpversion.cfg @@ -0,0 +1,29 @@ +[bumpversion] +current_version = 0.1.0 +commit = False +tag = False +parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<dev>\d+))? 
+serialize = + {major}.{minor}.{patch}-{release}{dev} + {major}.{minor}.{patch} + +[bumpversion:part:release] +optional_value = _ +first_value = dev +values = + dev + _ + +[bumpversion:part:dev] + +[bumpversion:file:pyproject.toml] +search = version = "{current_version}" +replace = version = "{new_version}" + +[bumpversion:file:VERSION] + +[bumpversion:file:README.md] + +[bumpversion:file:plugin.json] + +[bumpversion:file:src/polus/mm/utils/pdbbind_refined_v2020/__init__.py] diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore new file mode 100644 index 00000000..7c603f81 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.dockerignore @@ -0,0 +1,4 @@ +.venv +out +tests +__pycache__ diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore new file mode 100644 index 00000000..c04bc49f --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/.gitignore @@ -0,0 +1 @@ +poetry.lock diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md new file mode 100644 index 00000000..b67793f7 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/CHANGELOG.md @@ -0,0 +1,5 @@ +# CHANGELOG + +## 0.1.0 + +Initial release. diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile new file mode 100644 index 00000000..82d9b958 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/Dockerfile @@ -0,0 +1,32 @@ +# docker build -f Dockerfile -t mrbrandonwalker/pdbbind_refined_v2020_tool . 
+ +FROM condaforge/mambaforge + +ENV EXEC_DIR="/opt/executables" +ENV POLUS_LOG="INFO" +RUN mkdir -p ${EXEC_DIR} + +RUN apt-get update && apt-get install -y wget +# Download the bdbbind dataset +# RUN wget --no-clobber http://www.pdbbind.org.cn/download/PDBbind_v2020_refined.tar.gz +## update to the new download URL (around 10 times faster) from PDBbind website. +RUN wget --no-clobber https://pdbbind.oss-cn-hangzhou.aliyuncs.com/download/PDBbind_v2020_refined.tar.gz +RUN tar -xvf PDBbind_v2020_refined.tar.gz + +# Work directory defined in the base container + +COPY pyproject.toml ${EXEC_DIR} +COPY VERSION ${EXEC_DIR} +COPY README.md ${EXEC_DIR} +COPY CHANGELOG.md ${EXEC_DIR} + +# Install needed packages here +RUN mamba install -c conda-forge pandas + +COPY src ${EXEC_DIR}/src + +RUN pip3 install ${EXEC_DIR} --no-cache-dir + +CMD ["--help"] + +WORKDIR /outdir diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md new file mode 100644 index 00000000..51e39081 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/README.md @@ -0,0 +1,16 @@ +# pdbbind_refined_v2020 (0.1.0) + +Extract pdbbind_refined_v2020 data + +## Options + +This plugin takes 7 input arguments + +| Name | Description | I/O | Type | Default | +|---------------|-------------------------|--------|--------|---------| +| index_file_name | | Input | string | string | +| base_dir | | Input | string | string | +| query | query str to search the dataset. 
Pandas query doesn't support slash(/) in column names; please use Kd_Ki instead of Kd/Ki, Type: string, File type: input, Accepted formats: txt | Input | string | string | +| min_row | The row min index, Type: int | Input | int | int | +| max_row | The row max index, Type: int | Input | int | int | +| convert_Kd_dG | If this is set to true, dG will be calculated | Input | boolean | boolean | diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION new file mode 100644 index 00000000..6e8bf73a --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/VERSION @@ -0,0 +1 @@ +0.1.0 diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py new file mode 100644 index 00000000..187d0cd9 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/__init__.py @@ -0,0 +1 @@ +"""pdbbind_refined_v2020_plugin package.""" diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh new file mode 100755 index 00000000..cd23f2cb --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/build-docker.sh @@ -0,0 +1,4 @@ +#!/bin/bash + +version=$( 2) { + var experimental_dG = parseFloat(words[2]); + experimental_dGs.push(experimental_dG); + } + } + + if (experimental_dGs.length == 0) { + return null; + } else { + return experimental_dGs; + } + } + + pdb_ids: + label: The PDBID of proteins + doc: |- + The PDBID of proteins + type: + type: array + items: string + outputBinding: + glob: $(inputs.output_txt_path) + loadContents: true + outputEval: | + ${ + var lines = self[0].contents.split("\n"); + var pdbids = []; + for (var i = 0; i < lines.length; i++) { + var words = lines[i].split(" "); + pdbids.push(words[0]); + } + + if (pdbids.length 
== 0) { + throw new Error("Error! pdbids are empty!"); + } else { + return pdbids; + } + } + + stdout: + type: File + outputBinding: + glob: stdout + +stdout: stdout + +$namespaces: + edam: https://edamontology.org/ + +$schemas: +- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml new file mode 100644 index 00000000..feb05f30 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/ict.yml @@ -0,0 +1,42 @@ +specVersion: 0.1.0 +name: labshare/pdbbind-refined-v2020 +version: 0.1.0 +container: polusai/pdbbind-refined-v2020-tool:0.1.0 +entrypoint: "" +title: pdbbind_refined_v2020 +description: Extracts data from the PDBBind refined dataset +author: Brandon Walker (brandon.walker@axleinfo.com), Nazanin Donyapour (nazanin.donyapour@nih.gov) +repository: https://github.com/labshare/mmtools +documentation: https://ncats.nih.gov/preclinical/core/informatics +citation: "" +inputs: +- name: output_txt_path + required: false + description: Path to the text dataset file + type: string + default: /outdir/system.log +- name: index_file_name + required: false + description: The index file name + type: string + default: INDEX_refined_data.2020 +- name: base_dir + required: true + description: The base_dir path + type: string +- name: query + required: false + description: "query str to search the dataset, Pandas query doesn't support slash(/) in column names please use Kd_Ki instead of Kd/Ki" + type: string +- name: min_row + required: false + description: The row min index + type: int +- name: max_row + required: false + description: The row max index + type: int +- name: convert_Kd_dG + required: false + description: If this is set to true, dG will be calculated + type: boolean diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml 
b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml new file mode 100644 index 00000000..c8732d8b --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/pyproject.toml @@ -0,0 +1,32 @@ +[tool.poetry] +name = "polus-mm-utils-pdbbind-refined-v2020" +version = "0.1.0" +description = "An awesome function." +authors = ["Data Scientist "] +readme = "README.md" +packages = [{include = "polus", from = "src"}] + +[tool.poetry.dependencies] +python = ">=3.9,<3.12" +typer = "^0.7.0" +pandas = "^1.3.3" +cwl-utils = "0.33" +cwltool = "3.1.20240404144621" + +[tool.poetry.group.dev.dependencies] +bump2version = "^1.0.1" +pytest = "^7.4" +pytest-sugar = "^0.9.6" +pre-commit = "^3.2.1" +black = "^23.3.0" +mypy = "^1.1.1" +ruff = "^0.0.270" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + +[tool.pytest.ini_options] +pythonpath = [ + "." +] diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py new file mode 100644 index 00000000..d5992f46 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__init__.py @@ -0,0 +1,7 @@ +"""pdbbind_refined_v2020.""" + +__version__ = "0.1.0" + +from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import ( # noqa # pylint: disable=unused-import + pdbbind_refined_v2020, +) diff --git a/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py new file mode 100644 index 00000000..5ff8d576 --- /dev/null +++ b/utils/pre-process/data-download/pdbbind_refined_v2020_tool/src/polus/mm/utils/pdbbind_refined_v2020/__main__.py @@ -0,0 +1,78 @@ 
"""Package entrypoint for the pdbbind_refined_v2020 package."""

# Base packages
import logging
from os import environ
from pathlib import Path

import typer
from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import (
    pdbbind_refined_v2020,
)

logging.basicConfig(
    format="%(asctime)s - %(name)-8s - %(levelname)-8s - %(message)s",
    datefmt="%d-%b-%y %H:%M:%S",
)
POLUS_LOG = getattr(logging, environ.get("POLUS_LOG", "INFO"))
logger = logging.getLogger("polus.mm.utils.pdbbind_refined_v2020.")
logger.setLevel(POLUS_LOG)

# Fixed location of the extracted PDBbind refined set.
# NOTE(review): presumably this is where the container image unpacks
# PDBbind_v2020_refined.tar.gz -- confirm against the Dockerfile's work directory.
BASE_DIR = Path("/refined-set")

app = typer.Typer(help="pdbbind_refined_v2020.")


@app.command()
def main(  # noqa: PLR0913
    output_txt_path: str = typer.Option(
        "system.log",
        "--output_txt_path",
        help="Path to the text dataset file",
    ),
    index_file_name: str = typer.Option(
        "INDEX_refined_data.2020",
        "--index_file_name",
        help="",
    ),
    query: str = typer.Option(
        ...,
        "--query",
        help="query str to search the dataset.",
    ),
    min_row: int = typer.Option(
        1,
        "--min_row",
        help="The row min inex, Type: int",
    ),
    max_row: int = typer.Option(
        # -1 is the "no upper bound" sentinel understood by load_data; the option
        # used to be mandatory even though the tool description marks it optional.
        -1,
        "--max_row",
        help="The row max inex, Type: int",
    ),
    convert_kd_dg: bool = typer.Option(
        False,
        "--convert_Kd_dG",
        help="If this is set to true, dG will be calculated",
    ),
) -> None:
    """Run the PDBbind refined-set extraction from the command line.

    Logs every option, then delegates to ``pdbbind_refined_v2020`` using the
    fixed dataset location ``BASE_DIR``.

    Args:
        output_txt_path: Path of the text file the binding data is written to.
        index_file_name: Name of the PDBbind index file inside ``<base>/index``.
        query: Pandas query string used to filter the index rows.
        min_row: 1-based index of the first row to keep.
        max_row: 1-based index of the last row to keep (inclusive); -1 keeps
            every row from ``min_row`` onwards.
        convert_kd_dg: If true, a dG column is computed from Kd.
    """
    logger.info(f"output_txt_path: {output_txt_path}")
    logger.info(f"index_file_name: {index_file_name}")
    logger.info(f"query: {query}")
    logger.info(f"min_row: {min_row}")
    logger.info(f"max_row: {max_row}")
    logger.info(f"convert_Kd_dG: {convert_kd_dg}")
    pdbbind_refined_v2020(
        output_txt_path,
        index_file_name,
        str(BASE_DIR),
        query,
        min_row,
        max_row,
        convert_kd_dg,
    )


if __name__ == "__main__":
    app()
"""extract pdbbind_refined_v2020."""
import math
import re
import subprocess
from collections import defaultdict
from pathlib import Path
from typing import Any

import pandas as pd

# Factors converting the concentration units found in the index file to uM.
_UNIT_TO_MICROMOLAR = {
    "mM": 1000.0,
    "uM": 1,
    "nM": 0.001,
    "pM": 0.000001,
    "fM": 0.000000001,
}

# One affinity field such as "Kd=49uM": measurement type, number, unit.
_AFFINITY_PATTERN = re.compile(
    r"(?P<kind>[^=]+)=(?P<value>[-+]?(?:\d*\.\d+|\d+))(?P<unit>[A-Za-z]+)",
)

# Column order of the frame returned by read_index_file.
_INDEX_COLUMNS = (
    "PDB_code",
    "resolution",
    "release_year",
    "Kd_Ki",
    "value",
    "ligand_name",
)


def pdbbind_refined_v2020(  # noqa: PLR0913
    output_txt_path: str,
    index_file_name: str,
    base_dir: str,
    query: str,
    min_row: int,
    max_row: int,
    convert_kd_dg: bool,
) -> None:
    """pdbbind_refined_v2020.

    Thin entry point that forwards its arguments to ``load_data``.

    Args:
        output_txt_path: Path to the text dataset file
        index_file_name: The PDBbind index file name
        base_dir: The base directory of the dataset
        query: query str to search the dataset.
        min_row: The row min index
        max_row: The row max index
        convert_kd_dg: If this is set to true, dG will be calculated
    Returns:
        None
    """
    load_data(
        index_file_name,
        base_dir,
        query,
        output_txt_path,
        min_row,
        max_row,
        convert_kd_dg,
    )


def calculate_dg(kd: float) -> float:
    """Calculates binding free energy from Kd.

    Evaluates ``R * T * ln(kd / c0)`` with T = 298 K and c0 = 1 mol/L; the gas
    constant is converted from J to kcal, so the result is in kcal/mol.

    Args:
        kd (float): The binding affinity of the protein-ligand complex, in mol/L

    Returns:
        float: The binding free energy

    Raises:
        ValueError: If ``kd`` is not strictly positive (raised by ``math.log``).
    """
    # Calculate the binding free energy from kd so we can make the correlation plots.
    # See https://en.wikipedia.org/wiki/Binding_constant
    ideal_gas_constant = 8.31446261815324  # J/(Mol*K)
    joules_per_kcal = 4184  # 1 kcal = 4184 J
    # NOTE: Unfortunately, the temperature at which
    # experimental kd binding data was taken
    # is often not recorded. Thus, we are forced to guess. The two standard guesses are
    # physiological body temperature (310K) or room temperature (298K).
    temperature = 298
    rt = (ideal_gas_constant / joules_per_kcal) * temperature
    # NOTE: For performance, simulations are often done in a very small unit cell, and
    # thus at a very high concentration. The size of the unit cell bounds the volume.
    # For shorter simulations where the ligand has not explored the entire box, it may
    # be less. See the Yank paper for a method of calculating the correct volumes.
    standard_concentration = 1  # Units of mol / L, but see comment above.
    return rt * math.log(kd / standard_concentration)


def read_index_file(index_file_path: str) -> pd.DataFrame:
    """Reads the PDBbind index file and extracts binding data.

    Only non-comment lines containing ``Kd=`` are kept. The affinity is
    converted to micromolar; resolution and release year are stored as the
    strings read from the file.

    Args:
        index_file_path (str): The path to the index file

    Returns:
        pd.DataFrame: The kd data with columns PDB_code, resolution,
        release_year, Kd_Ki, value (uM) and ligand_name. The columns are
        present even when no line matches.

    Raises:
        ValueError: If the affinity field of a kept line is not ``<type>=<number><unit>``.
        KeyError: If the affinity unit is not one of mM, uM, nM, pM, fM.
    """
    data: dict[str, Any] = defaultdict(list)
    # Create every column up front so an index without matches still yields a
    # frame that can be queried and column-selected.
    for column in _INDEX_COLUMNS:
        data[column] = []
    # The file format
    # PDB code, resolution, release year, -logkd/Ki, kd/Ki, reference, ligand name

    with Path(index_file_path).open(encoding="utf-8") as rfile:
        for line in rfile:
            if line.startswith("#") or "Kd=" not in line:
                continue
            words = line.split()
            match = _AFFINITY_PATTERN.fullmatch(words[4])
            if match is None:
                msg = f"Cannot parse affinity field {words[4]!r} in line: {line!r}"
                raise ValueError(msg)

            data["PDB_code"].append(words[0])
            data["resolution"].append(words[1])
            data["release_year"].append(words[2])
            data["Kd_Ki"].append(match["kind"])
            # Kd conversion to micro molar
            unit_factor = _UNIT_TO_MICROMOLAR[match["unit"]]
            data["value"].append(float(match["value"]) * unit_factor)
            data["ligand_name"].append(re.findall(r"\((.*?)\)", words[7])[0])

    return pd.DataFrame.from_dict(data)


# pylint: disable=too-many-arguments,too-many-locals


def load_data(  # noqa: PLR0913
    index_file_name: str,
    base_dir: str,
    query: str,
    output_txt_path: str,
    min_row: int = 1,
    max_row: int = -1,
    convert_kd_dg: bool = False,
) -> None:
    """Filters Kd data based on a query.

    Writes one line per selected complex to ``output_txt_path``
    (``<pdb> <Kd in mol/L> [<dG>] <type>``) and copies the matching
    ``*_protein.pdb`` and ``*_ligand.sdf`` files into the current directory.

    Args:
        index_file_name (str): The PDBbind index file name
        base_dir (str): The base directory of the dataset
        query (str): The Query to perform; an empty query keeps every row
        output_txt_path (str): The output text file
        min_row (int, optional): min index of rows (1-based). Defaults to 1.
        max_row (int, optional): max index of rows (1-based, inclusive).
            Defaults to -1, which means no upper bound.
        convert_kd_dg (bool, optional): If this set to True,
            The dG will be calculated. Defaults to False.

    Raises:
        subprocess.CalledProcessError: If copying a structure file fails.
    """
    index_file_path = Path(base_dir).joinpath("index", index_file_name)
    df = read_index_file(str(index_file_path))
    # perform query
    if query:
        df = df.query(query)

    # Perform row slicing (if any).
    # We want to convert to zero-based indices and we also want
    # the upper index to be inclusive (i.e. <=) so -1 lower index.
    start = max(int(min_row) - 1, 0)
    # -1 is a "no limit" sentinel: using it as a slice stop would silently
    # drop the last row whenever min_row is set on its own.
    stop = None if int(max_row) == -1 else int(max_row)
    df = df.iloc[start:stop]

    # Calculate dG
    df = df[["PDB_code", "value", "Kd_Ki"]]
    binding_data: list[str] = []
    micromolar = 0.000001  # uM
    for pdbcode, value_um, kd_ki in df.itertuples(index=False, name=None):
        binding_datum = value_um * micromolar

        if convert_kd_dg:
            dg = calculate_dg(binding_datum)
            binding_data.append(f"{pdbcode} {binding_datum} {dg} {kd_ki}")
        else:
            binding_data.append(f"{pdbcode} {binding_datum} {kd_ki}")

    with Path(output_txt_path).open(mode="w", encoding="utf-8") as f:
        f.write("\n".join(binding_data))

    # copy pdb and sdf files
    for pdbcode in df["PDB_code"]:
        for suffix in ("protein.pdb", "ligand.sdf"):
            file_name = f"{pdbcode}_{suffix}"
            source_path = Path(base_dir).joinpath(pdbcode, file_name)
            subprocess.run(
                ["cp", f"{source_path}", file_name],  # noqa: S603, S607
                check=True,
            )
"""Tests for pdbbind_refined_v2020."""
import sys
from pathlib import Path

sys.path.append("src")
from polus.mm.utils.pdbbind_refined_v2020.pdbbind_refined_v2020 import (  # noqa: E402
    pdbbind_refined_v2020,
)

# The shared CWL helpers live in a "cwl_utils" directory five levels above
# this tests directory; make them importable.
current_dir = Path(__file__).resolve().parent
target_dir = current_dir.parent.parent.parent.parent.parent / "cwl_utils"
sys.path.append(str(target_dir))

from cwl_utilities import call_cwltool  # noqa: E402
from cwl_utilities import create_input_yaml  # noqa: E402
from cwl_utilities import parse_cwl_arguments  # noqa: E402


def test_pdbbind_refined_v2020() -> None:
    """Test pdbbind_refined_v2020.

    Runs the extraction for the first row matching the query and checks that
    the structure files of exactly that complex were copied into the current
    directory. Requires the dataset under ``./refined-set``.
    """
    output_txt_path = "output.txt"
    index_file_name = "INDEX_refined_data.2020"
    # The function is annotated to take the base directory as a string.
    base_dir = str(Path.cwd() / "refined-set")
    query = '(Kd_Ki == "Kd") and (value < 0.001)'
    min_row = 1
    max_row = 1
    convert_kd_dg = True
    pdbbind_refined_v2020(
        output_txt_path,
        index_file_name,
        base_dir,
        query,
        min_row,
        max_row,
        convert_kd_dg,
    )

    # min_row == max_row == 1 selects a single row, so a single line is written.
    lines = Path(output_txt_path).read_text(encoding="utf-8").splitlines()
    assert len(lines) == 1, "Expected exactly one selected complex"

    # Check the files of the selected complex rather than globbing the working
    # directory, which would also match stale files from earlier runs.
    pdbcode = lines[0].split()[0]
    assert Path(f"{pdbcode}_protein.pdb").is_file(), "No .pdb file was copied"
    assert Path(f"{pdbcode}_ligand.sdf").is_file(), "No .sdf file was copied"


def test_extract_pdbbind_refined() -> None:
    """Test the extract_pdbbind_refined CWL tool end to end via cwltool."""
    cwl_file = Path("extract_pdbbind_refined.cwl")
    input_to_props = parse_cwl_arguments(cwl_file)
    input_to_props["query"] = '(Kd_Ki == "Kd") and (value < 0.001)'
    input_to_props["convert_Kd_dG"] = True
    input_to_props["max_row"] = 1
    input_yaml_path = Path("extract_pdbbind_refined.yml")
    create_input_yaml(input_to_props, input_yaml_path)
    call_cwltool(cwl_file, input_yaml_path)

    assert Path("1e3g_protein.pdb").is_file()